9% faster vorbis (on a K8). Originally committed as revision 5898 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/v0.5)
| @@ -35,6 +35,9 @@ | |||
| /* snow.c */ | |||
| void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count); | |||
| /* vorbis.c */ | |||
| void vorbis_inverse_coupling(float *mag, float *ang, int blocksize); | |||
| uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, }; | |||
| uint32_t squareTbl[512] = {0, }; | |||
| @@ -4090,6 +4093,10 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||
| c->inner_add_yblock = ff_snow_inner_add_yblock; | |||
| #endif | |||
| #ifdef CONFIG_VORBIS_DECODER | |||
| c->vorbis_inverse_coupling = vorbis_inverse_coupling; | |||
| #endif | |||
| c->shrink[0]= ff_img_copy_plane; | |||
| c->shrink[1]= ff_shrink22; | |||
| c->shrink[2]= ff_shrink44; | |||
| @@ -307,6 +307,8 @@ typedef struct DSPContext { | |||
| void (*h261_loop_filter)(uint8_t *src, int stride); | |||
| void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize); | |||
| /* (I)DCT */ | |||
| void (*fdct)(DCTELEM *block/* align 16*/); | |||
| void (*fdct248)(DCTELEM *block/* align 16*/); | |||
| @@ -2711,6 +2711,59 @@ static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) | |||
| } | |||
| #endif | |||
/**
 * Vorbis inverse channel coupling, SSE version.
 *
 * Rewrites the magnitude/angle vectors in place, four floats per
 * iteration, producing the same values as the scalar
 * vorbis_inverse_coupling().
 *
 * Only SSE1 instructions on XMM registers are used, so the routine is safe
 * on every CPU that reports SSE. It must not contain 3DNow! opcodes
 * (pfcmpge/pfadd/pfsub): those raise an illegal-instruction fault on
 * processors that have SSE but no 3DNow!.
 *
 * @param mag        magnitude vector, updated in place; must be 16-byte
 *                   aligned because it is accessed with movaps
 * @param ang        angle vector, updated in place; must be 16-byte aligned
 * @param blocksize  number of floats per vector; elements are handled in
 *                   groups of four, so a count that is not a multiple of 4
 *                   is effectively rounded up
 */
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    /* Sign bit of each of four packed floats. SSE1 has no packed integer
     * shift on XMM registers, so the sign bit is isolated with andps and
     * this constant; it is aligned because andps takes it from memory. */
    static const uint32_t sign_bits[4] __attribute__((aligned(16))) = {
        0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u
    };
    int i;

    for (i = 0; i < blocksize; i += 4) {
        /* The "+m" operands name only the first float of each group; the
         * "memory" clobber covers the remaining three. */
        __asm__ volatile(
            "movaps  %0,     %%xmm0 \n\t" // m
            "movaps  %1,     %%xmm1 \n\t" // a
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpltps %%xmm0, %%xmm2 \n\t" // mask_m = (0.0 <  m), strict like the C code
            "cmpleps %%xmm1, %%xmm3 \n\t" // mask_a = (0.0 <= a)
            "andps   %2,     %%xmm2 \n\t" // keep only the sign bit of mask_m
            "xorps   %%xmm2, %%xmm1 \n\t" // a' = (m > 0) ? -a : a
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t" // mask_a ? a' : 0
            "andnps  %%xmm1, %%xmm4 \n\t" // mask_a ? 0  : a'
            "addps   %%xmm0, %%xmm3 \n\t" // new a = m + (mask_a ? a' : 0)
            "subps   %%xmm4, %%xmm0 \n\t" // new m = m - (mask_a ? 0 : a')
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            :"m"(*sign_bits)
            :"memory"
        );
    }
}
/**
 * Vorbis inverse channel coupling, SSE2 version.
 *
 * Rewrites the magnitude/angle vectors in place, four floats per
 * iteration, producing the same values as the scalar
 * vorbis_inverse_coupling().
 *
 * The magnitude mask uses a strict compare (0.0 < m) so that a magnitude of
 * exactly zero takes the same path as the `mag > 0.0` test of the C code;
 * with a non-strict compare the outputs for mag == 0, ang != 0 come out
 * with the opposite sign. For the angle the non-strict compare is harmless:
 * an angle of zero contributes nothing on either path.
 *
 * @param mag        magnitude vector, updated in place; must be 16-byte
 *                   aligned because it is accessed with movaps
 * @param ang        angle vector, updated in place; must be 16-byte aligned
 * @param blocksize  number of floats per vector; elements are handled in
 *                   groups of four, so a count that is not a multiple of 4
 *                   is effectively rounded up
 */
static void vorbis_inverse_coupling_sse2(float *mag, float *ang, int blocksize)
{
    int i;

    for (i = 0; i < blocksize; i += 4) {
        /* The "+m" operands name only the first float of each group; the
         * "memory" clobber covers the remaining three. */
        __asm__ volatile(
            "movaps  %0,     %%xmm0 \n\t" // m
            "movaps  %1,     %%xmm1 \n\t" // a
            "pxor    %%xmm2, %%xmm2 \n\t"
            "pxor    %%xmm3, %%xmm3 \n\t"
            "cmpltps %%xmm0, %%xmm2 \n\t" // mask_m = (0.0 <  m)
            "cmpleps %%xmm1, %%xmm3 \n\t" // mask_a = (0.0 <= a)
            "pslld   $31,    %%xmm2 \n\t" // keep only the sign bit of mask_m
            "pxor    %%xmm2, %%xmm1 \n\t" // a' = (m > 0) ? -a : a
            "movaps  %%xmm3, %%xmm4 \n\t"
            "pand    %%xmm1, %%xmm3 \n\t" // mask_a ? a' : 0
            "pandn   %%xmm1, %%xmm4 \n\t" // mask_a ? 0  : a'
            "addps   %%xmm0, %%xmm3 \n\t" // new a = m + (mask_a ? a' : 0)
            "subps   %%xmm4, %%xmm0 \n\t" // new m = m - (mask_a ? 0 : a')
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}
| #ifdef CONFIG_SNOW_ENCODER | |||
| extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width); | |||
| extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width); | |||
| @@ -3137,6 +3190,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
| c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; | |||
| } | |||
| #endif | |||
| if(mm_flags & MM_SSE2) | |||
| c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse2; | |||
| else if(mm_flags & MM_SSE) | |||
| c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; | |||
| } | |||
| #ifdef CONFIG_ENCODERS | |||
| @@ -929,6 +929,7 @@ static int vorbis_decode_init(AVCodecContext *avccontext) { | |||
| int i, j, hdr_type; | |||
| vc->avccontext = avccontext; | |||
| dsputil_init(&vc->dsp, avccontext); | |||
| if (!headers_len) { | |||
| av_log(avccontext, AV_LOG_ERROR, "Extradata corrupt.\n"); | |||
| @@ -1443,6 +1444,31 @@ static int vorbis_residue_decode(vorbis_context *vc, vorbis_residue *vr, uint_fa | |||
| return 0; | |||
| } | |||
/**
 * Scalar inverse channel coupling.
 *
 * Walks both vectors element by element and rewrites each (mag, ang) pair
 * in place:
 *   - ang > 0: mag is left untouched and ang becomes mag - ang when
 *     mag > 0, or ang + mag otherwise;
 *   - ang <= 0: ang takes the old mag, and mag becomes mag + ang when
 *     mag > 0, or mag - ang otherwise (using the old ang).
 *
 * @param mag        magnitude vector, updated in place
 * @param ang        angle vector, updated in place
 * @param blocksize  number of elements to process in each vector
 */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize)
{
    int k = 0;

    while (k < blocksize) {
        const float m = mag[k];
        const float a = ang[k];

        if (a > 0.0) {
            /* Positive angle: only the angle slot changes. */
            if (m > 0.0)
                ang[k] = m - a;
            else
                ang[k] = a + m;
        } else {
            /* Non-positive angle: the angle slot receives the old
             * magnitude, the magnitude slot absorbs the old angle. */
            ang[k] = m;
            if (m > 0.0)
                mag[k] = m + a;
            else
                mag[k] = m - a;
        }
        k++;
    }
}
| // Decode the audio packet using the functions above | |||
| #define BIAS 385 | |||
| @@ -1541,26 +1567,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) { | |||
| mag=vc->channel_residues+res_chan[mapping->magnitude[i]]*blocksize/2; | |||
| ang=vc->channel_residues+res_chan[mapping->angle[i]]*blocksize/2; | |||
| for(j=0;j<blocksize/2;++j) { | |||
| float temp; | |||
| if (mag[j]>0.0) { | |||
| if (ang[j]>0.0) { | |||
| ang[j]=mag[j]-ang[j]; | |||
| } else { | |||
| temp=ang[j]; | |||
| ang[j]=mag[j]; | |||
| mag[j]+=temp; | |||
| } | |||
| } else { | |||
| if (ang[j]>0.0) { | |||
| ang[j]+=mag[j]; | |||
| } else { | |||
| temp=ang[j]; | |||
| ang[j]=mag[j]; | |||
| mag[j]-=temp; | |||
| } | |||
| } | |||
| } | |||
| vc->dsp.vorbis_inverse_coupling(mag, ang, blocksize/2); | |||
| } | |||
| // Dotproduct | |||
| @@ -87,6 +87,7 @@ typedef struct { | |||
| typedef struct vorbis_context_s { | |||
| AVCodecContext *avccontext; | |||
| GetBitContext gb; | |||
| DSPContext dsp; | |||
| MDCTContext mdct0; | |||
| MDCTContext mdct1; | |||