6% faster vorbis and wma. Originally committed as revision 5954 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags: v0.5)
| @@ -594,6 +594,8 @@ void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3], | |||
| FFTSample type */ | |||
| typedef float FFTSample; | |||
| struct MDCTContext; | |||
| typedef struct FFTComplex { /* one complex value built from two FFTSample fields */ | |||
| FFTSample re, im; /* re: real component, im: imaginary component */ | |||
| } FFTComplex; | |||
| @@ -605,6 +607,8 @@ typedef struct FFTContext { | |||
| FFTComplex *exptab; | |||
| FFTComplex *exptab1; /* only used by SSE code */ | |||
| void (*fft_calc)(struct FFTContext *s, FFTComplex *z); | |||
| void (*imdct_calc)(struct MDCTContext *s, FFTSample *output, | |||
| const FFTSample *input, FFTSample *tmp); | |||
| } FFTContext; | |||
| int ff_fft_init(FFTContext *s, int nbits, int inverse); | |||
| @@ -635,6 +639,8 @@ typedef struct MDCTContext { | |||
| int ff_mdct_init(MDCTContext *s, int nbits, int inverse); | |||
| void ff_imdct_calc(MDCTContext *s, FFTSample *output, | |||
| const FFTSample *input, FFTSample *tmp); | |||
| void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, | |||
| const FFTSample *input, FFTSample *tmp); | |||
| void ff_mdct_calc(MDCTContext *s, FFTSample *out, | |||
| const FFTSample *input, FFTSample *tmp); | |||
| void ff_mdct_end(MDCTContext *s); | |||
| @@ -54,6 +54,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse) | |||
| s->exptab[i].im = s1; | |||
| } | |||
| s->fft_calc = ff_fft_calc_c; | |||
| s->imdct_calc = ff_imdct_calc; | |||
| s->exptab1 = NULL; | |||
| /* compute constant table for HAVE_SSE version */ | |||
| @@ -62,11 +63,7 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse) | |||
| int has_vectors = 0; | |||
| #if defined(HAVE_MMX) | |||
| #ifdef HAVE_MM3DNOW | |||
| has_vectors = mm_support() & (MM_3DNOW | MM_3DNOWEXT | MM_SSE | MM_SSE2); | |||
| #else | |||
| has_vectors = mm_support() & (MM_SSE | MM_SSE2); | |||
| #endif | |||
| #endif | |||
| #if defined(HAVE_ALTIVEC) && !defined(ALTIVEC_USE_REFERENCE_C_CODE) | |||
| has_vectors = mm_support() & MM_ALTIVEC; | |||
| @@ -98,6 +95,8 @@ int ff_fft_init(FFTContext *s, int nbits, int inverse) | |||
| } while (nblocks != 0); | |||
| av_freep(&s->exptab); | |||
| #if defined(HAVE_MMX) | |||
| if (has_vectors & MM_3DNOWEXT) | |||
| s->imdct_calc = ff_imdct_calc_3dn2; | |||
| #ifdef HAVE_MM3DNOW | |||
| if (has_vectors & MM_3DNOWEXT) | |||
| /* 3DNowEx for Athlon(XP) */ | |||
| @@ -1,6 +1,6 @@ | |||
| /* | |||
| * FFT/MDCT transform with Extended 3DNow! optimizations | |||
| * Copyright (c) 2006 Zuxy MENG Jie. | |||
| * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt | |||
| * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. | |||
| * | |||
| * This library is free software; you can redistribute it and/or | |||
| @@ -134,3 +134,84 @@ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) | |||
| } | |||
| #endif | |||
| void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, | |||
| const FFTSample *input, FFTSample *tmp) | |||
| { | |||
| int k, n8, n4, n2, n; | |||
| const uint16_t *revtab = s->fft.revtab; | |||
| const FFTSample *tcos = s->tcos; | |||
| const FFTSample *tsin = s->tsin; | |||
| const FFTSample *in1, *in2; | |||
| FFTComplex *z = (FFTComplex *)tmp; | |||
| n = 1 << s->nbits; | |||
| n2 = n >> 1; | |||
| n4 = n >> 2; | |||
| n8 = n >> 3; | |||
| /* pre rotation */ | |||
| in1 = input; | |||
| in2 = input + n2 - 1; | |||
| for(k = 0; k < n4; k++) { | |||
| asm volatile( | |||
| "movd %1, %%mm0 \n\t" | |||
| "movd %3, %%mm1 \n\t" | |||
| "punpckldq %2, %%mm0 \n\t" | |||
| "punpckldq %4, %%mm1 \n\t" | |||
| "movq %%mm0, %%mm2 \n\t" | |||
| "pfmul %%mm1, %%mm0 \n\t" | |||
| "pswapd %%mm1, %%mm1 \n\t" | |||
| "pfmul %%mm1, %%mm2 \n\t" | |||
| "pfpnacc %%mm2, %%mm0 \n\t" | |||
| "movq %%mm0, %0 \n\t" | |||
| :"=m"(z[revtab[k]]) | |||
| :"m"(in2[-2*k]), "m"(in1[2*k]), | |||
| "m"(tcos[k]), "m"(tsin[k]) | |||
| ); | |||
| } | |||
| ff_fft_calc(&s->fft, z); | |||
| /* post rotation + reordering */ | |||
| for(k = 0; k < n4; k++) { | |||
| asm volatile( | |||
| "movq %0, %%mm0 \n\t" | |||
| "movd %1, %%mm1 \n\t" | |||
| "punpckldq %2, %%mm1 \n\t" | |||
| "movq %%mm0, %%mm2 \n\t" | |||
| "pfmul %%mm1, %%mm0 \n\t" | |||
| "pswapd %%mm1, %%mm1 \n\t" | |||
| "pfmul %%mm1, %%mm2 \n\t" | |||
| "pfpnacc %%mm2, %%mm0 \n\t" | |||
| "movq %%mm0, %0 \n\t" | |||
| :"+m"(z[k]) | |||
| :"m"(tcos[k]), "m"(tsin[k]) | |||
| ); | |||
| } | |||
| asm volatile("movd %0, %%mm7" ::"r"(1<<31)); | |||
| for(k = 0; k < n8; k++) { | |||
| asm volatile( | |||
| "movq %4, %%mm0 \n\t" | |||
| "pswapd %5, %%mm1 \n\t" | |||
| "movq %%mm0, %%mm2 \n\t" | |||
| "pxor %%mm7, %%mm2 \n\t" | |||
| "punpckldq %%mm1, %%mm2 \n\t" | |||
| "pswapd %%mm2, %%mm3 \n\t" | |||
| "punpckhdq %%mm1, %%mm0 \n\t" | |||
| "pswapd %%mm0, %%mm4 \n\t" | |||
| "pxor %%mm7, %%mm0 \n\t" | |||
| "pxor %%mm7, %%mm4 \n\t" | |||
| "movq %%mm0, %0 \n\t" // { -z[n8+k].im, z[n8-1-k].re } | |||
| "movq %%mm4, %1 \n\t" // { -z[n8-1-k].re, z[n8+k].im } | |||
| "movq %%mm2, %2 \n\t" // { -z[n8+k].re, z[n8-1-k].im } | |||
| "movq %%mm3, %3 \n\t" // { z[n8-1-k].im, -z[n8+k].re } | |||
| :"=m"(output[2*k]), "=m"(output[n2-2-2*k]), | |||
| "=m"(output[n2+2*k]), "=m"(output[n-2-2*k]) | |||
| :"m"(z[n8+k]), "m"(z[n8-1-k]) | |||
| :"memory" | |||
| ); | |||
| } | |||
| asm volatile("emms"); | |||
| } | |||
| @@ -1598,7 +1598,7 @@ static int vorbis_parse_audio_packet(vorbis_context *vc) { | |||
| saved_start=vc->saved_start; | |||
| ff_imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp); | |||
| vc->mdct0.fft.imdct_calc(vc->modes[mode_number].blockflag ? &vc->mdct1 : &vc->mdct0, buf, ch_floor_ptr, buf_tmp); | |||
| if (vc->modes[mode_number].blockflag) { | |||
| // -- overlap/add | |||
| @@ -1113,7 +1113,7 @@ static int wma_decode_block(WMADecodeContext *s) | |||
| n = s->block_len; | |||
| n4 = s->block_len / 2; | |||
| ff_imdct_calc(&s->mdct_ctx[bsize], | |||
| s->mdct_ctx[bsize].fft.imdct_calc(&s->mdct_ctx[bsize], | |||
| output, s->coefs[ch], s->mdct_tmp); | |||
| /* XXX: optimize all that by build the window and | |||