6% faster SSE FFT on Conroe, 2.5% on Penryn.
Signed-off-by: Janne Grunau <janne-ffmpeg@jannau.net>
(cherry picked from commit e6b1ed693a)
tags/n0.8
| @@ -44,7 +44,7 @@ av_cold void ff_fft_init_arm(FFTContext *s) | |||||
| s->imdct_calc = ff_imdct_calc_neon; | s->imdct_calc = ff_imdct_calc_neon; | ||||
| s->imdct_half = ff_imdct_half_neon; | s->imdct_half = ff_imdct_half_neon; | ||||
| s->mdct_calc = ff_mdct_calc_neon; | s->mdct_calc = ff_mdct_calc_neon; | ||||
| s->permutation = FF_MDCT_PERM_INTERLEAVE; | |||||
| s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; | |||||
| } | } | ||||
| } | } | ||||
| @@ -97,6 +97,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) | |||||
| if (!s->tmp_buf) | if (!s->tmp_buf) | ||||
| goto fail; | goto fail; | ||||
| s->inverse = inverse; | s->inverse = inverse; | ||||
| s->fft_permutation = FF_FFT_PERM_DEFAULT; | |||||
| s->fft_permute = ff_fft_permute_c; | s->fft_permute = ff_fft_permute_c; | ||||
| s->fft_calc = ff_fft_calc_c; | s->fft_calc = ff_fft_calc_c; | ||||
| @@ -113,8 +114,12 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) | |||||
| for(j=4; j<=nbits; j++) { | for(j=4; j<=nbits; j++) { | ||||
| ff_init_ff_cos_tabs(j); | ff_init_ff_cos_tabs(j); | ||||
| } | } | ||||
| for(i=0; i<n; i++) | |||||
| s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i; | |||||
| for(i=0; i<n; i++) { | |||||
| int j = i; | |||||
| if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS) | |||||
| j = (j&~3) | ((j>>1)&1) | ((j<<1)&2); | |||||
| s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j; | |||||
| } | |||||
| return 0; | return 0; | ||||
| fail: | fail: | ||||
| @@ -44,7 +44,10 @@ struct FFTContext { | |||||
| void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | ||||
| void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | ||||
| void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | ||||
| int permutation; | |||||
| int fft_permutation; | |||||
| #define FF_FFT_PERM_DEFAULT 0 | |||||
| #define FF_FFT_PERM_SWAP_LSBS 1 | |||||
| int mdct_permutation; | |||||
| #define FF_MDCT_PERM_NONE 0 | #define FF_MDCT_PERM_NONE 0 | ||||
| #define FF_MDCT_PERM_INTERLEAVE 1 | #define FF_MDCT_PERM_INTERLEAVE 1 | ||||
| }; | }; | ||||
| @@ -71,7 +71,7 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale) | |||||
| s->mdct_bits = nbits; | s->mdct_bits = nbits; | ||||
| s->mdct_size = n; | s->mdct_size = n; | ||||
| n4 = n >> 2; | n4 = n >> 2; | ||||
| s->permutation = FF_MDCT_PERM_NONE; | |||||
| s->mdct_permutation = FF_MDCT_PERM_NONE; | |||||
| if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0) | if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0) | ||||
| goto fail; | goto fail; | ||||
| @@ -80,7 +80,7 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale) | |||||
| if (!s->tcos) | if (!s->tcos) | ||||
| goto fail; | goto fail; | ||||
| switch (s->permutation) { | |||||
| switch (s->mdct_permutation) { | |||||
| case FF_MDCT_PERM_NONE: | case FF_MDCT_PERM_NONE: | ||||
| s->tsin = s->tcos + n4; | s->tsin = s->tcos + n4; | ||||
| tstep = 1; | tstep = 1; | ||||
| @@ -30,6 +30,7 @@ av_cold void ff_fft_init_mmx(FFTContext *s) | |||||
| s->imdct_half = ff_imdct_half_sse; | s->imdct_half = ff_imdct_half_sse; | ||||
| s->fft_permute = ff_fft_permute_sse; | s->fft_permute = ff_fft_permute_sse; | ||||
| s->fft_calc = ff_fft_calc_sse; | s->fft_calc = ff_fft_calc_sse; | ||||
| s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; | |||||
| } else if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) { | } else if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) { | ||||
| /* 3DNowEx for K7 */ | /* 3DNowEx for K7 */ | ||||
| s->imdct_calc = ff_imdct_calc_3dn2; | s->imdct_calc = ff_imdct_calc_3dn2; | ||||
| @@ -51,6 +51,7 @@ SECTION_RODATA | |||||
| %define M_SQRT1_2 0.70710678118654752440 | %define M_SQRT1_2 0.70710678118654752440 | ||||
| ps_root2: times 4 dd M_SQRT1_2 | ps_root2: times 4 dd M_SQRT1_2 | ||||
| ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 | ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 | ||||
| ps_p1p1m1p1: dd 0, 0, 1<<31, 0 | |||||
| ps_m1p1: dd 1<<31, 0 | ps_m1p1: dd 1<<31, 0 | ||||
| %assign i 16 | %assign i 16 | ||||
| @@ -95,54 +96,51 @@ section .text align=16 | |||||
| SWAP %3, %6 | SWAP %3, %6 | ||||
| %endmacro | %endmacro | ||||
| ; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3} | |||||
| ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} | |||||
| ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} | ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} | ||||
| %macro T4_SSE 3 | %macro T4_SSE 3 | ||||
| mova %3, %1 | mova %3, %1 | ||||
| shufps %1, %2, 0x64 ; {r0,i0,r3,i2} | |||||
| shufps %3, %2, 0xce ; {r1,i1,r2,i3} | |||||
| addps %1, %2 ; {t1,t2,t6,t5} | |||||
| subps %3, %2 ; {t3,t4,-t8,t7} | |||||
| xorps %3, [ps_p1p1m1p1] | |||||
| mova %2, %1 | mova %2, %1 | ||||
| addps %1, %3 ; {t1,t2,t6,t5} | |||||
| subps %2, %3 ; {t3,t4,t8,t7} | |||||
| shufps %1, %3, 0x44 ; {t1,t2,t3,t4} | |||||
| shufps %2, %3, 0xbe ; {t6,t5,t7,t8} | |||||
| mova %3, %1 | mova %3, %1 | ||||
| shufps %1, %2, 0x44 ; {t1,t2,t3,t4} | |||||
| shufps %3, %2, 0xbe ; {t6,t5,t7,t8} | |||||
| addps %1, %2 ; {r0,i0,r1,i1} | |||||
| subps %3, %2 ; {r2,i2,r3,i3} | |||||
| mova %2, %1 | mova %2, %1 | ||||
| addps %1, %3 ; {r0,i0,r1,i1} | |||||
| subps %2, %3 ; {r2,i2,r3,i3} | |||||
| mova %3, %1 | |||||
| shufps %1, %2, 0x88 ; {r0,r1,r2,r3} | |||||
| shufps %3, %2, 0xdd ; {i0,i1,i2,i3} | |||||
| SWAP %2, %3 | |||||
| shufps %1, %3, 0x88 ; {r0,r1,r2,r3} | |||||
| shufps %2, %3, 0xdd ; {i0,i1,i2,i3} | |||||
| %endmacro | %endmacro | ||||
| %macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1 | |||||
| mova %5, %3 | |||||
| shufps %3, %4, 0x44 ; {r4,i4,r6,i6} | |||||
| shufps %5, %4, 0xee ; {r5,i5,r7,i7} | |||||
| ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} | |||||
| ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7} | |||||
| %macro T8_SSE 6 | |||||
| mova %6, %3 | mova %6, %3 | ||||
| subps %3, %5 ; {r5,i5,r7,i7} | |||||
| addps %6, %5 ; {t1,t2,t3,t4} | |||||
| mova %5, %3 | |||||
| shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} | |||||
| subps %3, %4 ; {r5,i5,r7,i7} | |||||
| addps %6, %4 ; {t1,t2,t3,t4} | |||||
| mova %4, %3 | |||||
| shufps %4, %4, 0xb1 ; {i5,r5,i7,r7} | |||||
| mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} | mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} | ||||
| mulps %5, [ps_root2] | |||||
| addps %3, %5 ; {t8,t7,ta,t9} | |||||
| mova %5, %6 | |||||
| mulps %4, [ps_root2] | |||||
| addps %3, %4 ; {t8,t7,ta,t9} | |||||
| mova %4, %6 | |||||
| shufps %6, %3, 0x36 ; {t3,t2,t9,t8} | shufps %6, %3, 0x36 ; {t3,t2,t9,t8} | ||||
| shufps %5, %3, 0x9c ; {t1,t4,t7,ta} | |||||
| shufps %4, %3, 0x9c ; {t1,t4,t7,ta} | |||||
| mova %3, %6 | mova %3, %6 | ||||
| addps %6, %5 ; {t1,t2,t9,ta} | |||||
| subps %3, %5 ; {t6,t5,tc,tb} | |||||
| mova %5, %6 | |||||
| addps %6, %4 ; {t1,t2,t9,ta} | |||||
| subps %3, %4 ; {t6,t5,tc,tb} | |||||
| mova %4, %6 | |||||
| shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} | shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} | ||||
| shufps %5, %3, 0x8d ; {t2,ta,t6,tc} | |||||
| shufps %4, %3, 0x8d ; {t2,ta,t6,tc} | |||||
| mova %3, %1 | mova %3, %1 | ||||
| mova %4, %2 | |||||
| mova %5, %2 | |||||
| addps %1, %6 ; {r0,r1,r2,r3} | addps %1, %6 ; {r0,r1,r2,r3} | ||||
| addps %2, %5 ; {i0,i1,i2,i3} | |||||
| addps %2, %4 ; {i0,i1,i2,i3} | |||||
| subps %3, %6 ; {r4,r5,r6,r7} | subps %3, %6 ; {r4,r5,r6,r7} | ||||
| subps %4, %5 ; {i4,i5,i6,i7} | |||||
| subps %5, %4 ; {i4,i5,i6,i7} | |||||
| SWAP %4, %5 | |||||
| %endmacro | %endmacro | ||||
| ; scheduled for cpu-bound sizes | ; scheduled for cpu-bound sizes | ||||