6% faster SSE FFT on Conroe, 2.5% on Penryn.
Signed-off-by: Janne Grunau <janne-ffmpeg@jannau.net>
(cherry picked from commit e6b1ed693a)
tags/n0.8
@@ -44,7 +44,7 @@ av_cold void ff_fft_init_arm(FFTContext *s) | |||||
s->imdct_calc = ff_imdct_calc_neon; | s->imdct_calc = ff_imdct_calc_neon; | ||||
s->imdct_half = ff_imdct_half_neon; | s->imdct_half = ff_imdct_half_neon; | ||||
s->mdct_calc = ff_mdct_calc_neon; | s->mdct_calc = ff_mdct_calc_neon; | ||||
s->permutation = FF_MDCT_PERM_INTERLEAVE; | |||||
s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; | |||||
} | } | ||||
} | } | ||||
@@ -97,6 +97,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) | |||||
if (!s->tmp_buf) | if (!s->tmp_buf) | ||||
goto fail; | goto fail; | ||||
s->inverse = inverse; | s->inverse = inverse; | ||||
s->fft_permutation = FF_FFT_PERM_DEFAULT; | |||||
s->fft_permute = ff_fft_permute_c; | s->fft_permute = ff_fft_permute_c; | ||||
s->fft_calc = ff_fft_calc_c; | s->fft_calc = ff_fft_calc_c; | ||||
@@ -113,8 +114,12 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) | |||||
for(j=4; j<=nbits; j++) { | for(j=4; j<=nbits; j++) { | ||||
ff_init_ff_cos_tabs(j); | ff_init_ff_cos_tabs(j); | ||||
} | } | ||||
for(i=0; i<n; i++) | |||||
s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i; | |||||
for(i=0; i<n; i++) { | |||||
int j = i; | |||||
if (s->fft_permutation == FF_FFT_PERM_SWAP_LSBS) | |||||
j = (j&~3) | ((j>>1)&1) | ((j<<1)&2); | |||||
s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = j; | |||||
} | |||||
return 0; | return 0; | ||||
fail: | fail: | ||||
@@ -44,7 +44,10 @@ struct FFTContext { | |||||
void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | ||||
void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | ||||
void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); | ||||
int permutation; | |||||
int fft_permutation; | |||||
#define FF_FFT_PERM_DEFAULT 0 | |||||
#define FF_FFT_PERM_SWAP_LSBS 1 | |||||
int mdct_permutation; | |||||
#define FF_MDCT_PERM_NONE 0 | #define FF_MDCT_PERM_NONE 0 | ||||
#define FF_MDCT_PERM_INTERLEAVE 1 | #define FF_MDCT_PERM_INTERLEAVE 1 | ||||
}; | }; | ||||
@@ -71,7 +71,7 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale) | |||||
s->mdct_bits = nbits; | s->mdct_bits = nbits; | ||||
s->mdct_size = n; | s->mdct_size = n; | ||||
n4 = n >> 2; | n4 = n >> 2; | ||||
s->permutation = FF_MDCT_PERM_NONE; | |||||
s->mdct_permutation = FF_MDCT_PERM_NONE; | |||||
if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0) | if (ff_fft_init(s, s->mdct_bits - 2, inverse) < 0) | ||||
goto fail; | goto fail; | ||||
@@ -80,7 +80,7 @@ av_cold int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale) | |||||
if (!s->tcos) | if (!s->tcos) | ||||
goto fail; | goto fail; | ||||
switch (s->permutation) { | |||||
switch (s->mdct_permutation) { | |||||
case FF_MDCT_PERM_NONE: | case FF_MDCT_PERM_NONE: | ||||
s->tsin = s->tcos + n4; | s->tsin = s->tcos + n4; | ||||
tstep = 1; | tstep = 1; | ||||
@@ -30,6 +30,7 @@ av_cold void ff_fft_init_mmx(FFTContext *s) | |||||
s->imdct_half = ff_imdct_half_sse; | s->imdct_half = ff_imdct_half_sse; | ||||
s->fft_permute = ff_fft_permute_sse; | s->fft_permute = ff_fft_permute_sse; | ||||
s->fft_calc = ff_fft_calc_sse; | s->fft_calc = ff_fft_calc_sse; | ||||
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; | |||||
} else if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) { | } else if (has_vectors & AV_CPU_FLAG_3DNOWEXT && HAVE_AMD3DNOWEXT) { | ||||
/* 3DNowEx for K7 */ | /* 3DNowEx for K7 */ | ||||
s->imdct_calc = ff_imdct_calc_3dn2; | s->imdct_calc = ff_imdct_calc_3dn2; | ||||
@@ -51,6 +51,7 @@ SECTION_RODATA | |||||
%define M_SQRT1_2 0.70710678118654752440 | %define M_SQRT1_2 0.70710678118654752440 | ||||
ps_root2: times 4 dd M_SQRT1_2 | ps_root2: times 4 dd M_SQRT1_2 | ||||
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 | ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 | ||||
ps_p1p1m1p1: dd 0, 0, 1<<31, 0 | |||||
ps_m1p1: dd 1<<31, 0 | ps_m1p1: dd 1<<31, 0 | ||||
%assign i 16 | %assign i 16 | ||||
@@ -95,54 +96,51 @@ section .text align=16 | |||||
SWAP %3, %6 | SWAP %3, %6 | ||||
%endmacro | %endmacro | ||||
; in: %1={r0,i0,r1,i1} %2={r2,i2,r3,i3} | |||||
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} | |||||
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} | ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} | ||||
%macro T4_SSE 3 | %macro T4_SSE 3 | ||||
mova %3, %1 | mova %3, %1 | ||||
shufps %1, %2, 0x64 ; {r0,i0,r3,i2} | |||||
shufps %3, %2, 0xce ; {r1,i1,r2,i3} | |||||
addps %1, %2 ; {t1,t2,t6,t5} | |||||
subps %3, %2 ; {t3,t4,-t8,t7} | |||||
xorps %3, [ps_p1p1m1p1] | |||||
mova %2, %1 | mova %2, %1 | ||||
addps %1, %3 ; {t1,t2,t6,t5} | |||||
subps %2, %3 ; {t3,t4,t8,t7} | |||||
shufps %1, %3, 0x44 ; {t1,t2,t3,t4} | |||||
shufps %2, %3, 0xbe ; {t6,t5,t7,t8} | |||||
mova %3, %1 | mova %3, %1 | ||||
shufps %1, %2, 0x44 ; {t1,t2,t3,t4} | |||||
shufps %3, %2, 0xbe ; {t6,t5,t7,t8} | |||||
addps %1, %2 ; {r0,i0,r1,i1} | |||||
subps %3, %2 ; {r2,i2,r3,i3} | |||||
mova %2, %1 | mova %2, %1 | ||||
addps %1, %3 ; {r0,i0,r1,i1} | |||||
subps %2, %3 ; {r2,i2,r3,i3} | |||||
mova %3, %1 | |||||
shufps %1, %2, 0x88 ; {r0,r1,r2,r3} | |||||
shufps %3, %2, 0xdd ; {i0,i1,i2,i3} | |||||
SWAP %2, %3 | |||||
shufps %1, %3, 0x88 ; {r0,r1,r2,r3} | |||||
shufps %2, %3, 0xdd ; {i0,i1,i2,i3} | |||||
%endmacro | %endmacro | ||||
%macro T8_SSE 6 ; r0,i0,r1,i1,t0,t1 | |||||
mova %5, %3 | |||||
shufps %3, %4, 0x44 ; {r4,i4,r6,i6} | |||||
shufps %5, %4, 0xee ; {r5,i5,r7,i7} | |||||
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} | |||||
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7} | |||||
%macro T8_SSE 6 | |||||
mova %6, %3 | mova %6, %3 | ||||
subps %3, %5 ; {r5,i5,r7,i7} | |||||
addps %6, %5 ; {t1,t2,t3,t4} | |||||
mova %5, %3 | |||||
shufps %5, %5, 0xb1 ; {i5,r5,i7,r7} | |||||
subps %3, %4 ; {r5,i5,r7,i7} | |||||
addps %6, %4 ; {t1,t2,t3,t4} | |||||
mova %4, %3 | |||||
shufps %4, %4, 0xb1 ; {i5,r5,i7,r7} | |||||
mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} | mulps %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} | ||||
mulps %5, [ps_root2] | |||||
addps %3, %5 ; {t8,t7,ta,t9} | |||||
mova %5, %6 | |||||
mulps %4, [ps_root2] | |||||
addps %3, %4 ; {t8,t7,ta,t9} | |||||
mova %4, %6 | |||||
shufps %6, %3, 0x36 ; {t3,t2,t9,t8} | shufps %6, %3, 0x36 ; {t3,t2,t9,t8} | ||||
shufps %5, %3, 0x9c ; {t1,t4,t7,ta} | |||||
shufps %4, %3, 0x9c ; {t1,t4,t7,ta} | |||||
mova %3, %6 | mova %3, %6 | ||||
addps %6, %5 ; {t1,t2,t9,ta} | |||||
subps %3, %5 ; {t6,t5,tc,tb} | |||||
mova %5, %6 | |||||
addps %6, %4 ; {t1,t2,t9,ta} | |||||
subps %3, %4 ; {t6,t5,tc,tb} | |||||
mova %4, %6 | |||||
shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} | shufps %6, %3, 0xd8 ; {t1,t9,t5,tb} | ||||
shufps %5, %3, 0x8d ; {t2,ta,t6,tc} | |||||
shufps %4, %3, 0x8d ; {t2,ta,t6,tc} | |||||
mova %3, %1 | mova %3, %1 | ||||
mova %4, %2 | |||||
mova %5, %2 | |||||
addps %1, %6 ; {r0,r1,r2,r3} | addps %1, %6 ; {r0,r1,r2,r3} | ||||
addps %2, %5 ; {i0,i1,i2,i3} | |||||
addps %2, %4 ; {i0,i1,i2,i3} | |||||
subps %3, %6 ; {r4,r5,r6,r7} | subps %3, %6 ; {r4,r5,r6,r7} | ||||
subps %4, %5 ; {i4,i5,i6,i7} | |||||
subps %5, %4 ; {i4,i5,i6,i7} | |||||
SWAP %4, %5 | |||||
%endmacro | %endmacro | ||||
; scheduled for cpu-bound sizes | ; scheduled for cpu-bound sizes | ||||