There's no benefit from using blendps here except on CPUs with AVX, where it's faster than shufps according to Intel's documentation. As such, rename the sse4 functions to sse/sse2 and use shufps instead.

Reviewed-by: Michael Niedermayer <michaelni@gmx.at>
Signed-off-by: James Almer <jamrial@gmail.com>

tags/n2.5
@@ -245,15 +245,27 @@ pack_6ch_%2_to_%1_u_int %+ SUFFIX | |||
mov%3 m4, [srcq+src4q] | |||
mov%3 m5, [srcq+src5q] | |||
%7 x,x,x,x,m7,x | |||
%if cpuflag(sse4) | |||
%if cpuflag(sse) | |||
SBUTTERFLYPS 0, 1, 6 | |||
SBUTTERFLYPS 2, 3, 6 | |||
SBUTTERFLYPS 4, 5, 6 | |||
%if cpuflag(avx) | |||
blendps m6, m4, m0, 1100b | |||
%else | |||
movaps m6, m4 | |||
shufps m4, m0, q3210 | |||
SWAP 4,6 | |||
%endif | |||
movlhps m0, m2 | |||
movhlps m4, m2 | |||
%if cpuflag(avx) | |||
blendps m2, m5, m1, 1100b | |||
%else | |||
movaps m2, m5 | |||
shufps m5, m1, q3210 | |||
SWAP 2,5 | |||
%endif | |||
movlhps m1, m3 | |||
movhlps m5, m3 | |||
@@ -380,6 +392,10 @@ CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N | |||
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N | |||
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N | |||
INIT_XMM sse | |||
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N | |||
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N | |||
INIT_XMM sse2 | |||
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N | |||
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N | |||
@@ -431,6 +447,10 @@ UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT | |||
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT | |||
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT | |||
PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT | |||
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT | |||
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT | |||
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT | |||
INIT_XMM ssse3 | |||
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N | |||
@@ -440,15 +460,6 @@ UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N | |||
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT | |||
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT | |||
INIT_XMM sse4 | |||
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N | |||
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N | |||
PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT | |||
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT | |||
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT | |||
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT | |||
%if HAVE_AVX_EXTERNAL | |||
INIT_XMM avx | |||
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N | |||
@@ -58,7 +58,12 @@ MULTI_CAPS_FUNC(SSE2, sse2) | |||
ac->simd_f = ff_pack_6ch_float_to_float_a_mmx; | |||
} | |||
} | |||
if(EXTERNAL_SSE(mm_flags)) { | |||
if(channels == 6) { | |||
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P) | |||
ac->simd_f = ff_pack_6ch_float_to_float_a_sse; | |||
} | |||
} | |||
if(EXTERNAL_SSE2(mm_flags)) { | |||
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P) | |||
ac->simd_f = ff_int32_to_float_a_sse2; | |||
@@ -105,6 +110,12 @@ MULTI_CAPS_FUNC(SSE2, sse2) | |||
if( out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLT) | |||
ac->simd_f = ff_unpack_2ch_float_to_int16_a_sse2; | |||
} | |||
if(channels == 6) { | |||
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P) | |||
ac->simd_f = ff_pack_6ch_int32_to_float_a_sse2; | |||
if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP) | |||
ac->simd_f = ff_pack_6ch_float_to_int32_a_sse2; | |||
} | |||
} | |||
if(EXTERNAL_SSSE3(mm_flags)) { | |||
if(channels == 2) { | |||
@@ -116,16 +127,6 @@ MULTI_CAPS_FUNC(SSE2, sse2) | |||
ac->simd_f = ff_unpack_2ch_int16_to_float_a_ssse3; | |||
} | |||
} | |||
if(EXTERNAL_SSE4(mm_flags)) { | |||
if(channels == 6) { | |||
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P) | |||
ac->simd_f = ff_pack_6ch_float_to_float_a_sse4; | |||
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P) | |||
ac->simd_f = ff_pack_6ch_int32_to_float_a_sse4; | |||
if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP) | |||
ac->simd_f = ff_pack_6ch_float_to_int32_a_sse4; | |||
} | |||
} | |||
if(EXTERNAL_AVX(mm_flags)) { | |||
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P) | |||
ac->simd_f = ff_int32_to_float_a_avx; | |||