Partially based on patches by clsid2 in ffdshow-tryout.
ff_float_interleave6() x86 improvements by Loren Merritt.
@@ -56,11 +56,31 @@ static void float_to_int16_interleave_c(int16_t *dst, const float **src,
     }
 }

void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
                           int channels)
{
    int j, c;
    unsigned int i;

    if (channels == 2) {
        for (i = 0; i < len; i++) {
            dst[2*i]   = src[0][i];
            dst[2*i+1] = src[1][i];
        }
    } else if (channels == 1 && len < INT_MAX / sizeof(float)) {
        memcpy(dst, src[0], len * sizeof(float));
    } else {
        for (c = 0; c < channels; c++)
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = src[c][i];
    }
}
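/* Hypothetical standalone sketch (not part of the patch): a self-contained
 * demo of the interleaving semantics above, reusing the generic loop.
 * Two 4-sample planar channels become one interleaved L R L R ... buffer. */
#include <stdio.h>

static void interleave_demo(float *dst, const float **src, unsigned int len,
                            int channels)
{
    int c;
    unsigned int i, j;
    for (c = 0; c < channels; c++)
        for (i = 0, j = c; i < len; i++, j += channels)
            dst[j] = src[c][i];
}

int main(void)
{
    const float l[4] = { 0, 1, 2, 3 }, r[4] = { 10, 11, 12, 13 };
    const float *src[2] = { l, r };
    float dst[8];
    unsigned int i;

    interleave_demo(dst, src, 4, 2);
    for (i = 0; i < 8; i++)
        printf("%g ", dst[i]);   /* prints: 0 10 1 11 2 12 3 13 */
    printf("\n");
    return 0;
}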
av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
{
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->float_to_int16             = float_to_int16_c;
    c->float_to_int16_interleave  = float_to_int16_interleave_c;
    c->float_interleave           = ff_float_interleave_c;

    if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
    if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx);
@@ -68,8 +68,17 @@ typedef struct FmtConvertContext {
     */
    void (*float_to_int16_interleave)(int16_t *dst, const float **src,
                                      long len, int channels);

    /**
     * Convert multiple arrays of float to an array of interleaved float.
     */
    void (*float_interleave)(float *dst, const float **src, unsigned int len,
                             int channels);
} FmtConvertContext;
void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
                           int channels);

void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
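/* A hedged usage sketch (illustrative fragment only; 'avctx', 'out', 'planes',
 * 'nb_samples' and 'channels' are assumed to exist in the calling decoder,
 * and the libavcodec-internal headers are assumed to be available): */
FmtConvertContext fmt;
ff_fmt_convert_init(&fmt, avctx);   /* selects C, ARM, AltiVec or x86 code */

/* per frame: interleave 'channels' planar float buffers into 'out' */
fmt.float_interleave(out, (const float **)planes, nb_samples, channels);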
@@ -20,6 +20,7 @@
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

section .text align=16

@@ -89,3 +90,143 @@ FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi
;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------

%macro BUTTERFLYPS 3
    movaps   m%3, m%1
    unpcklps m%1, m%2
    unpckhps m%3, m%2
    SWAP %2, %3
%endmacro
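; Given m%1 = {a0,a1,a2,a3} and m%2 = {b0,b1,b2,b3}, BUTTERFLYPS leaves
;     m%1 = {a0,b0,a1,b1}
;     m%2 = {a2,b2,a3,b3}
; i.e. it interleaves two channel registers pairwise (m%3 is scratch;
; SWAP only renames registers at assembly time).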
%macro FLOAT_INTERLEAVE6 2
cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov      lend, r2d
%else
    %define lend dword r2m
%endif
    mov    src1q, [srcq+1*gprsize]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
    sub    src4q, srcq
    sub    src5q, srcq
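; Channels 1-5 are converted to byte offsets relative to channel 0, so the
; loop below only advances a single pointer (srcq) and reaches each of the
; remaining channels as [srcq+srcNq].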
.loop:
%ifidn %1, sse
    movaps   m0, [srcq]
    movaps   m1, [srcq+src1q]
    movaps   m2, [srcq+src2q]
    movaps   m3, [srcq+src3q]
    movaps   m4, [srcq+src4q]
    movaps   m5, [srcq+src5q]

    BUTTERFLYPS 0, 1, 6
    BUTTERFLYPS 2, 3, 6
    BUTTERFLYPS 4, 5, 6
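; After the butterflies, with cN[i] = sample i of channel N:
;     m0 = {c0[0],c1[0],c0[1],c1[1]}   m1 = {c0[2],c1[2],c0[3],c1[3]}
;     m2 = {c2[0],c3[0],c2[1],c3[1]}   m3 = {c2[2],c3[2],c2[3],c3[3]}
;     m4 = {c4[0],c5[0],c4[1],c5[1]}   m5 = {c4[2],c5[2],c4[3],c5[3]}
; The shuffles below gather these into six interleaved 4-float output rows.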
    movaps   m6, m4
    shufps   m4, m0, 0xe4
    movlhps  m0, m2
    movhlps  m6, m2
    movaps [dstq   ], m0
    movaps [dstq+16], m4
    movaps [dstq+32], m6

    movaps   m6, m5
    shufps   m5, m1, 0xe4
    movlhps  m1, m3
    movhlps  m6, m3
    movaps [dstq+48], m1
    movaps [dstq+64], m5
    movaps [dstq+80], m6
%else ; mmx
    movq m0, [srcq]
    movq m1, [srcq+src1q]
    movq m2, [srcq+src2q]
    movq m3, [srcq+src3q]
    movq m4, [srcq+src4q]
    movq m5, [srcq+src5q]

    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6
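; SBUTTERFLY dq interleaves the two floats of each register pair, so
; m0/m2/m4 now hold sample 0 of channel pairs (0,1)/(2,3)/(4,5) and
; m1/m3/m5 hold sample 1; the stores below write them in output order.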
    movq [dstq   ], m0
    movq [dstq+ 8], m2
    movq [dstq+16], m4
    movq [dstq+24], m1
    movq [dstq+32], m3
    movq [dstq+40], m5
%endif

    add srcq, mmsize
    add dstq, mmsize*6
    sub lend, mmsize/4
    jg .loop
%ifidn %1, mmx
    emms
%endif
    REP_RET
%endmacro

INIT_MMX
FLOAT_INTERLEAVE6 mmx, 0
INIT_XMM
FLOAT_INTERLEAVE6 sse, 7
;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------

%macro FLOAT_INTERLEAVE2 2
cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
    mov   src1q, [srcq+gprsize]
    mov    srcq, [srcq        ]
    sub   src1q, srcq
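; Same base+offset trick as float_interleave6 above: src1q becomes the byte
; offset of channel 1 relative to channel 0, so only srcq is advanced.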
.loop:
    MOVPS     m0, [srcq             ]
    MOVPS     m1, [srcq+src1q       ]
    MOVPS     m3, [srcq       +mmsize]
    MOVPS     m4, [srcq+src1q +mmsize]

    MOVPS     m2, m0
    PUNPCKLDQ m0, m1
    PUNPCKHDQ m2, m1
    MOVPS     m1, m3
    PUNPCKLDQ m3, m4
    PUNPCKHDQ m1, m4

    MOVPS [dstq         ], m0
    MOVPS [dstq+1*mmsize], m2
    MOVPS [dstq+2*mmsize], m3
    MOVPS [dstq+3*mmsize], m1
    add srcq, mmsize*2
    add dstq, mmsize*4
    sub lend, mmsize/2
    jg .loop
%ifidn %1, mmx
    emms
%endif
    REP_RET
%endmacro

INIT_MMX
%define MOVPS     movq
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 mmx, 0

INIT_XMM
%define MOVPS     movaps
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 sse, 5
@@ -235,11 +235,40 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long
    float_to_int16_interleave_3dnow(dst, src, len, channels);
}

void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);

void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);

static void float_interleave_mmx(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    if (channels == 2) {
        ff_float_interleave2_mmx(dst, src, len);
    } else if (channels == 6) {
        ff_float_interleave6_mmx(dst, src, len);
    } else {
        ff_float_interleave_c(dst, src, len, channels);
    }
}

static void float_interleave_sse(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    if (channels == 2) {
        ff_float_interleave2_sse(dst, src, len);
    } else if (channels == 6) {
        ff_float_interleave6_sse(dst, src, len);
    } else {
        ff_float_interleave_c(dst, src, len, channels);
    }
}
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
        c->float_interleave = float_interleave_mmx;

        if (mm_flags & AV_CPU_FLAG_3DNOW) {
            if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {

@@ -256,6 +285,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
            c->float_to_int16             = float_to_int16_sse;
            c->float_to_int16_interleave  = float_to_int16_interleave_sse;
            c->float_interleave           = float_interleave_sse;
        }
        if (mm_flags & AV_CPU_FLAG_SSE2) {
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;