This gets rid of a variable-length array and a for loop in the C code.

Signed-off-by: Martin Storsjö <martin@martin.st>
tags/n1.0
| @@ -115,6 +115,84 @@ FLOAT_TO_INT16 sse, 0 | |||
| FLOAT_TO_INT16 3dnow, 0 | |||
| %undef cvtps2pi | |||
| ;------------------------------------------------------------------------------ | |||
| ; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step); | |||
| ;------------------------------------------------------------------------------ | |||
| ; Emits float_to_int16_step_<cpu>: converts floats read contiguously from src | |||
| ; into int16 values written to dst with a stride of `step` elements. | |||
| ; %1 = cpu suffix (sse2 selects the XMM path, anything else the MMX-register path) | |||
| ; %2 = forwarded to cglobal (presumably the XMM register count; confirm in x86inc) | |||
| ; Each loop iteration consumes 8 floats and the body runs before the length is | |||
| ; tested, so len is presumably expected to be a positive multiple of 8. | |||
| ; NOTE(review): the memory-operand conversions on the sse2 path look like they | |||
| ; need 16-byte aligned src -- confirm against callers. | |||
| %macro FLOAT_TO_INT16_STEP 2 | |||
| cglobal float_to_int16_step_%1, 4,7,%2, dst, src, len, step, step3, v1, v2 | |||
| ; lenq = 2*len; src is advanced to the end of the input (4 bytes per float) | |||
| ; and lenq is negated so it counts up towards zero. | |||
| add lenq, lenq | |||
| lea srcq, [srcq+2*lenq] | |||
| ; step3 = 3*step, used to address the fourth output of each group of four | |||
| lea step3q, [stepq*3] | |||
| neg lenq | |||
| .loop: | |||
| %ifidn %1, sse2 | |||
| ; convert 8 floats to dwords, then pack them to 8 words in m0 | |||
| cvtps2dq m0, [srcq+2*lenq ] | |||
| cvtps2dq m1, [srcq+2*lenq+16] | |||
| packssdw m0, m1 | |||
| ; v1 = samples 0,1 and v2 = samples 2,3 (two words per 32-bit register) | |||
| movd v1d, m0 | |||
| psrldq m0, 4 | |||
| movd v2d, m0 | |||
| psrldq m0, 4 | |||
| ; low words go to dst[0] and dst[2*step] | |||
| mov [dstq], v1w | |||
| mov [dstq+stepq*4], v2w | |||
| ; high words go to dst[step] and dst[3*step] | |||
| shr v1d, 16 | |||
| shr v2d, 16 | |||
| mov [dstq+stepq*2], v1w | |||
| mov [dstq+step3q*2], v2w | |||
| ; advance dst by 4 output elements (4*step int16 = 8*step bytes) | |||
| lea dstq, [dstq+stepq*8] | |||
| ; same store pattern for samples 4..7 | |||
| movd v1d, m0 | |||
| psrldq m0, 4 | |||
| movd v2d, m0 | |||
| mov [dstq], v1w | |||
| mov [dstq+stepq*4], v2w | |||
| shr v1d, 16 | |||
| shr v2d, 16 | |||
| mov [dstq+stepq*2], v1w | |||
| mov [dstq+step3q*2], v2w | |||
| lea dstq, [dstq+stepq*8] | |||
| %else | |||
| ; 64-bit register path: four 2-float conversions, packed into m0 (samples 0..3) | |||
| ; and m2 (samples 4..7) | |||
| cvtps2pi m0, [srcq+2*lenq ] | |||
| cvtps2pi m1, [srcq+2*lenq+ 8] | |||
| cvtps2pi m2, [srcq+2*lenq+16] | |||
| cvtps2pi m3, [srcq+2*lenq+24] | |||
| packssdw m0, m1 | |||
| packssdw m2, m3 | |||
| ; scatter samples 0..3 to dst[0], dst[step], dst[2*step], dst[3*step] | |||
| movd v1d, m0 | |||
| psrlq m0, 32 | |||
| movd v2d, m0 | |||
| mov [dstq], v1w | |||
| mov [dstq+stepq*4], v2w | |||
| shr v1d, 16 | |||
| shr v2d, 16 | |||
| mov [dstq+stepq*2], v1w | |||
| mov [dstq+step3q*2], v2w | |||
| lea dstq, [dstq+stepq*8] | |||
| ; scatter samples 4..7 the same way | |||
| movd v1d, m2 | |||
| psrlq m2, 32 | |||
| movd v2d, m2 | |||
| mov [dstq], v1w | |||
| mov [dstq+stepq*4], v2w | |||
| shr v1d, 16 | |||
| shr v2d, 16 | |||
| mov [dstq+stepq*2], v1w | |||
| mov [dstq+step3q*2], v2w | |||
| lea dstq, [dstq+stepq*8] | |||
| %endif | |||
| ; 16 bytes of lenq correspond to the 8 samples just handled; keep looping | |||
| ; while the offset is still negative | |||
| add lenq, 16 | |||
| js .loop | |||
| %ifnidn %1, sse2 | |||
| ; the non-sse2 variants used the MMX registers, so reset that state before returning | |||
| emms | |||
| %endif | |||
| REP_RET | |||
| %endmacro | |||
| ; Instantiate the step conversion once per instruction set. | |||
| ; sse2 is built under INIT_XMM and passes 2 as the macro's second argument. | |||
| INIT_XMM | |||
| FLOAT_TO_INT16_STEP sse2, 2 | |||
| ; sse and 3dnow are built under INIT_MMX and pass 0. | |||
| INIT_MMX | |||
| FLOAT_TO_INT16_STEP sse, 0 | |||
| ; The 3dnow variant reuses the same macro body with cvtps2pi replaced by pf2id; | |||
| ; the define is removed again right after the instantiation. | |||
| %define cvtps2pi pf2id | |||
| FLOAT_TO_INT16_STEP 3dnow, 0 | |||
| %undef cvtps2pi | |||
| ;------------------------------------------------------------------------------- | |||
| ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len); | |||
| @@ -25,6 +25,7 @@ | |||
| #include "libavutil/cpu.h" | |||
| #include "libavutil/x86_cpu.h" | |||
| #include "libavcodec/fmtconvert.h" | |||
| #include "libavcodec/dsputil.h" | |||
| #if HAVE_YASM | |||
| @@ -35,6 +36,10 @@ void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len); | |||
| void ff_float_to_int16_sse (int16_t *dst, const float *src, long len); | |||
| void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len); | |||
| /* Strided variants: same arguments plus a step between consecutive output | |||
|  * samples, counted in int16_t elements. float_to_int16_interleave_misc_##cpu | |||
|  * calls them with dst+c and step == channels. | |||
|  * NOTE(review): presumably provided by the FLOAT_TO_INT16_STEP assembly macro; | |||
|  * confirm the symbol prefix added by cglobal. */ | |||
| void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step); | |||
| void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step); | |||
| void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step); | |||
| void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len); | |||
| void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len); | |||
| void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len); | |||
| @@ -48,12 +53,9 @@ void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len | |||
| #define FLOAT_TO_INT16_INTERLEAVE(cpu) \ | |||
| /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ | |||
| static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ | |||
| DECLARE_ALIGNED(16, int16_t, tmp)[len];\ | |||
| int i,j,c;\ | |||
| int c;\ | |||
| for(c=0; c<channels; c++){\ | |||
| ff_float_to_int16_##cpu(tmp, src[c], len);\ | |||
| for(i=0, j=c; i<len; i++, j+=channels)\ | |||
| dst[j] = tmp[i];\ | |||
| ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\ | |||
| }\ | |||
| }\ | |||
| \ | |||