and use in scale_coefficients() for the floating-point AC-3 encoder.
tags/n0.8
| @@ -85,13 +85,30 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len, | |||
| } while (len > 0); | |||
| } | |||
| av_cold void ff_ac3dsp_init(AC3DSPContext *c) | |||
| static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len) | |||
| { | |||
| const float scale = 1 << 24; | |||
| do { | |||
| *dst++ = lrintf(*src++ * scale); | |||
| *dst++ = lrintf(*src++ * scale); | |||
| *dst++ = lrintf(*src++ * scale); | |||
| *dst++ = lrintf(*src++ * scale); | |||
| *dst++ = lrintf(*src++ * scale); | |||
| *dst++ = lrintf(*src++ * scale); | |||
| *dst++ = lrintf(*src++ * scale); | |||
| *dst++ = lrintf(*src++ * scale); | |||
| len -= 8; | |||
| } while (len > 0); | |||
| } | |||
| av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact) | |||
| { | |||
| c->ac3_exponent_min = ac3_exponent_min_c; | |||
| c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c; | |||
| c->ac3_lshift_int16 = ac3_lshift_int16_c; | |||
| c->ac3_rshift_int32 = ac3_rshift_int32_c; | |||
| c->float_to_fixed24 = float_to_fixed24_c; | |||
| if (HAVE_MMX) | |||
| ff_ac3dsp_init_x86(c); | |||
| ff_ac3dsp_init_x86(c, bit_exact); | |||
| } | |||
| @@ -68,9 +68,22 @@ typedef struct AC3DSPContext { | |||
| * constraints: range [0,31] | |||
| */ | |||
| void (*ac3_rshift_int32)(int32_t *src, unsigned int len, unsigned int shift); | |||
| /** | |||
| * Convert an array of float in range [-1.0,1.0] to int32_t with range | |||
| * [-(1<<24),(1<<24)] | |||
| * | |||
| * @param dst destination array of int32_t. | |||
| * constraints: 16-byte aligned | |||
| * @param src source array of float. | |||
| * constraints: 16-byte aligned | |||
| * @param len number of elements to convert. | |||
| * constraints: multiple of 32 greater than zero | |||
| */ | |||
| void (*float_to_fixed24)(int32_t *dst, const float *src, unsigned int len); | |||
| } AC3DSPContext; | |||
| void ff_ac3dsp_init (AC3DSPContext *c); | |||
| void ff_ac3dsp_init_x86(AC3DSPContext *c); | |||
| void ff_ac3dsp_init (AC3DSPContext *c, int bit_exact); | |||
| void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact); | |||
| #endif /* AVCODEC_AC3DSP_H */ | |||
| @@ -1843,7 +1843,7 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx) | |||
| avctx->coded_frame= avcodec_alloc_frame(); | |||
| dsputil_init(&s->dsp, avctx); | |||
| ff_ac3dsp_init(&s->ac3dsp); | |||
| ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT); | |||
| return 0; | |||
| init_fail: | |||
| @@ -103,9 +103,8 @@ static int normalize_samples(AC3EncodeContext *s) | |||
| */ | |||
| static void scale_coefficients(AC3EncodeContext *s) | |||
| { | |||
| int i; | |||
| for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++) | |||
| s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24); | |||
| s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer, | |||
| AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels); | |||
| } | |||
| @@ -22,6 +22,11 @@ | |||
| %include "x86inc.asm" | |||
| %include "x86util.asm" | |||
| SECTION_RODATA | |||
| ; 16777216.0f - used in ff_float_to_fixed24() | |||
| pf_1_24: times 4 dd 0x4B800000 | |||
| SECTION .text | |||
| ;----------------------------------------------------------------------------- | |||
| @@ -178,3 +183,113 @@ INIT_MMX | |||
| AC3_SHIFT r, 32, psrad, mmx | |||
| INIT_XMM | |||
| AC3_SHIFT r, 32, psrad, sse2 | |||
| ;----------------------------------------------------------------------------- | |||
| ; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len) | |||
| ;----------------------------------------------------------------------------- | |||
| ; The 3DNow! version is not bit-identical because pf2id uses truncation rather | |||
| ; than round-to-nearest. | |||
; 3DNow! version: converts 8 floats (4 MMX registers x 2 floats) per loop
; iteration. Each value is multiplied by pf_1_24 and converted with pf2id.
; len is decremented by 8 per iteration and the loop repeats while the
; unsigned result is above zero.
INIT_MMX
cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
    movq       m0, [pf_1_24]
.loop:
    movq       m1, [srcq   ]
    movq       m2, [srcq+8 ]
    movq       m3, [srcq+16]
    movq       m4, [srcq+24]
    pfmul      m1, m0
    pfmul      m2, m0
    pfmul      m3, m0
    pfmul      m4, m0
    pf2id      m1, m1
    pf2id      m2, m2
    pf2id      m3, m3
    pf2id      m4, m4
    movq  [dstq   ], m1
    movq  [dstq+8 ], m2
    movq  [dstq+16], m3
    movq  [dstq+24], m4
    add      srcq, 32
    add      dstq, 32
    sub      lend, 8
    ja .loop
    ; The MMX registers alias the x87 stack; clear the MMX state before
    ; returning so that floating-point code run by the caller is not corrupted.
    femms
    RET
; SSE version: loads 8 floats into two XMM registers per iteration, scales
; them by pf_1_24, then converts two floats at a time with cvtps2pi into
; mm0-mm3 (movhlps brings the upper pair down for the second conversion).
; len is decremented by 8 per iteration and the loop repeats while the
; unsigned result is above zero.
INIT_XMM
cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq   ]
    movaps     m2, [srcq+16]
    mulps      m1, m0
    mulps      m2, m0
    cvtps2pi  mm0, m1
    movhlps    m1, m1
    cvtps2pi  mm1, m1
    cvtps2pi  mm2, m2
    movhlps    m2, m2
    cvtps2pi  mm3, m2
    movq  [dstq   ], mm0
    movq  [dstq+ 8], mm1
    movq  [dstq+16], mm2
    movq  [dstq+24], mm3
    add      srcq, 32
    add      dstq, 32
    sub      lend, 8
    ja .loop
    ; cvtps2pi writes MMX registers, which alias the x87 stack; clear the
    ; MMX state before returning so later floating-point code is not corrupted.
    emms
    RET
; SSE2 version: scales by pf_1_24 and converts with cvtps2dq, 4 floats per
; XMM register. When m8 is defined (more than 8 XMM registers available)
; 8 registers are used and 32 floats are handled per iteration; otherwise
; 4 registers / 16 floats per iteration.
INIT_XMM
cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
    movaps     m0, [pf_1_24]
.loop:
    movaps     m1, [srcq    ]
    movaps     m2, [srcq+16 ]
    movaps     m3, [srcq+32 ]
    movaps     m4, [srcq+48 ]
%ifdef m8
    movaps     m5, [srcq+64 ]
    movaps     m6, [srcq+80 ]
    movaps     m7, [srcq+96 ]
    movaps     m8, [srcq+112]
%endif
    mulps      m1, m0
    mulps      m2, m0
    mulps      m3, m0
    mulps      m4, m0
%ifdef m8
    mulps      m5, m0
    mulps      m6, m0
    mulps      m7, m0
    mulps      m8, m0
%endif
    cvtps2dq   m1, m1
    cvtps2dq   m2, m2
    cvtps2dq   m3, m3
    cvtps2dq   m4, m4
%ifdef m8
    cvtps2dq   m5, m5
    cvtps2dq   m6, m6
    cvtps2dq   m7, m7
    cvtps2dq   m8, m8
%endif
    movdqa  [dstq    ], m1
    movdqa  [dstq+16 ], m2
    movdqa  [dstq+32 ], m3
    movdqa  [dstq+48 ], m4
%ifdef m8
    movdqa  [dstq+64 ], m5
    movdqa  [dstq+80 ], m6
    movdqa  [dstq+96 ], m7
    movdqa  [dstq+112], m8
    add      srcq, 128
    add      dstq, 128
    ; len is a 32-bit unsigned int: count on the 32-bit register so the loop
    ; does not depend on the upper half of the 64-bit argument register.
    sub      lend, 32
%else
    add      srcq, 64
    add      dstq, 64
    sub      lend, 16
%endif
    ja .loop
    REP_RET
| @@ -38,7 +38,11 @@ extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned in | |||
| extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift); | |||
| extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift); | |||
| av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c) | |||
| extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len); | |||
| extern void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len); | |||
| extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len); | |||
| av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) | |||
| { | |||
| int mm_flags = av_get_cpu_flags(); | |||
| @@ -49,13 +53,22 @@ av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c) | |||
| c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx; | |||
| c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx; | |||
| } | |||
| if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) { | |||
| if (!bit_exact) { | |||
| c->float_to_fixed24 = ff_float_to_fixed24_3dnow; | |||
| } | |||
| } | |||
| if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) { | |||
| c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; | |||
| c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext; | |||
| } | |||
| if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) { | |||
| c->float_to_fixed24 = ff_float_to_fixed24_sse; | |||
| } | |||
| if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) { | |||
| c->ac3_exponent_min = ff_ac3_exponent_min_sse2; | |||
| c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; | |||
| c->float_to_fixed24 = ff_float_to_fixed24_sse2; | |||
| if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) { | |||
| c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2; | |||
| c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2; | |||