* commit '4cb6964244fd6c099383d8b7e99731e72cc844b9':
  dcadec: simplify decoding of VQ high frequencies

Conflicts:
    configure
    libavcodec/dcadec.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>

tags/n2.2-rc1

| @@ -1540,7 +1540,6 @@ HAVE_LIST=" | |||||
| altivec_h | altivec_h | ||||
| arpa_inet_h | arpa_inet_h | ||||
| asm_mod_q | asm_mod_q | ||||
| asm_mod_y | |||||
| asm_types_h | asm_types_h | ||||
| atomic_cas_ptr | atomic_cas_ptr | ||||
| atomics_native | atomics_native | ||||
| @@ -4147,7 +4146,6 @@ EOF | |||||
| $ARCH_EXT_LIST_ARM | $ARCH_EXT_LIST_ARM | ||||
| check_inline_asm asm_mod_q '"add r0, %Q0, %R0" :: "r"((long long)0)' | check_inline_asm asm_mod_q '"add r0, %Q0, %R0" :: "r"((long long)0)' | ||||
| check_inline_asm asm_mod_y '"vmul.i32 d0, d0, %y0" :: "x"(0)' | |||||
| [ $target_os != win32 ] && enabled_all armv6t2 shared !pic && enable_weak_pic | [ $target_os != win32 ] && enabled_all armv6t2 shared !pic && enable_weak_pic | ||||
| @@ -79,27 +79,4 @@ static inline int decode_blockcodes(int code1, int code2, int levels, | |||||
| #endif | #endif | ||||
| #if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y | |||||
| #define int8x8_fmul_int32 int8x8_fmul_int32 | |||||
| static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp, | |||||
| float *dst, const int8_t *src, int scale) | |||||
| { | |||||
| __asm__ ("vcvt.f32.s32 %2, %2, #4 \n" | |||||
| "vld1.8 {d0}, [%1,:64] \n" | |||||
| "vmovl.s8 q0, d0 \n" | |||||
| "vmovl.s16 q1, d1 \n" | |||||
| "vmovl.s16 q0, d0 \n" | |||||
| "vcvt.f32.s32 q0, q0 \n" | |||||
| "vcvt.f32.s32 q1, q1 \n" | |||||
| "vmul.f32 q0, q0, %y2 \n" | |||||
| "vmul.f32 q1, q1, %y2 \n" | |||||
| "vst1.32 {q0-q1}, [%m0,:128] \n" | |||||
| : "=Um"(*(float (*)[8])dst) | |||||
| : "r"(src), "x"(scale) | |||||
| : "d0", "d1", "d2", "d3"); | |||||
| } | |||||
| #endif | |||||
| #endif /* AVCODEC_ARM_DCA_H */ | #endif /* AVCODEC_ARM_DCA_H */ | ||||
| @@ -49,14 +49,10 @@ | |||||
| #if ARCH_ARM | #if ARCH_ARM | ||||
| # include "arm/dca.h" | # include "arm/dca.h" | ||||
| #endif | #endif | ||||
| #if ARCH_X86 | |||||
| # include "x86/dca.h" | |||||
| #endif | |||||
| //#define TRACE | //#define TRACE | ||||
| #define DCA_PRIM_CHANNELS_MAX (7) | #define DCA_PRIM_CHANNELS_MAX (7) | ||||
| #define DCA_SUBBANDS (64) | |||||
| #define DCA_ABITS_MAX (32) /* Should be 28 */ | #define DCA_ABITS_MAX (32) /* Should be 28 */ | ||||
| #define DCA_SUBSUBFRAMES_MAX (4) | #define DCA_SUBSUBFRAMES_MAX (4) | ||||
| #define DCA_SUBFRAMES_MAX (16) | #define DCA_SUBFRAMES_MAX (16) | ||||
| @@ -403,7 +399,7 @@ typedef struct { | |||||
| int prediction_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< prediction VQ coefs | int prediction_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< prediction VQ coefs | ||||
| int bitalloc[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< bit allocation index | int bitalloc[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< bit allocation index | ||||
| int transition_mode[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< transition mode (transients) | int transition_mode[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< transition mode (transients) | ||||
| int scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2]; ///< scale factors (2 if transient) | |||||
| int32_t scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];///< scale factors (2 if transient) | |||||
| int joint_huff[DCA_PRIM_CHANNELS_MAX]; ///< joint subband scale factors codebook | int joint_huff[DCA_PRIM_CHANNELS_MAX]; ///< joint subband scale factors codebook | ||||
| int joint_scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< joint subband scale factors | int joint_scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< joint subband scale factors | ||||
| float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2]; ///< stereo downmix coefficients | float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2]; ///< stereo downmix coefficients | ||||
| @@ -416,7 +412,7 @@ typedef struct { | |||||
| uint8_t core_downmix_amode; ///< audio channel arrangement of embedded downmix | uint8_t core_downmix_amode; ///< audio channel arrangement of embedded downmix | ||||
| uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4]; ///< embedded downmix coefficients (9-bit codes) | uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4]; ///< embedded downmix coefficients (9-bit codes) | ||||
| int high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< VQ encoded high frequency subbands | |||||
| int32_t high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< VQ encoded high frequency subbands | |||||
| float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)]; ///< Low frequency effect data | float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)]; ///< Low frequency effect data | ||||
| int lfe_scale_factor; | int lfe_scale_factor; | ||||
| @@ -1249,14 +1245,6 @@ static int decode_blockcodes(int code1, int code2, int levels, int32_t *values) | |||||
| static const uint8_t abits_sizes[7] = { 7, 10, 12, 13, 15, 17, 19 }; | static const uint8_t abits_sizes[7] = { 7, 10, 12, 13, 15, 17, 19 }; | ||||
| static const uint8_t abits_levels[7] = { 3, 5, 7, 9, 13, 17, 25 }; | static const uint8_t abits_levels[7] = { 3, 5, 7, 9, 13, 17, 25 }; | ||||
| #ifndef int8x8_fmul_int32 | |||||
| static inline void int8x8_fmul_int32(DCADSPContext *dsp, float *dst, | |||||
| const int8_t *src, int scale) | |||||
| { | |||||
| dsp->int8x8_fmul_int32(dst, src, scale); | |||||
| } | |||||
| #endif | |||||
| static int dca_subsubframe(DCAContext *s, int base_channel, int block_index) | static int dca_subsubframe(DCAContext *s, int base_channel, int block_index) | ||||
| { | { | ||||
| int k, l; | int k, l; | ||||
| @@ -1381,20 +1369,16 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index) | |||||
| /* | /* | ||||
| * Decode VQ encoded high frequencies | * Decode VQ encoded high frequencies | ||||
| */ | */ | ||||
| for (l = s->vq_start_subband[k]; l < s->subband_activity[k]; l++) { | |||||
| /* 1 vector -> 32 samples but we only need the 8 samples | |||||
| * for this subsubframe. */ | |||||
| int hfvq = s->high_freq_vq[k][l]; | |||||
| if (s->subband_activity[k] > s->vq_start_subband[k]) { | |||||
| if (!s->debug_flag & 0x01) { | if (!s->debug_flag & 0x01) { | ||||
| av_log(s->avctx, AV_LOG_DEBUG, | av_log(s->avctx, AV_LOG_DEBUG, | ||||
| "Stream with high frequencies VQ coding\n"); | "Stream with high frequencies VQ coding\n"); | ||||
| s->debug_flag |= 0x01; | s->debug_flag |= 0x01; | ||||
| } | } | ||||
| int8x8_fmul_int32(&s->dcadsp, subband_samples[k][l], | |||||
| &high_freq_vq[hfvq][subsubframe * 8], | |||||
| s->scale_factor[k][l][0]); | |||||
| s->dcadsp.decode_hf(subband_samples[k], s->high_freq_vq[k], | |||||
| high_freq_vq, subsubframe * 8, | |||||
| s->scale_factor[k], s->vq_start_subband[k], | |||||
| s->subband_activity[k]); | |||||
| } | } | ||||
| } | } | ||||
| @@ -24,12 +24,22 @@ | |||||
| #include "libavutil/intreadwrite.h" | #include "libavutil/intreadwrite.h" | ||||
| #include "dcadsp.h" | #include "dcadsp.h" | ||||
| static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale) | |||||
| static void decode_hf_c(float dst[DCA_SUBBANDS][8], | |||||
| const int32_t vq_num[DCA_SUBBANDS], | |||||
| const int8_t hf_vq[1024][32], intptr_t vq_offset, | |||||
| int32_t scale[DCA_SUBBANDS][2], | |||||
| intptr_t start, intptr_t end) | |||||
| { | { | ||||
| float fscale = scale / 16.0; | |||||
| int i; | |||||
| for (i = 0; i < 8; i++) | |||||
| dst[i] = src[i] * fscale; | |||||
| int i, l; | |||||
| for (l = start; l < end; l++) { | |||||
| /* 1 vector -> 32 samples but we only need the 8 samples | |||||
| * for this subsubframe. */ | |||||
| const int8_t *ptr = &hf_vq[vq_num[l]][vq_offset]; | |||||
| float fscale = scale[l][0] * (1 / 16.0); | |||||
| for (i = 0; i < 8; i++) | |||||
| dst[l][i] = ptr[i] * fscale; | |||||
| } | |||||
| } | } | ||||
| static inline void | static inline void | ||||
| @@ -96,7 +106,7 @@ av_cold void ff_dcadsp_init(DCADSPContext *s) | |||||
| s->lfe_fir[0] = dca_lfe_fir0_c; | s->lfe_fir[0] = dca_lfe_fir0_c; | ||||
| s->lfe_fir[1] = dca_lfe_fir1_c; | s->lfe_fir[1] = dca_lfe_fir1_c; | ||||
| s->qmf_32_subbands = dca_qmf_32_subbands; | s->qmf_32_subbands = dca_qmf_32_subbands; | ||||
| s->int8x8_fmul_int32 = int8x8_fmul_int32_c; | |||||
| s->decode_hf = decode_hf_c; | |||||
| if (ARCH_ARM) ff_dcadsp_init_arm(s); | if (ARCH_ARM) ff_dcadsp_init_arm(s); | ||||
| if (ARCH_X86) ff_dcadsp_init_x86(s); | if (ARCH_X86) ff_dcadsp_init_x86(s); | ||||
| } | } | ||||
| @@ -22,6 +22,8 @@ | |||||
| #include "avfft.h" | #include "avfft.h" | ||||
| #include "synth_filter.h" | #include "synth_filter.h" | ||||
| #define DCA_SUBBANDS 64 | |||||
| typedef struct DCADSPContext { | typedef struct DCADSPContext { | ||||
| void (*lfe_fir[2])(float *out, const float *in, const float *coefs); | void (*lfe_fir[2])(float *out, const float *in, const float *coefs); | ||||
| void (*qmf_32_subbands)(float samples_in[32][8], int sb_act, | void (*qmf_32_subbands)(float samples_in[32][8], int sb_act, | ||||
| @@ -30,7 +32,11 @@ typedef struct DCADSPContext { | |||||
| int *synth_buf_offset, float synth_buf2[32], | int *synth_buf_offset, float synth_buf2[32], | ||||
| const float window[512], float *samples_out, | const float window[512], float *samples_out, | ||||
| float raXin[32], float scale); | float raXin[32], float scale); | ||||
| void (*int8x8_fmul_int32)(float *dst, const int8_t *src, int scale); | |||||
| void (*decode_hf)(float dst[DCA_SUBBANDS][8], | |||||
| const int32_t vq_num[DCA_SUBBANDS], | |||||
| const int8_t hf_vq[1024][32], intptr_t vq_offset, | |||||
| int32_t scale[DCA_SUBBANDS][2], | |||||
| intptr_t start, intptr_t end); | |||||
| } DCADSPContext; | } DCADSPContext; | ||||
| void ff_dcadsp_init(DCADSPContext *s); | void ff_dcadsp_init(DCADSPContext *s); | ||||
| @@ -26,18 +26,35 @@ pf_inv16: times 4 dd 0x3D800000 ; 1/16 | |||||
| SECTION_TEXT | SECTION_TEXT | ||||
| ; void int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale) | |||||
| %macro INT8X8_FMUL_INT32 0 | |||||
| cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale | |||||
| cvtsi2ss m0, scalem | |||||
| ; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS], | |||||
| ; const int8_t hf_vq[1024][32], intptr_t vq_offset, | |||||
| ; int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end) | |||||
| %macro DECODE_HF 0 | |||||
| cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end | |||||
| lea srcq, [srcq + offsetq] | |||||
| shl startq, 2 | |||||
| mov offsetd, endm | |||||
| %define DICT offsetq | |||||
| shl offsetq, 2 | |||||
| mov endm, offsetq | |||||
| .loop: | |||||
| %if ARCH_X86_64 | |||||
| mov offsetd, [scaleq + 2 * startq] | |||||
| cvtsi2ss m0, offsetd | |||||
| %else | |||||
| cvtsi2ss m0, [scaleq + 2 * startq] | |||||
| %endif | |||||
| mov offsetd, [numq + startq] | |||||
| mulss m0, [pf_inv16] | mulss m0, [pf_inv16] | ||||
| shl DICT, 5 | |||||
| shufps m0, m0, 0 | shufps m0, m0, 0 | ||||
| %if cpuflag(sse2) | %if cpuflag(sse2) | ||||
| %if cpuflag(sse4) | %if cpuflag(sse4) | ||||
| pmovsxbd m1, [srcq+0] | |||||
| pmovsxbd m2, [srcq+4] | |||||
| pmovsxbd m1, [srcq + DICT + 0] | |||||
| pmovsxbd m2, [srcq + DICT + 4] | |||||
| %else | %else | ||||
| movq m1, [srcq] | |||||
| movq m1, [srcq + DICT] | |||||
| punpcklbw m1, m1 | punpcklbw m1, m1 | ||||
| mova m2, m1 | mova m2, m1 | ||||
| punpcklwd m1, m1 | punpcklwd m1, m1 | ||||
| @@ -48,8 +65,8 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale | |||||
| cvtdq2ps m1, m1 | cvtdq2ps m1, m1 | ||||
| cvtdq2ps m2, m2 | cvtdq2ps m2, m2 | ||||
| %else | %else | ||||
| movd mm0, [srcq+0] | |||||
| movd mm1, [srcq+4] | |||||
| movd mm0, [srcq + DICT + 0] | |||||
| movd mm1, [srcq + DICT + 4] | |||||
| punpcklbw mm0, mm0 | punpcklbw mm0, mm0 | ||||
| punpcklbw mm1, mm1 | punpcklbw mm1, mm1 | ||||
| movq mm2, mm0 | movq mm2, mm0 | ||||
| @@ -67,27 +84,33 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale | |||||
| cvtpi2ps m3, mm2 | cvtpi2ps m3, mm2 | ||||
| cvtpi2ps m4, mm3 | cvtpi2ps m4, mm3 | ||||
| shufps m0, m0, 0 | shufps m0, m0, 0 | ||||
| emms | |||||
| shufps m1, m3, q1010 | shufps m1, m3, q1010 | ||||
| shufps m2, m4, q1010 | shufps m2, m4, q1010 | ||||
| %endif | %endif | ||||
| mulps m1, m0 | mulps m1, m0 | ||||
| mulps m2, m0 | mulps m2, m0 | ||||
| mova [dstq+ 0], m1 | |||||
| mova [dstq+16], m2 | |||||
| mova [dstq + 8 * startq + 0], m1 | |||||
| mova [dstq + 8 * startq + 16], m2 | |||||
| add startq, 4 | |||||
| cmp startq, endm | |||||
| jl .loop | |||||
| .end: | |||||
| %if notcpuflag(sse2) | |||||
| emms | |||||
| %endif | |||||
| REP_RET | REP_RET | ||||
| %endmacro | %endmacro | ||||
| %if ARCH_X86_32 | %if ARCH_X86_32 | ||||
| INIT_XMM sse | INIT_XMM sse | ||||
| INT8X8_FMUL_INT32 | |||||
| DECODE_HF | |||||
| %endif | %endif | ||||
| INIT_XMM sse2 | INIT_XMM sse2 | ||||
| INT8X8_FMUL_INT32 | |||||
| DECODE_HF | |||||
| INIT_XMM sse4 | INIT_XMM sse4 | ||||
| INT8X8_FMUL_INT32 | |||||
| DECODE_HF | |||||
| ; %1=v0/v1 %2=in1 %3=in2 | ; %1=v0/v1 %2=in1 %3=in2 | ||||
| %macro FIR_LOOP 2-3 | %macro FIR_LOOP 2-3 | ||||
| @@ -23,9 +23,15 @@ | |||||
| #include "libavutil/x86/cpu.h" | #include "libavutil/x86/cpu.h" | ||||
| #include "libavcodec/dcadsp.h" | #include "libavcodec/dcadsp.h" | ||||
| void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale); | |||||
| void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale); | |||||
| void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale); | |||||
| void ff_decode_hf_sse(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS], | |||||
| const int8_t hf_vq[1024][32], intptr_t vq_offset, | |||||
| int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end); | |||||
| void ff_decode_hf_sse2(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS], | |||||
| const int8_t hf_vq[1024][32], intptr_t vq_offset, | |||||
| int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end); | |||||
| void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS], | |||||
| const int8_t hf_vq[1024][32], intptr_t vq_offset, | |||||
| int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end); | |||||
| void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs); | void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs); | ||||
| void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs); | void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs); | ||||
| @@ -35,18 +41,18 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s) | |||||
| if (EXTERNAL_SSE(cpu_flags)) { | if (EXTERNAL_SSE(cpu_flags)) { | ||||
| #if ARCH_X86_32 | #if ARCH_X86_32 | ||||
| s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse; | |||||
| s->decode_hf = ff_decode_hf_sse; | |||||
| #endif | #endif | ||||
| s->lfe_fir[0] = ff_dca_lfe_fir0_sse; | s->lfe_fir[0] = ff_dca_lfe_fir0_sse; | ||||
| s->lfe_fir[1] = ff_dca_lfe_fir1_sse; | s->lfe_fir[1] = ff_dca_lfe_fir1_sse; | ||||
| } | } | ||||
| if (EXTERNAL_SSE2(cpu_flags)) { | if (EXTERNAL_SSE2(cpu_flags)) { | ||||
| s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse2; | |||||
| s->decode_hf = ff_decode_hf_sse2; | |||||
| } | } | ||||
| if (EXTERNAL_SSE4(cpu_flags)) { | if (EXTERNAL_SSE4(cpu_flags)) { | ||||
| s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse4; | |||||
| s->decode_hf = ff_decode_hf_sse4; | |||||
| } | } | ||||
| } | } | ||||