Merge commit '4cb6964244fd6c099383d8b7e99731e72cc844b9'

* commit '4cb6964244fd6c099383d8b7e99731e72cc844b9':
  dcadec: simplify decoding of VQ high frequencies

Conflicts:
    configure
    libavcodec/dcadec.c

Merged-by: Michael Niedermayer <michaelni@gmx.at>
11 years ago · fb3c33f3cd
--- a/configure
+++ b/configure
@@ -1540,7 +1540,6 @@ HAVE_LIST="
    altivec_h
    arpa_inet_h
    asm_mod_q
    asm_mod_y
    asm_types_h
    atomic_cas_ptr
    atomics_native
@@ -4147,7 +4146,6 @@ EOF
            $ARCH_EXT_LIST_ARM
    check_inline_asm asm_mod_q '"add r0, %Q0, %R0" :: "r"((long long)0)'
    check_inline_asm asm_mod_y '"vmul.i32 d0, d0, %y0" :: "x"(0)'
    [ $target_os != win32 ] && enabled_all armv6t2 shared !pic && enable_weak_pic
--- a/libavcodec/arm/dca.h
+++ b/libavcodec/arm/dca.h
@@ -79,27 +79,4 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
 #endif
 #if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
 #define int8x8_fmul_int32 int8x8_fmul_int32
 /**
  * NEON inline-asm implementation of int8x8_fmul_int32: load eight signed
  * 8-bit values from src, convert them to float, multiply each by the
  * scale and store the eight results to dst.
  *
  * The first instruction converts scale in place from a signed fixed-point
  * value with 4 fractional bits to float, so the effective multiplier is
  * scale / 16. The load carries a 64-bit alignment hint on src and the
  * store a 128-bit alignment hint on dst.
  *
  * @param dsp   not used by this implementation (marked av_unused)
  * @param dst   output, written as 8 floats
  * @param src   input, read as 8 int8_t values
  * @param scale integer scale factor, interpreted as described above
  *
  * NOTE(review): operand %2 (scale) is declared as an input only but is
  * overwritten by the first vcvt; confirm this is safe with the compilers
  * in use.
  */
 static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
                                     float *dst, const int8_t *src, int scale)
 {
    __asm__ ("vcvt.f32.s32 %2,  %2,  #4         \n"
             "vld1.8       {d0},     [%1,:64]   \n"
             "vmovl.s8     q0,  d0              \n"
             "vmovl.s16    q1,  d1              \n"
             "vmovl.s16    q0,  d0              \n"
             "vcvt.f32.s32 q0,  q0              \n"
             "vcvt.f32.s32 q1,  q1              \n"
             "vmul.f32     q0,  q0,  %y2        \n"
             "vmul.f32     q1,  q1,  %y2        \n"
             "vst1.32      {q0-q1},  [%m0,:128] \n"
             /* output: the 8 floats at dst, as a memory operand */
             : "=Um"(*(float (*)[8])dst)
             /* inputs: source pointer and the scale value */
             : "r"(src), "x"(scale)
             /* d0-d3 (i.e. q0-q1) are used as scratch registers */
             : "d0", "d1", "d2", "d3");
 }
 #endif
 #endif /* AVCODEC_ARM_DCA_H */
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -49,14 +49,10 @@
 #if ARCH_ARM
 #   include "arm/dca.h"
 #endif
 #if ARCH_X86
 #   include "x86/dca.h"
 #endif
 //#define TRACE
 #define DCA_PRIM_CHANNELS_MAX  (7)
 #define DCA_SUBBANDS          (64)
 #define DCA_ABITS_MAX         (32)      /* Should be 28 */
 #define DCA_SUBSUBFRAMES_MAX   (4)
 #define DCA_SUBFRAMES_MAX     (16)
@@ -403,7 +399,7 @@ typedef struct {
    int prediction_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];      ///< prediction VQ coefs
    int bitalloc[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];           ///< bit allocation index
    int transition_mode[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];    ///< transition mode (transients)
    int scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];    ///< scale factors (2 if transient)
    int32_t scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];///< scale factors (2 if transient)
    int joint_huff[DCA_PRIM_CHANNELS_MAX];                       ///< joint subband scale factors codebook
    int joint_scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< joint subband scale factors
    float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2];            ///< stereo downmix coefficients
@@ -416,7 +412,7 @@ typedef struct {
    uint8_t  core_downmix_amode;                                 ///< audio channel arrangement of embedded downmix
    uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4];   ///< embedded downmix coefficients (9-bit codes)
    int high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];       ///< VQ encoded high frequency subbands
    int32_t  high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS];  ///< VQ encoded high frequency subbands
    float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)];      ///< Low frequency effect data
    int lfe_scale_factor;
@@ -1249,14 +1245,6 @@ static int decode_blockcodes(int code1, int code2, int levels, int32_t *values)
 static const uint8_t abits_sizes[7]  = { 7, 10, 12, 13, 15, 17, 19 };
 static const uint8_t abits_levels[7] = { 3,  5,  7,  9, 13, 17, 25 };
 #ifndef int8x8_fmul_int32
 /* Generic fallback, compiled only when no int8x8_fmul_int32 macro has been
  * defined before this point (see the enclosing #ifndef): forwards the call
  * to the int8x8_fmul_int32 function pointer stored in the DSP context. */
 static inline void int8x8_fmul_int32(DCADSPContext *dsp, float *dst,
                                     const int8_t *src, int scale)
 {
    dsp->int8x8_fmul_int32(dst, src, scale);
 }
 #endif
 static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
 {
    int k, l;
@@ -1381,20 +1369,16 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
        /*
         * Decode VQ encoded high frequencies
         */
        for (l = s->vq_start_subband[k]; l < s->subband_activity[k]; l++) {
            /* 1 vector -> 32 samples but we only need the 8 samples
             * for this subsubframe. */
            int hfvq = s->high_freq_vq[k][l];
        if (s->subband_activity[k] > s->vq_start_subband[k]) {
            /* Emit the notice only once: bit 0x01 of debug_flag records that
             * it has already been logged. The parentheses are required
             * because '!' binds tighter than '&'; without them the test is
             * (!debug_flag) & 1, which is false as soon as any other bit of
             * debug_flag is set. */
            if (!(s->debug_flag & 0x01)) {
                av_log(s->avctx, AV_LOG_DEBUG,
                       "Stream with high frequencies VQ coding\n");
                s->debug_flag |= 0x01;
            }
            int8x8_fmul_int32(&s->dcadsp, subband_samples[k][l],
                              &high_freq_vq[hfvq][subsubframe * 8],
                              s->scale_factor[k][l][0]);
            s->dcadsp.decode_hf(subband_samples[k], s->high_freq_vq[k],
                                high_freq_vq, subsubframe * 8,
                                s->scale_factor[k], s->vq_start_subband[k],
                                s->subband_activity[k]);
        }
    }
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -24,12 +24,22 @@
 #include "libavutil/intreadwrite.h"
 #include "dcadsp.h"
 static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale)
 static void decode_hf_c(float dst[DCA_SUBBANDS][8],
                        const int32_t vq_num[DCA_SUBBANDS],
                        const int8_t hf_vq[1024][32], intptr_t vq_offset,
                        int32_t scale[DCA_SUBBANDS][2],
                        intptr_t start, intptr_t end)
 {
    float fscale = scale / 16.0;
    int i;
    for (i = 0; i < 8; i++)
        dst[i] = src[i] * fscale;
    int i, l;
    for (l = start; l < end; l++) {
        /* 1 vector -> 32 samples but we only need the 8 samples
         * for this subsubframe. */
        const int8_t *ptr = &hf_vq[vq_num[l]][vq_offset];
        float fscale = scale[l][0] * (1 / 16.0);
        for (i = 0; i < 8; i++)
            dst[l][i] = ptr[i] * fscale;
    }
 }
 static inline void
@@ -96,7 +106,7 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
    s->lfe_fir[0] = dca_lfe_fir0_c;
    s->lfe_fir[1] = dca_lfe_fir1_c;
    s->qmf_32_subbands = dca_qmf_32_subbands;
    s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
    s->decode_hf = decode_hf_c;
    if (ARCH_ARM) ff_dcadsp_init_arm(s);
    if (ARCH_X86) ff_dcadsp_init_x86(s);
 }
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -22,6 +22,8 @@
 #include "avfft.h"
 #include "synth_filter.h"
 #define DCA_SUBBANDS 64
 typedef struct DCADSPContext {
    void (*lfe_fir[2])(float *out, const float *in, const float *coefs);
    void (*qmf_32_subbands)(float samples_in[32][8], int sb_act,
@@ -30,7 +32,11 @@ typedef struct DCADSPContext {
                            int *synth_buf_offset, float synth_buf2[32],
                            const float window[512], float *samples_out,
                            float raXin[32], float scale);
    void (*int8x8_fmul_int32)(float *dst, const int8_t *src, int scale);
    void (*decode_hf)(float dst[DCA_SUBBANDS][8],
                      const int32_t vq_num[DCA_SUBBANDS],
                      const int8_t hf_vq[1024][32], intptr_t vq_offset,
                      int32_t scale[DCA_SUBBANDS][2],
                      intptr_t start, intptr_t end);
 } DCADSPContext;
 void ff_dcadsp_init(DCADSPContext *s);
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -26,18 +26,35 @@ pf_inv16:  times 4 dd 0x3D800000 ; 1/16
 SECTION_TEXT
 ; void int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale)
 %macro INT8X8_FMUL_INT32 0
 cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
    cvtsi2ss    m0, scalem
 ; void decode_hf(float dst[DCA_SUBBANDS][8], const int32_t vq_num[DCA_SUBBANDS],
 ;                const int8_t hf_vq[1024][32], intptr_t vq_offset,
 ;                int32_t scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end)
 %macro DECODE_HF 0
 cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end
    lea       srcq, [srcq + offsetq]
    shl     startq, 2
    mov    offsetd, endm
 %define DICT offsetq
    shl    offsetq, 2
    mov       endm, offsetq
 .loop:
 %if ARCH_X86_64
    mov    offsetd, [scaleq + 2 * startq]
    cvtsi2ss    m0, offsetd
 %else
    cvtsi2ss    m0, [scaleq + 2 * startq]
 %endif
    mov    offsetd, [numq + startq]
    mulss       m0, [pf_inv16]
    shl       DICT, 5
    shufps      m0, m0, 0
 %if cpuflag(sse2)
 %if cpuflag(sse4)
    pmovsxbd    m1, [srcq+0]
    pmovsxbd    m2, [srcq+4]
    pmovsxbd    m1, [srcq + DICT + 0]
    pmovsxbd    m2, [srcq + DICT + 4]
 %else
    movq        m1, [srcq]
    movq        m1, [srcq + DICT]
    punpcklbw   m1, m1
    mova        m2, m1
    punpcklwd   m1, m1
@@ -48,8 +65,8 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
 %else
    movd       mm0, [srcq+0]
    movd       mm1, [srcq+4]
    movd       mm0, [srcq + DICT + 0]
    movd       mm1, [srcq + DICT + 4]
    punpcklbw  mm0, mm0
    punpcklbw  mm1, mm1
    movq       mm2, mm0
@@ -67,27 +84,33 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale
    cvtpi2ps    m3, mm2
    cvtpi2ps    m4, mm3
    shufps      m0, m0, 0
    emms
    shufps      m1, m3, q1010
    shufps      m2, m4, q1010
 %endif
    mulps       m1, m0
    mulps       m2, m0
    mova [dstq+ 0], m1
    mova [dstq+16], m2
    mova [dstq + 8 * startq +  0], m1
    mova [dstq + 8 * startq + 16], m2
    add     startq, 4
    cmp     startq, endm
    jl       .loop
 .end:
 %if notcpuflag(sse2)
    emms
 %endif
    REP_RET
 %endmacro
 %if ARCH_X86_32
 INIT_XMM sse
 INT8X8_FMUL_INT32
 DECODE_HF
 %endif
 INIT_XMM sse2
 INT8X8_FMUL_INT32
 DECODE_HF
 INIT_XMM sse4
 INT8X8_FMUL_INT32
 DECODE_HF
 ; %1=v0/v1  %2=in1  %3=in2
 %macro FIR_LOOP 2-3
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -23,9 +23,15 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/dcadsp.h"
 void ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale);
 void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale);
 void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale);
 /* Prototypes for the SSE/SSE2/SSE4 variants of decode_hf that are assigned
  * to DCADSPContext.decode_hf below. The element types are spelled int32_t
  * so the signatures match the decode_hf function pointer declared in
  * dcadsp.h exactly, rather than relying on int and int32_t being the same
  * type. */
 void ff_decode_hf_sse(float dst[DCA_SUBBANDS][8],
                      const int32_t vq_num[DCA_SUBBANDS],
                      const int8_t hf_vq[1024][32], intptr_t vq_offset,
                      int32_t scale[DCA_SUBBANDS][2],
                      intptr_t start, intptr_t end);
 void ff_decode_hf_sse2(float dst[DCA_SUBBANDS][8],
                       const int32_t vq_num[DCA_SUBBANDS],
                       const int8_t hf_vq[1024][32], intptr_t vq_offset,
                       int32_t scale[DCA_SUBBANDS][2],
                       intptr_t start, intptr_t end);
 void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8],
                       const int32_t vq_num[DCA_SUBBANDS],
                       const int8_t hf_vq[1024][32], intptr_t vq_offset,
                       int32_t scale[DCA_SUBBANDS][2],
                       intptr_t start, intptr_t end);
 void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs);
 void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs);
@@ -35,18 +41,18 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
    if (EXTERNAL_SSE(cpu_flags)) {
 #if ARCH_X86_32
        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse;
        s->decode_hf = ff_decode_hf_sse;
 #endif
        s->lfe_fir[0]        = ff_dca_lfe_fir0_sse;
        s->lfe_fir[1]        = ff_dca_lfe_fir1_sse;
    }
    if (EXTERNAL_SSE2(cpu_flags)) {
        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse2;
        s->decode_hf = ff_decode_hf_sse2;
    }
    if (EXTERNAL_SSE4(cpu_flags)) {
        s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse4;
        s->decode_hf = ff_decode_hf_sse4;
    }
 }