Merge remote-tracking branch 'qatar/master'

* qatar/master: arm: Add VFP-accelerated version of qmf_32_subbands Merged-by: Michael Niedermayer <michaelni@gmx.at>
12 years ago · 23f250d2bc
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -26,6 +26,12 @@

 void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
                        int decifactor, float scale);
 void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
                                SynthFilterContext *synth, FFTContext *imdct,
                                float synth_buf_ptr[512],
                                int *synth_buf_offset, float synth_buf2[32],
                                const float window[512], float *samples_out,
                                float raXin[32], float scale);
 void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
                         int decifactor, float scale);

@@ -33,8 +39,10 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
 {
    int cpu_flags = av_get_cpu_flags();

    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
        s->lfe_fir = ff_dca_lfe_fir_vfp;
        s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
    }
    if (have_neon(cpu_flags))
        s->lfe_fir = ff_dca_lfe_fir_neon;
 }
--- a/libavcodec/arm/dcadsp_vfp.S
+++ b/libavcodec/arm/dcadsp_vfp.S
@@ -218,3 +218,276 @@ endfunc
        .unreq  POST1
        .unreq  POST2
        .unreq  POST3


 IN      .req    a1
 SBACT   .req    a2
 OLDFPSCR .req   a3
 IMDCT   .req    a4
 WINDOW  .req    v1
 OUT     .req    v2
 BUF     .req    v3
 SCALEINT .req   v4 @ only used in softfp case
 COUNT   .req    v5

 SCALE   .req    s0

 /* Stack layout differs in softfp and hardfp cases:
 *
 * hardfp
 *      fp -> 6 arg words saved by caller
 *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *            s0 on entry
 *      sp -> 3 arg words for callee
 *
 * softfp
 *      fp -> 7 arg words saved by caller
 *            a4,v1-v5,fp,lr on entry
 *            s16-s23 on entry
 *            align 16
 *     buf -> 8*32*4 bytes buffer
 *      sp -> 4 arg words for callee
 */

 /* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
 *                                 SynthFilterContext *synth, FFTContext *imdct,
 *                                 float (*synth_buf_ptr)[512],
 *                                 int *synth_buf_offset, float (*synth_buf2)[32],
 *                                 const float (*window)[512], float *samples_out,
 *                                 float (*raXin)[32], float scale);
 */
 function ff_dca_qmf_32_subbands_vfp, export=1
 VFP     push    {a3-a4,v1-v3,v5,fp,lr}
 NOVFP   push    {a4,v1-v5,fp,lr}
        add     fp, sp, #8*4
        vpush   {s16-s23}
        @ The buffer pointed at by raXin isn't big enough for us to do a
        @ complete matrix transposition as we want to, so allocate an
        @ alternative buffer from the stack. Align to 4 words for speed.
        sub     BUF, sp, #8*32*4
        bic     BUF, BUF, #15
        mov     sp, BUF
        ldr     lr, =0x03330000     @ RunFast mode, short vectors of length 4, stride 2
        fmrx    OLDFPSCR, FPSCR
        fmxr    FPSCR, lr
        @ COUNT is used to count down 2 things at once:
        @ bits 0-4 are the number of word pairs remaining in the output row
        @ bits 5-31 are the number of words to copy (with possible negation)
        @   from the source matrix before we start zeroing the remainder
        mov     COUNT, #(-4 << 5) + 16
        adds    COUNT, COUNT, SBACT, lsl #5
        bmi     2f
 1:
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9,  [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        vldr    s9,  [IN, #(3*8+0)*4]
        vldr    s11, [IN, #(3*8+1)*4]
        vldr    s13, [IN, #(3*8+2)*4]
        vldr    s15, [IN, #(3*8+3)*4]
        vldr    s17, [IN, #(3*8+4)*4]
        vldr    s19, [IN, #(3*8+5)*4]
        vldr    s21, [IN, #(3*8+6)*4]
        vldr    s23, [IN, #(3*8+7)*4]
        vneg.f  s9, s9
        vldr    s8,  [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vneg.f  s17, s17
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vstr    d4,  [BUF, #(0*32+2)*4]
        vstr    d5,  [BUF, #(1*32+2)*4]
        vstr    d6,  [BUF, #(2*32+2)*4]
        vstr    d7,  [BUF, #(3*32+2)*4]
        vstr    d8,  [BUF, #(4*32+2)*4]
        vstr    d9,  [BUF, #(5*32+2)*4]
        vstr    d10, [BUF, #(6*32+2)*4]
        vstr    d11, [BUF, #(7*32+2)*4]
        add     IN, IN, #4*8*4
        add     BUF, BUF, #4*4
        subs    COUNT, COUNT, #(4 << 5) + 2
        bpl     1b
 2:      @ Now deal with trailing < 4 samples
        adds    COUNT, COUNT, #3 << 5
        bmi     4f  @ sb_act was a multiple of 4
        bics    lr, COUNT, #0x1F
        bne     3f
        @ sb_act was n*4+1
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9,  zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vneg.f  s16, s16
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
        b       4f
 3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
        vldr    s8,  [IN, #(0*8+0)*4]
        vldr    s10, [IN, #(0*8+1)*4]
        vldr    s12, [IN, #(0*8+2)*4]
        vldr    s14, [IN, #(0*8+3)*4]
        vldr    s16, [IN, #(0*8+4)*4]
        vldr    s18, [IN, #(0*8+5)*4]
        vldr    s20, [IN, #(0*8+6)*4]
        vldr    s22, [IN, #(0*8+7)*4]
        vneg.f  s8, s8
        vldr    s9,  [IN, #(1*8+0)*4]
        vldr    s11, [IN, #(1*8+1)*4]
        vldr    s13, [IN, #(1*8+2)*4]
        vldr    s15, [IN, #(1*8+3)*4]
        vneg.f  s16, s16
        vldr    s17, [IN, #(1*8+4)*4]
        vldr    s19, [IN, #(1*8+5)*4]
        vldr    s21, [IN, #(1*8+6)*4]
        vldr    s23, [IN, #(1*8+7)*4]
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #(2 << 5) + 1
        bics    lr, COUNT, #0x1F
        bne     4f
        @ sb_act was n*4+3
        vldr    s8,  [IN, #(2*8+0)*4]
        vldr    s10, [IN, #(2*8+1)*4]
        vldr    s12, [IN, #(2*8+2)*4]
        vldr    s14, [IN, #(2*8+3)*4]
        vldr    s16, [IN, #(2*8+4)*4]
        vldr    s18, [IN, #(2*8+5)*4]
        vldr    s20, [IN, #(2*8+6)*4]
        vldr    s22, [IN, #(2*8+7)*4]
        vldr    s9,  zero
        vldr    s11, zero
        vldr    s13, zero
        vldr    s15, zero
        vldr    s17, zero
        vldr    s19, zero
        vldr    s21, zero
        vldr    s23, zero
        vstr    d4,  [BUF, #(0*32+0)*4]
        vstr    d5,  [BUF, #(1*32+0)*4]
        vstr    d6,  [BUF, #(2*32+0)*4]
        vstr    d7,  [BUF, #(3*32+0)*4]
        vstr    d8,  [BUF, #(4*32+0)*4]
        vstr    d9,  [BUF, #(5*32+0)*4]
        vstr    d10, [BUF, #(6*32+0)*4]
        vstr    d11, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        sub     COUNT, COUNT, #1
 4:      @ Now fill the remainder with 0
        vldr    s8, zero
        vldr    s9, zero
        ands    COUNT, COUNT, #0x1F
        beq     6f
 5:      vstr    d4, [BUF, #(0*32+0)*4]
        vstr    d4, [BUF, #(1*32+0)*4]
        vstr    d4, [BUF, #(2*32+0)*4]
        vstr    d4, [BUF, #(3*32+0)*4]
        vstr    d4, [BUF, #(4*32+0)*4]
        vstr    d4, [BUF, #(5*32+0)*4]
        vstr    d4, [BUF, #(6*32+0)*4]
        vstr    d4, [BUF, #(7*32+0)*4]
        add     BUF, BUF, #2*4
        subs    COUNT, COUNT, #1
        bne     5b
 6:
        fmxr    FPSCR, OLDFPSCR
        ldr     WINDOW, [fp, #3*4]
        ldr     OUT, [fp, #4*4]
        sub     BUF, BUF, #32*4
 NOVFP   ldr     SCALEINT, [fp, #6*4]
        mov     COUNT, #8
 VFP     vpush   {SCALE}
 VFP     sub     sp, sp, #3*4
 NOVFP   sub     sp, sp, #4*4
 7:
 VFP     ldr     a1, [fp, #-7*4]     @ imdct
 NOVFP   ldr     a1, [fp, #-8*4]
        ldmia   fp, {a2-a4}
 VFP     stmia   sp, {WINDOW, OUT, BUF}
 NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
 VFP     vldr    SCALE, [sp, #3*4]
        bl      ff_synth_filter_float_vfp
        add     OUT, OUT, #32*4
        add     BUF, BUF, #32*4
        subs    COUNT, COUNT, #1
        bne     7b

 A       sub     sp, fp, #(8+8)*4
 T       sub     fp, fp, #(8+8)*4
 T       mov     sp, fp
        vpop    {s16-s23}
 VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
 NOVFP   pop     {a4,v1-v5,fp,pc}
 endfunc

        .unreq  IN
        .unreq  SBACT
        .unreq  OLDFPSCR
        .unreq  IMDCT
        .unreq  WINDOW
        .unreq  OUT
        .unreq  BUF
        .unreq  SCALEINT
        .unreq  COUNT

        .unreq  SCALE

        .align 2
 zero:   .word   0