arm: Add VFP-accelerated version of synth_filter_float

Before After Mean StdDev Mean StdDev Change This function 9295.0 114.9 4853.2 83.5 +91.5% Overall 23699.8 397.6 19285.5 292.0 +22.9% Signed-off-by: Martin Storsjö <martin@martin.st>
12 years ago · 41ef1d360b
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -52,6 +52,7 @@ ARMV6-OBJS-$(CONFIG_VP8_DECODER)       += arm/vp8_armv6.o               \
                                          arm/vp8dsp_init_armv6.o       \
                                          arm/vp8dsp_armv6.o

 VFP-OBJS-$(CONFIG_DCA_DECODER)         += arm/synth_filter_vfp.o
 VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o

 NEON-OBJS                              += arm/fmtconvert_neon.o
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -32,6 +32,12 @@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)

 void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);

 void ff_synth_filter_float_vfp(FFTContext *imdct,
                               float *synth_buf_ptr, int *synth_buf_offset,
                               float synth_buf2[32], const float window[512],
                               float out[32], const float in[32],
                               float scale);

 void ff_synth_filter_float_neon(FFTContext *imdct,
                                float *synth_buf_ptr, int *synth_buf_offset,
                                float synth_buf2[32], const float window[512],
@@ -69,6 +75,8 @@ av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
 {
    int cpu_flags = av_get_cpu_flags();

    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
        s->synth_filter_float = ff_synth_filter_float_vfp;
    if (have_neon(cpu_flags))
        s->synth_filter_float = ff_synth_filter_float_neon;
 }
--- a/libavcodec/arm/synth_filter_vfp.S
+++ b/libavcodec/arm/synth_filter_vfp.S
@@ -0,0 +1,243 @@
 /*
 * Copyright (c) 2013 RISC OS Open Ltd
 * Author: Ben Avison <bavison@riscosopen.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/arm/asm.S"

 IMDCT         .req    r0
 ORIG_P_SB     .req    r1
 P_SB_OFF      .req    r2
 I             .req    r0
 P_SB2_UP      .req    r1
 OLDFPSCR      .req    r2
 P_SB2_DN      .req    r3
 P_WIN_DN      .req    r4
 P_OUT_DN      .req    r5
 P_SB          .req    r6
 J_WRAP        .req    r7
 P_WIN_UP      .req    r12
 P_OUT_UP      .req    r14

 SCALE         .req    s0
 SBUF_DAT_REV0 .req    s4
 SBUF_DAT_REV1 .req    s5
 SBUF_DAT_REV2 .req    s6
 SBUF_DAT_REV3 .req    s7
 VA0           .req    s8
 VA3           .req    s11
 VB0           .req    s12
 VB3           .req    s15
 VC0           .req    s8
 VC3           .req    s11
 VD0           .req    s12
 VD3           .req    s15
 SBUF_DAT0     .req    s16
 SBUF_DAT1     .req    s17
 SBUF_DAT2     .req    s18
 SBUF_DAT3     .req    s19
 SBUF_DAT_ALT0 .req    s20
 SBUF_DAT_ALT1 .req    s21
 SBUF_DAT_ALT2 .req    s22
 SBUF_DAT_ALT3 .req    s23
 WIN_DN_DAT0   .req    s24
 WIN_UP_DAT0   .req    s28


 .macro inner_loop  half, tail, head
 .if (OFFSET & (64*4)) == 0                @ even numbered call
        SBUF_DAT_THIS0 .req SBUF_DAT0
        SBUF_DAT_THIS1 .req SBUF_DAT1
        SBUF_DAT_THIS2 .req SBUF_DAT2
        SBUF_DAT_THIS3 .req SBUF_DAT3
  .ifnc "\head",""
        vldr    d8, [P_SB, #OFFSET]        @ d8 = SBUF_DAT
        vldr    d9, [P_SB, #OFFSET+8]
  .endif
 .else
        SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
        SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
        SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
        SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
  .ifnc "\head",""
        vldr    d10, [P_SB, #OFFSET]       @ d10 = SBUF_DAT_ALT
        vldr    d11, [P_SB, #OFFSET+8]
  .endif
 .endif
 .ifnc "\tail",""
  .ifc "\half","ab"
        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
  .else
        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
  .endif
 .endif
 .ifnc "\head",""
        vldr    d14, [P_WIN_UP, #OFFSET]   @ d14 = WIN_UP_DAT
        vldr    d15, [P_WIN_UP, #OFFSET+8]
        vldr    d12, [P_WIN_DN, #OFFSET]   @ d12 = WIN_DN_DAT
        vldr    d13, [P_WIN_DN, #OFFSET+8]
        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0
        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1
        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3
  .ifc "\half","ab"
        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .else
        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
  .endif
        teq     J_WRAP, #J
        bne     2f             @ strongly predictable, so better than cond exec in this case
        sub     P_SB, P_SB, #512*4
 2:
  .set J, J - 64
  .set OFFSET, OFFSET + 64*4
 .endif
        .unreq  SBUF_DAT_THIS0
        .unreq  SBUF_DAT_THIS1
        .unreq  SBUF_DAT_THIS2
        .unreq  SBUF_DAT_THIS3
 .endm


 /* void ff_synth_filter_float_vfp(FFTContext *imdct,
 *                                float *synth_buf_ptr, int *synth_buf_offset,
 *                                float synth_buf2[32], const float window[512],
 *                                float out[32], const float in[32], float scale)
 */
 function ff_synth_filter_float_vfp, export=1
        push    {r3-r7,lr}
        vpush   {s16-s31}
        ldr     lr, [P_SB_OFF]
        add     a2, ORIG_P_SB, lr, LSL #2 @ calculate synth_buf to pass to imdct_half
        mov     P_SB, a2                  @ and keep a copy for ourselves
        bic     J_WRAP, lr, #63           @ mangled to make testing for wrap easier in inner loop
        sub     lr, lr, #32
        and     lr, lr, #512-32
        str     lr, [P_SB_OFF]            @ rotate offset, modulo buffer size, ready for next call
        ldr     a3, [sp, #(16+6+2)*4]     @ fetch in from stack, to pass to imdct_half
 VFP     vmov    s16, SCALE                @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
        bl      ff_imdct_half_c
 VFP     vmov    SCALE, s16

        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000           @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        ldr     P_SB2_DN, [sp, #16*4]
        ldr     P_WIN_DN, [sp, #(16+6+0)*4]
        ldr     P_OUT_DN, [sp, #(16+6+1)*4]
 NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]

 #define IMM_OFF_SKEW 956                   /* also valid immediate constant when you add 16*4 */
        add     P_SB, P_SB, #IMM_OFF_SKEW  @ so we can use -ve offsets to use full immediate offset range
        add     P_SB2_UP, P_SB2_DN, #16*4
        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
        add     P_OUT_UP, P_OUT_DN, #16*4
        add     P_SB2_DN, P_SB2_DN, #16*4
        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
        add     P_OUT_DN, P_OUT_DN, #16*4
        mov     I, #4
 1:
        vldmia  P_SB2_UP!, {VB0-VB3}
        vldmdb  P_SB2_DN!, {VA0-VA3}
 .set J, 512 - 64
 .set OFFSET, -IMM_OFF_SKEW
        inner_loop  ab,, head
 .rept 7
        inner_loop  ab, tail, head
 .endr
        inner_loop  ab, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        vmul.f  VB0, VB0, SCALE      @ SCALE treated as scalar
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vmul.f  VA0, VA0, SCALE
        vstmia  P_OUT_UP!, {VB0-VB3}
        vstmdb  P_OUT_DN!, {VA0-VA3}
        bne     1b

        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4
        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4
        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
        mov     I, #4
 1:
        vldr.d  d4, zero             @ d4 = VC0
        vldr.d  d5, zero
        vldr.d  d6, zero             @ d6 = VD0
        vldr.d  d7, zero
 .set J, 512 - 64
 .set OFFSET, -IMM_OFF_SKEW
        inner_loop  cd,, head
 .rept 7
        inner_loop  cd, tail, head
 .endr
        inner_loop  cd, tail
        add     P_WIN_UP, P_WIN_UP, #4*4
        sub     P_WIN_DN, P_WIN_DN, #4*4
        add     P_SB, P_SB, #(512+4)*4
        subs    I, I, #1
        vstmia  P_SB2_UP!, {VC0-VC3}
        vstmdb  P_SB2_DN!, {VD0-VD3}
        bne     1b

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s31}
        pop     {r3-r7,pc}
 endfunc

        .unreq  IMDCT
        .unreq  ORIG_P_SB
        .unreq  P_SB_OFF
        .unreq  I
        .unreq  P_SB2_UP
        .unreq  OLDFPSCR
        .unreq  P_SB2_DN
        .unreq  P_WIN_DN
        .unreq  P_OUT_DN
        .unreq  P_SB
        .unreq  J_WRAP
        .unreq  P_WIN_UP
        .unreq  P_OUT_UP

        .unreq  SCALE
        .unreq  SBUF_DAT_REV0
        .unreq  SBUF_DAT_REV1
        .unreq  SBUF_DAT_REV2
        .unreq  SBUF_DAT_REV3
        .unreq  VA0
        .unreq  VA3
        .unreq  VB0
        .unreq  VB3
        .unreq  VC0
        .unreq  VC3
        .unreq  VD0
        .unreq  VD3
        .unreq  SBUF_DAT0
        .unreq  SBUF_DAT1
        .unreq  SBUF_DAT2
        .unreq  SBUF_DAT3
        .unreq  SBUF_DAT_ALT0
        .unreq  SBUF_DAT_ALT1
        .unreq  SBUF_DAT_ALT2
        .unreq  SBUF_DAT_ALT3
        .unreq  WIN_DN_DAT0
        .unreq  WIN_UP_DAT0

        .align  3
 zero:   .word   0, 0