|
- /*
- * Copyright (c) 2013 RISC OS Open Ltd
- * Author: Ben Avison <bavison@riscosopen.org>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
- #include "libavutil/arm/asm.S"
- /* Register aliases for the LFE FIR routines generated by the dca_lfe_fir
- * macro. The routines run with VFP short vectors of length 4 enabled.
- * IN0-IN7 live in s0-s7, the VFP scalar bank, so they act as scalars in
- * the vector multiplies; the COEF, ACCUM and POST registers each name the
- * first element of a 4-element vector.
- */
- POUT .req a1 @ output pointer, advanced by each vstmia
- PIN .req a2 @ input pointer; samples are read at offsets 0, -4, -8, ...
- PCOEF .req a3 @ coefficient pointer
- OLDFPSCR .req a4 @ FPSCR value on entry, restored before return
- COUNTER .req ip @ loop counter (ip also briefly holds the new FPSCR value)
-
- IN0 .req s4 @ input samples; IN4-IN7 are only loaded when decifactor is 32
- IN1 .req s5
- IN2 .req s6
- IN3 .req s7
- IN4 .req s0
- IN5 .req s1
- IN6 .req s2
- IN7 .req s3
- COEF0 .req s8 @ coefficient elements
- COEF1 .req s9
- COEF2 .req s10
- COEF3 .req s11
- COEF4 .req s12
- COEF5 .req s13
- COEF6 .req s14
- COEF7 .req s15
- ACCUM0 .req s16 @ double-buffered multiply-accumulate results
- ACCUM4 .req s20
- POST0 .req s24 @ receives ACCUM0 + ACCUM4; POST0-POST3 are what gets stored
- POST1 .req s25
- POST2 .req s26
- POST3 .req s27
-
- /* inner_loop decifactor, dir, tail, head
- *
- * Emits one stage of the software-pipelined FIR loop. A stage can contain
- * the "head" of one iteration, the "tail" of the previous iteration, or
- * both interleaved:
- * head - load 4 coefficients per tap and multiply-accumulate them against
- * the input samples, even taps into ACCUM0 and odd taps into ACCUM4
- * tail - add ACCUM0 and ACCUM4 into POST0-POST3 and store those four
- * floats through POUT with writeback
- *
- * Arguments:
- * decifactor - when 32, eight taps (IN0-IN7) are emitted, otherwise four
- * dir - "up" reads coefficients at ascending word offsets from PCOEF;
- * any other value reads the same 4*JMAX-word block in reverse
- * tail - non-empty to emit the tail part
- * head - non-empty to emit the head part
- *
- * The symbol JMAX must have been .set before the macro is expanded.
- * The first odd-tap multiply is emitted at one of two positions depending
- * on whether a tail is present; presumably this is for instruction
- * scheduling - TODO confirm.
- */
- .macro inner_loop decifactor, dir, tail, head
- .ifc "\dir","up"
- .set X, 0 @ byte offset of coefficient index 0
- .set Y, 4 @ byte step per coefficient index
- .else
- .set X, 4*JMAX*4 - 4 @ start at the last word of the 4*JMAX-word block
- .set Y, -4 @ and step backwards
- .endif
- .ifnc "\head",""
- vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
- .endif
- .ifnc "\tail",""
- vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
- .endif
- .ifnc "\head",""
- vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
- .endif
- .ifnc "\head",""
- vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
- .ifc "\tail","" @ no tail in this stage: multiply right away
- vmul.f ACCUM4, COEF4, IN1 @ vector operation
- .endif
- vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
- .ifnc "\tail","" @ tail present: same multiply, two loads later
- vmul.f ACCUM4, COEF4, IN1 @ vector operation
- .endif
- vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
- .endif
- .ifnc "\tail",""
- vstmia POUT!, {POST0-POST3}
- .endif
- .ifnc "\head",""
- vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
- vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
- .if \decifactor == 32 @ four more taps, using IN4-IN7
- vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
- vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
- vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
- vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
- vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
- vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
- vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
- vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
- vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
- vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
- vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
- vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
- vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
- .endif
- .endif
- .endm
- /* dca_lfe_fir decifactor
- *
- * Defines the exported function ff_dca_lfe_fir<decifactor>_vfp.
- *
- * Register use on entry, as established by the aliases in this file:
- * a1 (POUT) - written through with writeback, four floats at a time
- * a2 (PIN) - input samples are read at PIN[0], PIN[-1], ... (4 samples,
- * or 8 when decifactor is 32)
- * a3 (PCOEF) - coefficients, read in blocks of 4*JMAX words
- * The C prototype is not visible in this file; presumably
- * (float *out, const float *in, const float *coefs) - TODO confirm.
- *
- * The function makes two passes of decifactor/4 pipelined iterations,
- * each iteration storing 4 floats, so 2*decifactor floats are written in
- * total. The first pass walks PCOEF forwards one block per iteration; the
- * second pass starts from the block the first pass ended on, reads each
- * block in reverse ("down") and walks PCOEF backwards.
- *
- * FPSCR is switched to RunFast mode with short vectors for the duration
- * and restored before returning. The callee-saved VFP registers used are
- * pushed and popped around the body.
- * NOTE(review): for decifactor 32 s16-s31 are saved, but no alias in this
- * file maps to s28-s31 - confirm whether saving them is still needed.
- */
- .macro dca_lfe_fir decifactor
- function ff_dca_lfe_fir\decifactor\()_vfp, export=1
- fmrx OLDFPSCR, FPSCR @ remember the entry FPSCR
- ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
- fmxr FPSCR, ip
- vldr IN0, [PIN, #-0*4] @ newest sample first, then going backwards
- vldr IN1, [PIN, #-1*4]
- vldr IN2, [PIN, #-2*4]
- vldr IN3, [PIN, #-3*4]
- .if \decifactor == 32
- .set JMAX, 8 @ 8 taps per output
- vpush {s16-s31}
- vldr IN4, [PIN, #-4*4]
- vldr IN5, [PIN, #-5*4]
- vldr IN6, [PIN, #-6*4]
- vldr IN7, [PIN, #-7*4]
- .else
- .set JMAX, 4 @ 4 taps per output
- vpush {s16-s27}
- .endif
-
- mov COUNTER, #\decifactor/4 - 1 @ loop body runs this many times; prologue/epilogue add one more
- inner_loop \decifactor, up,, head @ pipeline prologue: head only
- 1: add PCOEF, PCOEF, #4*JMAX*4 @ next coefficient block
- subs COUNTER, COUNTER, #1 @ flags are consumed by the bne below
- inner_loop \decifactor, up, tail, head
- bne 1b
- inner_loop \decifactor, up, tail @ pipeline epilogue: tail only
-
- mov COUNTER, #\decifactor/4 - 1
- inner_loop \decifactor, down,, head @ second pass reuses the last block, reversed
- 1: sub PCOEF, PCOEF, #4*JMAX*4 @ previous coefficient block
- subs COUNTER, COUNTER, #1
- inner_loop \decifactor, down, tail, head
- bne 1b
- inner_loop \decifactor, down, tail
-
- .if \decifactor == 32
- vpop {s16-s31}
- .else
- vpop {s16-s27}
- .endif
- fmxr FPSCR, OLDFPSCR @ back to the entry FPSCR (scalar mode for the caller)
- bx lr
- endfunc
- .endm
- @ Instantiate both variants: ff_dca_lfe_fir64_vfp and ff_dca_lfe_fir32_vfp.
- dca_lfe_fir 64
- .ltorg @ emit the pending literal pool (the ldr ip, =... constant) here
- dca_lfe_fir 32
- @ Release the LFE FIR register aliases; some names (e.g. OLDFPSCR) are redefined below.
- .unreq POUT
- .unreq PIN
- .unreq PCOEF
- .unreq OLDFPSCR
- .unreq COUNTER
-
- .unreq IN0
- .unreq IN1
- .unreq IN2
- .unreq IN3
- .unreq IN4
- .unreq IN5
- .unreq IN6
- .unreq IN7
- .unreq COEF0
- .unreq COEF1
- .unreq COEF2
- .unreq COEF3
- .unreq COEF4
- .unreq COEF5
- .unreq COEF6
- .unreq COEF7
- .unreq ACCUM0
- .unreq ACCUM4
- .unreq POST0
- .unreq POST1
- .unreq POST2
- .unreq POST3
-
- @ Register aliases for ff_dca_qmf_32_subbands_vfp.
- IN .req a1 @ samples_in; advanced as source rows are consumed
- SBACT .req a2 @ sb_act
- OLDFPSCR .req a3 @ FPSCR value on entry (overwrites the incoming synth argument)
- IMDCT .req a4 @ imdct on entry; alias is not referenced by name, the value is reloaded from the stack
- WINDOW .req v1 @ window argument, loaded from the caller stack
- OUT .req v2 @ samples_out argument, advanced by 32 floats per synth filter call
- BUF .req v3 @ cursor into the stack-allocated transpose buffer
- SCALEINT .req v4 @ only used in softfp case
- COUNT .req v5 @ packed counter in the transpose phase, then call counter
-
- SCALE .req s0 @ scale argument in the hardfp case
-
- /* Stack layout differs in softfp and hardfp cases:
- *
- * hardfp
- * fp -> 6 arg words saved by caller
- * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
- * s16-s23 on entry
- * align 16
- * buf -> 8*32*4 bytes buffer
- * s0 on entry
- * sp -> 3 arg words for callee
- *
- * softfp
- * fp -> 7 arg words saved by caller
- * a4,v1-v5,fp,lr on entry
- * s16-s23 on entry
- * align 16
- * buf -> 8*32*4 bytes buffer
- * sp -> 4 arg words for callee
- */
-
- /* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
- * SynthFilterContext *synth, FFTContext *imdct,
- * float (*synth_buf_ptr)[512],
- * int *synth_buf_offset, float (*synth_buf2)[32],
- * const float (*window)[512], float *samples_out,
- * float (*raXin)[32], float scale);
- *
- * Phase 1 (labels 1-5): transpose the first sb_act rows of samples_in
- * (8 floats each) into a 16-byte aligned 8 x 32 float buffer allocated on
- * the stack, so that buffer row j holds element j of every source row.
- * Within every group of four source rows, rows 0 and 3 are negated. The
- * columns beyond sb_act are filled with 0.0. This phase runs with VFP
- * short vectors of length 4, stride 2 so that one vneg.f negates four
- * alternate registers.
- *
- * Phase 2 (label 7): with the entry FPSCR restored, call
- * ff_synth_filter_float_vfp 8 times, once per buffer row, advancing the
- * output pointer by 32 floats per call.
- *
- * The synth argument (a3) is not used. Lines prefixed VFP / NOVFP are the
- * hardfp / softfp variants described in the stack layout comment; A / T
- * presumably select the ARM / Thumb forms (the prefix macros come from
- * libavutil/arm/asm.S, not visible here).
- */
- function ff_dca_qmf_32_subbands_vfp, export=1
- VFP push {a3-a4,v1-v3,v5,fp,lr}
- NOVFP push {a4,v1-v5,fp,lr}
- add fp, sp, #8*4 @ fp -> first stacked argument from the caller
- vpush {s16-s23}
- @ The buffer pointed at by raXin isn't big enough for us to do a
- @ complete matrix transposition as we want to, so allocate an
- @ alternative buffer from the stack. Align to 4 words for speed.
- sub BUF, sp, #8*32*4
- bic BUF, BUF, #15
- mov sp, BUF
- ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
- fmrx OLDFPSCR, FPSCR
- fmxr FPSCR, lr
- @ COUNT is used to count down 2 things at once:
- @ bits 0-4 are the number of word pairs remaining in the output row
- @ bits 5-31 are the number of words to copy (with possible negation)
- @ from the source matrix before we start zeroing the remainder
- mov COUNT, #(-4 << 5) + 16 @ 16 word pairs per row; upper field biased by -4
- adds COUNT, COUNT, SBACT, lsl #5 @ upper field = sb_act - 4
- bmi 2f @ fewer than 4 source rows in total
- 1:
- vldr s8, [IN, #(0*8+0)*4] @ source row 0 -> even registers s8..s22
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8 @ vector operation: negate row 0, elements 0-3
- vldr s9, [IN, #(1*8+0)*4] @ source row 1 -> odd registers s9..s23
- vldr s11, [IN, #(1*8+1)*4]
- vldr s13, [IN, #(1*8+2)*4]
- vldr s15, [IN, #(1*8+3)*4]
- vneg.f s16, s16 @ vector operation: negate row 0, elements 4-7
- vldr s17, [IN, #(1*8+4)*4]
- vldr s19, [IN, #(1*8+5)*4]
- vldr s21, [IN, #(1*8+6)*4]
- vldr s23, [IN, #(1*8+7)*4]
- vstr d4, [BUF, #(0*32+0)*4] @ each d register is a (row 0, row 1) pair for one output row
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- vldr s9, [IN, #(3*8+0)*4] @ source row 3 -> odd registers
- vldr s11, [IN, #(3*8+1)*4]
- vldr s13, [IN, #(3*8+2)*4]
- vldr s15, [IN, #(3*8+3)*4]
- vldr s17, [IN, #(3*8+4)*4]
- vldr s19, [IN, #(3*8+5)*4]
- vldr s21, [IN, #(3*8+6)*4]
- vldr s23, [IN, #(3*8+7)*4]
- vneg.f s9, s9 @ vector operation: negate row 3, elements 0-3
- vldr s8, [IN, #(2*8+0)*4] @ source row 2 -> even registers (not negated)
- vldr s10, [IN, #(2*8+1)*4]
- vldr s12, [IN, #(2*8+2)*4]
- vldr s14, [IN, #(2*8+3)*4]
- vneg.f s17, s17 @ vector operation: negate row 3, elements 4-7
- vldr s16, [IN, #(2*8+4)*4]
- vldr s18, [IN, #(2*8+5)*4]
- vldr s20, [IN, #(2*8+6)*4]
- vldr s22, [IN, #(2*8+7)*4]
- vstr d4, [BUF, #(0*32+2)*4] @ (row 2, row 3) pairs go two columns further on
- vstr d5, [BUF, #(1*32+2)*4]
- vstr d6, [BUF, #(2*32+2)*4]
- vstr d7, [BUF, #(3*32+2)*4]
- vstr d8, [BUF, #(4*32+2)*4]
- vstr d9, [BUF, #(5*32+2)*4]
- vstr d10, [BUF, #(6*32+2)*4]
- vstr d11, [BUF, #(7*32+2)*4]
- add IN, IN, #4*8*4 @ four source rows consumed
- add BUF, BUF, #4*4 @ four output columns written
- subs COUNT, COUNT, #(4 << 5) + 2 @ 4 rows and 2 word pairs done
- bpl 1b @ loop while at least 4 more rows remain
- 2: @ Now deal with trailing < 4 samples
- adds COUNT, COUNT, #3 << 5 @ upper field = (rows left) - 1
- bmi 4f @ sb_act was a multiple of 4
- bics lr, COUNT, #0x1F @ lr = upper field only; zero means exactly 1 row left
- bne 3f
- @ sb_act was n*4+1
- vldr s8, [IN, #(0*8+0)*4]
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8 @ vector operation: negate row 0, elements 0-3
- vldr s9, zero @ no second row: pad the odd half of each pair with 0.0
- vldr s11, zero
- vldr s13, zero
- vldr s15, zero
- vneg.f s16, s16 @ vector operation: negate row 0, elements 4-7
- vldr s17, zero
- vldr s19, zero
- vldr s21, zero
- vldr s23, zero
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #1 @ one word pair written
- b 4f
- 3: @ sb_act was n*4+2 or n*4+3, so do the first 2
- vldr s8, [IN, #(0*8+0)*4]
- vldr s10, [IN, #(0*8+1)*4]
- vldr s12, [IN, #(0*8+2)*4]
- vldr s14, [IN, #(0*8+3)*4]
- vldr s16, [IN, #(0*8+4)*4]
- vldr s18, [IN, #(0*8+5)*4]
- vldr s20, [IN, #(0*8+6)*4]
- vldr s22, [IN, #(0*8+7)*4]
- vneg.f s8, s8 @ vector operation: negate row 0, elements 0-3
- vldr s9, [IN, #(1*8+0)*4]
- vldr s11, [IN, #(1*8+1)*4]
- vldr s13, [IN, #(1*8+2)*4]
- vldr s15, [IN, #(1*8+3)*4]
- vneg.f s16, s16 @ vector operation: negate row 0, elements 4-7
- vldr s17, [IN, #(1*8+4)*4]
- vldr s19, [IN, #(1*8+5)*4]
- vldr s21, [IN, #(1*8+6)*4]
- vldr s23, [IN, #(1*8+7)*4]
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #(2 << 5) + 1 @ 2 rows and 1 word pair done
- bics lr, COUNT, #0x1F @ zero upper field means one more row is left
- bne 4f
- @ sb_act was n*4+3
- vldr s8, [IN, #(2*8+0)*4] @ source row 2 is stored without negation
- vldr s10, [IN, #(2*8+1)*4]
- vldr s12, [IN, #(2*8+2)*4]
- vldr s14, [IN, #(2*8+3)*4]
- vldr s16, [IN, #(2*8+4)*4]
- vldr s18, [IN, #(2*8+5)*4]
- vldr s20, [IN, #(2*8+6)*4]
- vldr s22, [IN, #(2*8+7)*4]
- vldr s9, zero @ pad the odd half of each pair with 0.0
- vldr s11, zero
- vldr s13, zero
- vldr s15, zero
- vldr s17, zero
- vldr s19, zero
- vldr s21, zero
- vldr s23, zero
- vstr d4, [BUF, #(0*32+0)*4]
- vstr d5, [BUF, #(1*32+0)*4]
- vstr d6, [BUF, #(2*32+0)*4]
- vstr d7, [BUF, #(3*32+0)*4]
- vstr d8, [BUF, #(4*32+0)*4]
- vstr d9, [BUF, #(5*32+0)*4]
- vstr d10, [BUF, #(6*32+0)*4]
- vstr d11, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- sub COUNT, COUNT, #1 @ one word pair written
- 4: @ Now fill the remainder with 0
- vldr s8, zero
- vldr s9, zero
- ands COUNT, COUNT, #0x1F @ keep only the word pairs still to be written
- beq 6f @ row already full
- 5: vstr d4, [BUF, #(0*32+0)*4] @ d4 = (0.0, 0.0), written to all 8 output rows
- vstr d4, [BUF, #(1*32+0)*4]
- vstr d4, [BUF, #(2*32+0)*4]
- vstr d4, [BUF, #(3*32+0)*4]
- vstr d4, [BUF, #(4*32+0)*4]
- vstr d4, [BUF, #(5*32+0)*4]
- vstr d4, [BUF, #(6*32+0)*4]
- vstr d4, [BUF, #(7*32+0)*4]
- add BUF, BUF, #2*4
- subs COUNT, COUNT, #1
- bne 5b
- 6:
- fmxr FPSCR, OLDFPSCR @ restore the entry FPSCR before calling out
- ldr WINDOW, [fp, #3*4] @ window (stacked argument word 3)
- ldr OUT, [fp, #4*4] @ samples_out (stacked argument word 4)
- sub BUF, BUF, #32*4 @ rewind to the start of the transpose buffer
- NOVFP ldr SCALEINT, [fp, #6*4]
- mov COUNT, #8 @ one synth filter call per buffer row
- VFP vpush {SCALE}
- VFP sub sp, sp, #3*4
- NOVFP sub sp, sp, #4*4
- 7:
- VFP ldr a1, [fp, #-7*4] @ imdct
- NOVFP ldr a1, [fp, #-8*4]
- ldmia fp, {a2-a4} @ synth_buf_ptr, synth_buf_offset, synth_buf2
- VFP stmia sp, {WINDOW, OUT, BUF}
- NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
- VFP vldr SCALE, [sp, #3*4] @ reload scale from the copy pushed above; presumably the callee may clobber s0
- bl X(ff_synth_filter_float_vfp)
- add OUT, OUT, #32*4 @ next 32 output floats
- add BUF, BUF, #32*4 @ next buffer row
- subs COUNT, COUNT, #1
- bne 7b
-
- A sub sp, fp, #(8+8)*4 @ sp -> saved s16-s23 (8 core words + 8 VFP words below fp)
- T sub fp, fp, #(8+8)*4
- T mov sp, fp
- vpop {s16-s23}
- VFP pop {a3-a4,v1-v3,v5,fp,pc}
- NOVFP pop {a4,v1-v5,fp,pc}
- endfunc
- @ Release the register aliases used by ff_dca_qmf_32_subbands_vfp.
- .unreq IN
- .unreq SBACT
- .unreq OLDFPSCR
- .unreq IMDCT
- .unreq WINDOW
- .unreq OUT
- .unreq BUF
- .unreq SCALEINT
- .unreq COUNT
-
- .unreq SCALE
- @ Constant 0.0 (all-zero bits), loaded by the "vldr sN, zero" instructions above.
- .align 2 @ word-align the constant
- zero: .word 0
|