|
- /*
- * Copyright (c) 2013 RISC OS Open Ltd
- * Author: Ben Avison <bavison@riscosopen.org>
- *
- * This file is part of Libav.
- *
- * Libav is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * Libav is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with Libav; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
- #include "libavutil/arm/asm.S"
-
- CONTEXT .req a1 /* 1st argument: FFTContext *s */
- ORIGOUT .req a2 /* 2nd argument: output; copied to OUT on entry because a2 is reused as J0 */
- IN .req a3 /* 3rd argument: input */
- OUT .req v1 /* working copy of the output pointer */
- REVTAB .req v2 /* table pointer loaded from [CONTEXT, #2*4] */
- TCOS .req v3 /* table pointer loaded from [CONTEXT, #6*4] */
- TSIN .req v4 /* table pointer loaded from [CONTEXT, #7*4] */
- OLDFPSCR .req v5 /* FPSCR as read on entry; written back before the FFT call and before returning */
- J0 .req a2 /* J0-J3: four store addresses built from REVTAB entries during the pre-rotation */
- J1 .req a4
- J2 .req ip
- J3 .req lr /* lr is usable as scratch: the function pushes it and returns via pop {..., pc} */
-
- .macro prerotation_innerloop /*
- * One unrolled step of the pre-rotation. Takes no arguments; it is driven
- * by the assembly-time symbols k and n4, which must be set before the
- * first expansion. k is advanced by 2 at the end of every expansion.
- *
- * Each expansion handles four table positions: k, k+1, n4-k-2 and n4-k-1.
- * For one position, let c = its TCOS element, s = its TSIN element,
- * a = the IN element read at an odd index (s0-s3) and b = the IN element
- * read at an even index (s4-s7). The step stores the two floats
- * a*c - b*s and a*s + b*c at OUT + 8*j and OUT + 8*j + 4, where j is the
- * REVTAB entry for that position.
- *
- * The instructions marked "vector operation" rely on FPSCR having been
- * set to short vectors of length 4, so one instruction works on four
- * consecutive single-precision registers (for example s8-s11 from s0-s3
- * and s16-s19).
- * NOTE(review): the exact register-bank rules of VFP short-vector mode
- * are not visible here; confirm against the ARM architecture manual.
- *
- * Clobbers J0-J3, s0-s15 and d8-d11 (s16-s23). Integer and VFP
- * instructions are interleaved; keep the order when editing.
- */
- .set trig_lo, k /* element index into TCOS/TSIN for the low pair, rising with k */
- .set trig_hi, n4 - k - 2 /* element index for the mirrored high pair, falling as k rises */
- .set in_lo, trig_lo * 2 /* matching element indices into IN (two floats per position) */
- .set in_hi, trig_hi * 2
- vldr d8, [TCOS, #trig_lo*4] @ s16,s17
- vldr d9, [TCOS, #trig_hi*4] @ s18,s19
- vldr s0, [IN, #in_hi*4 + 12] /* s0-s3: odd-indexed inputs, highest address first */
- vldr s1, [IN, #in_hi*4 + 4]
- vldr s2, [IN, #in_lo*4 + 12]
- vldr s3, [IN, #in_lo*4 + 4]
- vmul.f s8, s0, s16 @ vector operation
- vldr d10, [TSIN, #trig_lo*4] @ s20,s21
- vldr d11, [TSIN, #trig_hi*4] @ s22,s23
- vldr s4, [IN, #in_lo*4] /* s4-s7: even-indexed inputs, lowest address first */
- vldr s5, [IN, #in_lo*4 + 8]
- vldr s6, [IN, #in_hi*4]
- vldr s7, [IN, #in_hi*4 + 8]
- ldr J0, [REVTAB, #trig_lo*2] /* one word holds two 16-bit entries; low half taken for k, high half for k+1 (assumes little-endian order - TODO confirm) */
- vmul.f s12, s0, s20 @ vector operation
- ldr J2, [REVTAB, #trig_hi*2] /* same for the mirrored pair of positions */
- mov J1, J0, lsr #16 /* J1 = upper halfword of the loaded word */
- and J0, J0, #255 @ halfword value will be < n4
- vmls.f s8, s4, s20 @ vector operation
- mov J3, J2, lsr #16 /* J3 = upper halfword of the loaded word */
- and J2, J2, #255 @ halfword value will be < n4
- add J0, OUT, J0, lsl #3 /* turn each index into an address: 8 bytes per output pair */
- vmla.f s12, s4, s16 @ vector operation
- add J1, OUT, J1, lsl #3
- add J2, OUT, J2, lsl #3
- add J3, OUT, J3, lsl #3
- vstr s8, [J0] /* first float of each pair: s8-s11 */
- vstr s9, [J1]
- vstr s10, [J2]
- vstr s11, [J3]
- vstr s12, [J0, #4] /* second float of each pair: s12-s15 */
- vstr s13, [J1, #4]
- vstr s14, [J2, #4]
- vstr s15, [J3, #4]
- .set k, k + 2 /* two positions consumed at each end per expansion */
- .endm
-
- .macro postrotation_innerloop tail, head /*
- * One stage of the software-pipelined post-rotation, which works in place
- * on OUT.
- *
- * Arguments (each is only tested for being empty or non-empty):
- *   tail - if non-empty, emit the second half of the step that the
- *          previous expansion started (index k - 2): finish the two
- *          multiply-accumulates and store the eight results.
- *   head - if non-empty, emit the first half of the step for the current
- *          k: load TSIN, TCOS and eight floats from OUT, start the two
- *          multiplies, then advance k by 2.
- * Head and tail instructions are interleaved, so the loads of the new step
- * sit between the arithmetic and the stores of the previous one.
- *
- * Per table position, with re/im = the even/odd-indexed floats read from
- * OUT, s = the TSIN element and c = the TCOS element, the tail stores
- * im*s - re*c and re*s + im*c. The first value goes back to the even slot
- * it was read from; the second goes to the odd slot of the mirrored
- * position (note that hi and lo are swapped in the second group of
- * stores).
- *
- * TCOS is loaded into d10/d11 or d12/d13 depending on bit 1 of k, so the
- * cosines a tail still needs (starting at s24 or s20) are not overwritten
- * by the head that is emitted alongside it.
- *
- * Requires the assembly-time symbols k and n8, and FPSCR set up for short
- * vectors of length 4 (see the "vector operation" markers).
- * Clobbers s0-s15, d8, d9 and d10-d13.
- */
- .set trig_lo_head, n8 - k - 2 /* head indices: low side walks down from the middle of the tables */
- .set trig_hi_head, n8 + k /* high side walks up from the middle */
- .set out_lo_head, trig_lo_head * 2 /* element indices into OUT (two floats per position) */
- .set out_hi_head, trig_hi_head * 2
- .set trig_lo_tail, n8 - (k - 2) - 2 /* tail indices: same formulas evaluated for the previous step, k - 2 */
- .set trig_hi_tail, n8 + (k - 2)
- .set out_lo_tail, trig_lo_tail * 2
- .set out_hi_tail, trig_hi_tail * 2
- .if (k & 2) == 0 /* alternate the TCOS register set from one step to the next */
- TCOS_D0_HEAD .req d10 @ s20,s21
- TCOS_D1_HEAD .req d11 @ s22,s23
- TCOS_S0_TAIL .req s24
- .else
- TCOS_D0_HEAD .req d12 @ s24,s25
- TCOS_D1_HEAD .req d13 @ s26,s27
- TCOS_S0_TAIL .req s20
- .endif
- .ifnc "\tail",""
- vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
- .endif
- .ifnc "\head",""
- vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
- vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
- vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
- .endif
- .ifnc "\tail",""
- vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
- .endif
- .ifnc "\head",""
- vldr s0, [OUT, #out_lo_head*4] /* s0-s3: even-indexed floats of the four positions */
- vldr s1, [OUT, #out_lo_head*4 + 8]
- vldr s2, [OUT, #out_hi_head*4]
- vldr s3, [OUT, #out_hi_head*4 + 8]
- vldr s4, [OUT, #out_lo_head*4 + 4] /* s4-s7: odd-indexed floats of the same positions */
- vldr s5, [OUT, #out_lo_head*4 + 12]
- vldr s6, [OUT, #out_hi_head*4 + 4]
- vldr s7, [OUT, #out_hi_head*4 + 12]
- .endif
- .ifnc "\tail",""
- vstr s8, [OUT, #out_lo_tail*4] /* first results return to the even slots they were read from */
- vstr s9, [OUT, #out_lo_tail*4 + 8]
- vstr s10, [OUT, #out_hi_tail*4]
- vstr s11, [OUT, #out_hi_tail*4 + 8]
- .endif
- .ifnc "\head",""
- vmul.f s8, s4, s16 @ vector operation
- .endif
- .ifnc "\tail",""
- vstr s12, [OUT, #out_hi_tail*4 + 12] /* second results go to the odd slots in mirrored order */
- vstr s13, [OUT, #out_hi_tail*4 + 4]
- vstr s14, [OUT, #out_lo_tail*4 + 12]
- vstr s15, [OUT, #out_lo_tail*4 + 4]
- .endif
- .ifnc "\head",""
- vmul.f s12, s0, s16 @ vector operation
- vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
- .endif
- .unreq TCOS_D0_HEAD
- .unreq TCOS_D1_HEAD
- .unreq TCOS_S0_TAIL
- .ifnc "\head",""
- .set k, k + 2 /* k only advances when a new step was started */
- .endif
- .endm
-
-
- /* void ff_imdct_half_vfp(FFTContext *s,
- * FFTSample *output,
- * const FFTSample *input)
- *
- * VFP version of the half IMDCT for the single case where the mdct_bits
- * field read from the context equals 6. For any other value the function
- * branches (not calls) to ff_imdct_half_c with a1-a3 still untouched, so
- * the C version returns directly to the original caller.
- *
- * Stages:
- *   1. Pre-rotation: n8/2 expansions of prerotation_innerloop read input
- *      and the TCOS/TSIN tables and scatter results into output at
- *      positions taken from REVTAB.
- *   2. ff_fft16_vfp is called on output.
- *   3. Post-rotation, in place on output: one head-only expansion of
- *      postrotation_innerloop, n8/2 - 1 expansions with both tail and
- *      head, and a final tail-only expansion.
- *
- * FPSCR is set to 0x03030000 for stages 1 and 3. The value read on entry
- * is written back before the call in stage 2 and before returning.
- * OUT, TCOS, TSIN and OLDFPSCR live in v1-v5 and are relied on to survive
- * the call to ff_fft16_vfp.
- *
- * NOTE(review): the context offsets 2*4, 5*4, 6*4 and 7*4 are hard-coded
- * and must match the FFTContext layout, which is not visible in this
- * file - confirm against the structure definition.
- */
- function ff_imdct_half_vfp, export=1
- ldr ip, [CONTEXT, #5*4] @ mdct_bits
- teq ip, #6 /* 6 corresponds to n = 1<<6 assumed by the unrolled code below */
- it ne
- bne X(ff_imdct_half_c) @ only case currently accelerated is the one used by DCA
-
- .set n, 1<<6
- .set n2, n/2 /* not referenced by the code visible in this file */
- .set n4, n/4 /* used by prerotation_innerloop */
- .set n8, n/8 /* used by postrotation_innerloop and the repeat counts */
-
- push {v1-v5,lr} /* v1-v5 carry the register aliases; lr doubles as J3 and scratch */
- vpush {s16-s27} /* s16-s27 are overwritten by the rotation macros */
- fmrx OLDFPSCR, FPSCR
- ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
- fmxr FPSCR, lr
- mov OUT, ORIGOUT /* a2 is about to be reused as J0 */
- ldr REVTAB, [CONTEXT, #2*4]
- ldr TCOS, [CONTEXT, #6*4]
- ldr TSIN, [CONTEXT, #7*4]
-
- .set k, 0 /* start the assembly-time index for the pre-rotation */
- .rept n8/2
- prerotation_innerloop
- .endr
-
- fmxr FPSCR, OLDFPSCR /* put back the FPSCR that was in force on entry before calling out */
- mov a1, OUT /* single argument for ff_fft16_vfp: the buffer just filled */
- bl X(ff_fft16_vfp)
- ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
- fmxr FPSCR, lr
-
- .set k, 0 /* restart the assembly-time index for the post-rotation */
- postrotation_innerloop , head
- .rept n8/2 - 1
- postrotation_innerloop tail, head
- .endr
- postrotation_innerloop tail
-
- fmxr FPSCR, OLDFPSCR /* restore the FPSCR value read on entry */
- vpop {s16-s27}
- pop {v1-v5,pc} /* return by popping the saved lr into pc */
- endfunc
-
- .unreq CONTEXT /* release the register aliases created with .req at the top of this file */
- .unreq ORIGOUT
- .unreq IN
- .unreq OUT
- .unreq REVTAB
- .unreq TCOS
- .unreq TSIN
- .unreq OLDFPSCR
- .unreq J0
- .unreq J1
- .unreq J2
- .unreq J3
|