You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

221 lines
7.0KB

  1. /*
  2. * Copyright (c) 2013 RISC OS Open Ltd
  3. * Author: Ben Avison <bavison@riscosopen.org>
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "libavutil/arm/asm.S"
  /*
   * Register aliases used throughout this file.
   *
   * a1-a3 hold the three pointer arguments in prototype order
   * (out, in, coefs). a4 arrives holding decifactor and is reused for the
   * saved FPSCR once the entry point has tested it. ip is the loop counter.
   */
  POUT        .req    a1      @ output pointer, post-incremented by each vstmia
  PIN         .req    a2      @ input pointer, read at float offsets 0 down to -7
  PCOEF       .req    a3      @ coefficient pointer, moved by 4*JMAX*4 bytes per loop iteration
  DECIFACTOR  .req    a4      @ decifactor argument, only compared against 32
  OLDFPSCR    .req    a4      @ FPSCR value on entry, written back before returning (same register as DECIFACTOR)
  COUNTER     .req    ip      @ loop iterations remaining in the current pass

  /* Scale factor: the register holding it depends on the variant. */
  SCALE32     .req    s28     @ decifactor=32 / JMAX=8: no scalar register is free for a 9th scalar, so a vector of 4 is used
  SCALE64     .req    s0      @ decifactor=64 / JMAX=4: s0 is a spare register in the scalar bank

  /* Input samples; IN4-IN7 are only loaded in the decifactor=32 variant. */
  IN0         .req    s4
  IN1         .req    s5
  IN2         .req    s6
  IN3         .req    s7
  IN4         .req    s0      @ same register as SCALE64
  IN5         .req    s1
  IN6         .req    s2
  IN7         .req    s3

  /* Coefficient elements; used as two 4-element vectors based at COEF0 and COEF4. */
  COEF0       .req    s8
  COEF1       .req    s9
  COEF2       .req    s10
  COEF3       .req    s11
  COEF4       .req    s12
  COEF5       .req    s13
  COEF6       .req    s14
  COEF7       .req    s15

  /* Double-buffered multiply-accumulate results; each name is the first register of a vector. */
  ACCUM0      .req    s16
  ACCUM4      .req    s20

  /* The long-latency post-multiply (sum * scale) runs in this vector in parallel. */
  POST0       .req    s24
  POST1       .req    s25
  POST2       .req    s26
  POST3       .req    s27
  /*
   * lfe_coef_ld  reg, row, col
   *
   * Load one coefficient into reg from byte offset X + (row*JMAX + col) * Y
   * relative to PCOEF. X, Y and JMAX are assembler symbols that must be
   * set before this macro is expanded.
   */
  .macro lfe_coef_ld reg, row, col
          vldr    \reg, [PCOEF, #X + (\row*JMAX + \col) * Y]
  .endm

  /*
   * lfe_coef_ld4  ra, rb, rc, rd, col
   *
   * Load column col of rows 0-3 into ra-rd, in that order.
   */
  .macro lfe_coef_ld4 ra, rb, rc, rd, col
          lfe_coef_ld \ra, 0, \col
          lfe_coef_ld \rb, 1, \col
          lfe_coef_ld \rc, 2, \col
          lfe_coef_ld \rd, 3, \col
  .endm

  /*
   * inner_loop  decifactor, dir, tail, head
   *
   * One step of the filter loop, handling a group of four outputs.
   *
   *   decifactor  selects SCALE32 or SCALE64 for the post-multiply; the
   *               value 32 additionally enables taps 4-7 (IN4-IN7)
   *   dir         "up": coefficients are read forwards starting at PCOEF;
   *               anything else: they are read backwards starting at the
   *               last float of the 4*JMAX-float block at PCOEF
   *   tail        non-empty: finish a group - add ACCUM0 and ACCUM4,
   *               multiply by the scale and store POST0-POST3 through POUT
   *   head        non-empty: start a group - load coefficients and
   *               multiply-accumulate them with IN0.. into ACCUM0/ACCUM4
   *
   * When both tail and head are given their instructions are interleaved.
   * NOTE(review): the interleaving looks tuned for VFP pipeline latency;
   * confirm before reordering anything here.
   */
  .macro inner_loop decifactor, dir, tail, head
    .ifc "\dir","up"
      .set X, 0
      .set Y, 4
    .else
      .set X, 4*JMAX*4 - 4
      .set Y, -4
    .endif

    .ifnc "\head",""
          lfe_coef_ld4 COEF0, COEF1, COEF2, COEF3, 0
    .endif
    .ifnc "\tail",""
          vadd.f  POST0, ACCUM0, ACCUM4           @ vector: sum of the two accumulators
    .endif
    .ifnc "\head",""
          vmul.f  ACCUM0, COEF0, IN0              @ vector = vector * scalar
          lfe_coef_ld COEF4, 0, 1
          lfe_coef_ld COEF5, 1, 1
          lfe_coef_ld COEF6, 2, 1
    .endif
    .ifnc "\tail",""
          vmul.f  POST0, POST0, SCALE\decifactor  @ vector operation (SCALE may be scalar)
    .endif
    .ifnc "\head",""
          lfe_coef_ld COEF7, 3, 1
      .ifc "\tail",""
          @ No tail in flight: issue the tap-1 multiply straight away.
          vmul.f  ACCUM4, COEF4, IN1              @ vector = vector * scalar
          lfe_coef_ld COEF0, 0, 2
          lfe_coef_ld COEF1, 1, 2
      .else
          @ With a tail: the same multiply is issued two loads later.
          lfe_coef_ld COEF0, 0, 2
          lfe_coef_ld COEF1, 1, 2
          vmul.f  ACCUM4, COEF4, IN1              @ vector = vector * scalar
      .endif
          lfe_coef_ld COEF2, 2, 2
          lfe_coef_ld COEF3, 3, 2
    .endif
    .ifnc "\tail",""
          vstmia  POUT!, {POST0-POST3}            @ store four results, advance POUT
    .endif
    .ifnc "\head",""
          vmla.f  ACCUM0, COEF0, IN2              @ vector += vector * scalar
          lfe_coef_ld4 COEF4, COEF5, COEF6, COEF7, 3
          vmla.f  ACCUM4, COEF4, IN3              @ vector += vector * scalar
      .if \decifactor == 32
          @ Taps 4-7 exist only in the decifactor=32 variant.
          lfe_coef_ld4 COEF0, COEF1, COEF2, COEF3, 4
          vmla.f  ACCUM0, COEF0, IN4              @ vector += vector * scalar
          lfe_coef_ld4 COEF4, COEF5, COEF6, COEF7, 5
          vmla.f  ACCUM4, COEF4, IN5              @ vector += vector * scalar
          lfe_coef_ld4 COEF0, COEF1, COEF2, COEF3, 6
          vmla.f  ACCUM0, COEF0, IN6              @ vector += vector * scalar
          lfe_coef_ld4 COEF4, COEF5, COEF6, COEF7, 7
          vmla.f  ACCUM4, COEF4, IN7              @ vector += vector * scalar
      .endif
    .endif
  .endm
  /*
   * lfe_fir_pass  decifactor, dir, step
   *
   * One complete pass of decifactor/4 groups of four outputs.
   *
   *   dir   direction passed on to inner_loop ("up" or "down")
   *   step  instruction used to move PCOEF between groups (add or sub)
   *
   * The first group is started by a head-only step, each loop iteration
   * finishes one group while starting the next, and a tail-only step
   * finishes the last group. The flags set by subs are still valid at the
   * bne because the inner_loop expansion in between contains only VFP
   * loads, stores and arithmetic.
   */
  .macro lfe_fir_pass decifactor, dir, step
          mov     COUNTER, #\decifactor/4 - 1
          inner_loop \decifactor, \dir,, head
  1:      \step   PCOEF, PCOEF, #4*JMAX*4         @ move to the adjacent 4*JMAX-float block
          subs    COUNTER, COUNTER, #1
          inner_loop \decifactor, \dir, tail, head
          bne     1b
          inner_loop \decifactor, \dir, tail
  .endm

  /*
   * dca_lfe_fir  decifactor
   *
   * Body of the filter for one decimation factor, up to and including the
   * return. It expects IN0-IN3, the scale in s0 and the caller FPSCR in
   * OLDFPSCR to have been set up already, with short-vector mode enabled.
   *
   *   decifactor == 32: JMAX = 8, taps IN4-IN7 are loaded here, scale is
   *                     kept in the SCALE32 vector
   *   otherwise:        JMAX = 4, scale stays in s0 (SCALE64)
   *
   * An "up" pass is followed by a "down" pass. PCOEF is not reset in
   * between, so the down pass starts at the block where the up pass
   * stopped and walks back from there, while POUT keeps advancing.
   */
  .macro dca_lfe_fir decifactor
    .if \decifactor == 32
      .set JMAX, 8
          vpush   {s16-s31}
          vmov    SCALE32, s0                     @ duplicate scalar across vector; must come before IN4 overwrites s0
          vldr    IN4, [PIN, #-4*4]
          vldr    IN5, [PIN, #-5*4]
          vldr    IN6, [PIN, #-6*4]
          vldr    IN7, [PIN, #-7*4]
    .else
      .set JMAX, 4
          vpush   {s16-s27}                       @ this variant never writes s28-s31
    .endif

          lfe_fir_pass \decifactor, up, add
          lfe_fir_pass \decifactor, down, sub

    .if \decifactor == 32
          vpop    {s16-s31}
    .else
          vpop    {s16-s27}
    .endif
          fmxr    FPSCR, OLDFPSCR                 @ put back the FPSCR value read on entry
          bx      lr
  .endm
  /* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
   *                         int decifactor, float scale)
   *
   * Entry point. It dispatches to the decifactor=32 body when decifactor
   * equals 32 and to the decifactor=64 body for every other value.
   *
   * Each body stores decifactor floats per pass through out, in two passes.
   * It reads in[0] down to in[-3], plus in[-4] down to in[-7] for
   * decifactor=32.
   *
   * FPSCR is read on entry, replaced with 0x03030000 for the duration of
   * the call and written back by the selected body just before it returns.
   */
  function ff_dca_lfe_fir_vfp, export=1
          teq     DECIFACTOR, #32         @ result is consumed by the beq below
          fmrx    OLDFPSCR, FPSCR         @ overwrites a4; decifactor is not read again
          ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
          fmxr    FPSCR, ip
          @ NOTE(review): NOVFP is defined in asm.S; presumably it emits this
          @ load only when scale is passed on the stack rather than in s0 - confirm.
  NOVFP   vldr    s0, [sp]
    .irp i, 0, 1, 2, 3
          vldr    IN\i, [PIN, #-\i*4]
    .endr
          beq     32f
  64:     dca_lfe_fir 64
    .ltorg                                @ literal pool sits after the bx lr that ends the 64 body
  32:     dca_lfe_fir 32
  endfunc
  @ Release every register alias defined at the top of this file, in definition order.
  .irp alias, POUT, PIN, PCOEF, DECIFACTOR, OLDFPSCR, COUNTER
          .unreq  \alias
  .endr
  .irp alias, SCALE32, SCALE64
          .unreq  \alias
  .endr
  .irp alias, IN0, IN1, IN2, IN3, IN4, IN5, IN6, IN7
          .unreq  \alias
  .endr
  .irp alias, COEF0, COEF1, COEF2, COEF3, COEF4, COEF5, COEF6, COEF7
          .unreq  \alias
  .endr
  .irp alias, ACCUM0, ACCUM4
          .unreq  \alias
  .endr
  .irp alias, POST0, POST1, POST2, POST3
          .unreq  \alias
  .endr