You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

348 lines
11KB

  1. /*
  2. * Copyright (c) 2013 RISC OS Open Ltd
  3. * Author: Ben Avison <bavison@riscosopen.org>
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "libavutil/arm/asm.S"
  /* Symbolic register names used by the macros and by ff_imdct_half_vfp.
   *
   * Three pairs of names share one physical register, so the two names of a
   * pair can never hold different live values at the same time:
   *   ORIGOUT  and J0         are both a2
   *   OLDFPSCR and REVTAB_HI  are both v5
   *   IN_HI    and OUT_HI     are both v6
   */
  /* a1-a3: presumably the three arguments, in the order of the C prototype
   * written above ff_imdct_half_vfp (s, output, input). */
  22. CONTEXT .req a1
  23. ORIGOUT .req a2
  24. IN .req a3
  /* Working pointers and the saved FPSCR value, kept in v1-v5. */
  25. OUT .req v1
  26. REVTAB .req v2
  27. TCOS .req v3
  28. TSIN .req v4
  29. OLDFPSCR .req v5
  /* J0-J3: the four output addresses computed from REVTAB entries in the
   * prerotation macros. */
  30. J0 .req a2
  31. J1 .req a4
  32. J2 .req ip
  33. J3 .req lr
  /* *_HI: second, downward-moving pointers used only by the *_rolled macros. */
  34. REVTAB_HI .req v5
  35. IN_HI .req v6
  36. OUT_HI .req v6
  37. TCOS_HI .req sl
  38. TSIN_HI .req fp
  /* prerotation_innerloop: one fully unrolled pre-rotation step.
   *
   * Takes no macro arguments. It reads the assembler symbols k and n4, which
   * the expansion site must .set beforehand, and ends by advancing k by 2 so
   * that consecutive expansions address consecutive table entries.
   *
   * Reads (without modifying): IN, OUT, REVTAB, TCOS, TSIN.
   * Writes: J0-J3 and s0-s23.
   * The instructions tagged "vector operation" rely on FPSCR having been put
   * into short-vector mode by the caller, as ff_imdct_half_vfp does; each one
   * is then expected to act on four consecutive registers starting at the
   * ones named.
   */
  39. .macro prerotation_innerloop
  /* Word indices of the low and high trig pairs, and of the matching input
   * words (two input words per trig entry). */
  40. .set trig_lo, k
  41. .set trig_hi, n4 - k - 2
  42. .set in_lo, trig_lo * 2
  43. .set in_hi, trig_hi * 2
  /* s16-s19 <- TCOS pairs; s0-s3 <- the odd-indexed input words, taken from
   * the high block first and in descending address order within each block. */
  44. vldr d8, [TCOS, #trig_lo*4] @ s16,s17
  45. vldr d9, [TCOS, #trig_hi*4] @ s18,s19
  46. vldr s0, [IN, #in_hi*4 + 12]
  47. vldr s1, [IN, #in_hi*4 + 4]
  48. vldr s2, [IN, #in_lo*4 + 12]
  49. vldr s3, [IN, #in_lo*4 + 4]
  /* s8.. = s0.. * s16..  (first product of the first result vector) */
  50. vmul.f s8, s0, s16 @ vector operation
  /* s20-s23 <- TSIN pairs; s4-s7 <- the even-indexed input words, low block
   * first and in ascending address order. */
  51. vldr d10, [TSIN, #trig_lo*4] @ s20,s21
  52. vldr d11, [TSIN, #trig_hi*4] @ s22,s23
  53. vldr s4, [IN, #in_lo*4]
  54. vldr s5, [IN, #in_lo*4 + 8]
  55. vldr s6, [IN, #in_hi*4]
  56. vldr s7, [IN, #in_hi*4 + 8]
  /* Each 32-bit load from REVTAB fetches two adjacent 16-bit entries: the
   * high one is extracted with a shift (J1, J3) and the low one with a mask
   * (J0, J2). The mask keeps only 8 bits, which is why the existing comment
   * records that the entries are smaller than n4. */
  57. ldr J0, [REVTAB, #trig_lo*2]
  /* s12.. = s0.. * s20..  (first product of the second result vector) */
  58. vmul.f s12, s0, s20 @ vector operation
  59. ldr J2, [REVTAB, #trig_hi*2]
  60. mov J1, J0, lsr #16
  61. and J0, J0, #255 @ halfword value will be < n4
  /* s8.. -= s4.. * s20.. */
  62. vmls.f s8, s4, s20 @ vector operation
  63. mov J3, J2, lsr #16
  64. and J2, J2, #255 @ halfword value will be < n4
  /* Turn each REVTAB entry into an address: OUT + entry * 8, i.e. one 8-byte
   * slot holding the two words stored at [Jx] and [Jx, #4] below. */
  65. add J0, OUT, J0, lsl #3
  /* s12.. += s4.. * s16.. */
  66. vmla.f s12, s4, s16 @ vector operation
  67. add J1, OUT, J1, lsl #3
  68. add J2, OUT, J2, lsl #3
  69. add J3, OUT, J3, lsl #3
  /* Scatter the results: first word of each slot from s8-s11, second word
   * from s12-s15. */
  70. vstr s8, [J0]
  71. vstr s9, [J1]
  72. vstr s10, [J2]
  73. vstr s11, [J3]
  74. vstr s12, [J0, #4]
  75. vstr s13, [J1, #4]
  76. vstr s14, [J2, #4]
  77. vstr s15, [J3, #4]
  78. .set k, k + 2
  79. .endm
  /* prerotation_innerloop_rolled: loop-body form of the pre-rotation step.
   *
   * Same arithmetic and store pattern as prerotation_innerloop, but every
   * address comes from a run-time pointer that this macro updates instead of
   * from an assemble-time offset:
   *   TCOS, TSIN, IN, REVTAB              move upwards
   *   TCOS_HI, TSIN_HI, IN_HI, REVTAB_HI  move downwards
   * Per expansion the trig pointers move by 8 bytes, IN and IN_HI by 16
   * bytes and the REVTAB pointers by 4 bytes. OUT is read but not modified.
   * Writes J0-J3 and s0-s23. Requires FPSCR to be in short-vector mode for
   * the instructions tagged "vector operation".
   */
  80. .macro prerotation_innerloop_rolled
  /* s16,s17 from the ascending TCOS pointer, s18,s19 from the descending one
   * (vldmdb decrements the pointer before loading). */
  81. vldmia TCOS!, {s16,s17}
  82. vldmdb TCOS_HI!, {s18,s19}
  /* s0-s3: two words just below IN_HI, then two words just above IN. */
  83. vldr s0, [IN_HI, #-4]
  84. vldr s1, [IN_HI, #-12]
  85. vldr s2, [IN, #12]
  86. vldr s3, [IN, #4]
  /* s8.. = s0.. * s16.. */
  87. vmul.f s8, s0, s16 @ vector operation
  88. vldmia TSIN!, {s20,s21}
  89. vldmdb TSIN_HI!, {s22,s23}
  /* s4-s7: the remaining four words of the same two 16-byte input blocks. */
  90. vldr s4, [IN]
  91. vldr s5, [IN, #8]
  92. vldr s6, [IN_HI, #-16]
  93. vldr s7, [IN_HI, #-8]
  /* s12.. = s0.. * s20.. */
  94. vmul.f s12, s0, s20 @ vector operation
  /* Both input blocks are fully loaded; step the input pointers towards each
   * other. */
  95. add IN, IN, #16
  96. sub IN_HI, IN_HI, #16
  /* J0, J1: the next two REVTAB halfwords in ascending order. */
  97. ldrh J0, [REVTAB], #2
  98. ldrh J1, [REVTAB], #2
  /* s8.. -= s4.. * s20.. */
  99. vmls.f s8, s4, s20 @ vector operation
  /* REVTAB_HI walks downwards, so J3 is loaded first and receives the
   * higher-addressed halfword while J2 receives the lower-addressed one. */
  100. ldrh J3, [REVTAB_HI, #-2]!
  101. ldrh J2, [REVTAB_HI, #-2]!
  /* Output address = OUT + entry * 8 (one 8-byte slot per entry). */
  102. add J0, OUT, J0, lsl #3
  /* s12.. += s4.. * s16.. */
  103. vmla.f s12, s4, s16 @ vector operation
  104. add J1, OUT, J1, lsl #3
  105. add J2, OUT, J2, lsl #3
  106. add J3, OUT, J3, lsl #3
  /* Scatter the results: first word of each slot from s8-s11, second word
   * from s12-s15. */
  107. vstr s8, [J0]
  108. vstr s9, [J1]
  109. vstr s10, [J2]
  110. vstr s11, [J3]
  111. vstr s12, [J0, #4]
  112. vstr s13, [J1, #4]
  113. vstr s14, [J2, #4]
  114. vstr s15, [J3, #4]
  115. .endm
  /* postrotation_innerloop tail, head: unrolled, software-pipelined
   * post-rotation step operating in place on OUT.
   *
   * Arguments (each is tested only for being empty or non-empty):
   *   head - when non-empty, emit the first half of step k: load the trig
   *          values and eight words of OUT and issue the two multiplies.
   *   tail - when non-empty, emit the second half of step k - 2: the two
   *          multiply-accumulates and the eight stores back to OUT.
   * Passing both interleaves the tail of the previous step with the head of
   * the current one. k is advanced by 2 only when head is non-empty.
   *
   * Reads the assembler symbols k and n8, which the expansion site must .set.
   * Reads OUT, TCOS and TSIN without modifying them; writes s0-s19 plus
   * either s20-s23 or s24-s27 (see the register alternation below).
   * Requires FPSCR to be in short-vector mode for the instructions tagged
   * "vector operation".
   */
  116. .macro postrotation_innerloop tail, head
  /* Head indices start at the middle of the tables (n8) and move outwards as
   * k grows: lo goes down, hi goes up. The out_* indices are twice the trig
   * indices. */
  117. .set trig_lo_head, n8 - k - 2
  118. .set trig_hi_head, n8 + k
  119. .set out_lo_head, trig_lo_head * 2
  120. .set out_hi_head, trig_hi_head * 2
  /* Tail indices are the head indices of the previous step (k - 2). */
  121. .set trig_lo_tail, n8 - (k - 2) - 2
  122. .set trig_hi_tail, n8 + (k - 2)
  123. .set out_lo_tail, trig_lo_tail * 2
  124. .set out_hi_tail, trig_hi_tail * 2
  /* The TCOS values must survive from a head to the tail in the next
   * expansion, so successive steps alternate between s20-s23 and s24-s27:
   * the head loads one set while the tail consumes the other. */
  125. .if (k & 2) == 0
  126. TCOS_D0_HEAD .req d10 @ s20,s21
  127. TCOS_D1_HEAD .req d11 @ s22,s23
  128. TCOS_S0_TAIL .req s24
  129. .else
  130. TCOS_D0_HEAD .req d12 @ s24,s25
  131. TCOS_D1_HEAD .req d13 @ s26,s27
  132. TCOS_S0_TAIL .req s20
  133. .endif
  /* tail: s8.. -= s0.. * tcos..  This must precede the head loads further
   * down, which overwrite s0-s7. */
  134. .ifnc "\tail",""
  135. vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
  136. .endif
  /* head: TSIN pairs into s16-s19 and the low TCOS pair into the set that is
   * not in use by the tail. */
  137. .ifnc "\head",""
  138. vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
  139. vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
  140. vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
  141. .endif
  /* tail: s12.. += s4.. * tcos.. */
  142. .ifnc "\tail",""
  143. vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
  144. .endif
  /* head: s0-s3 <- the even-numbered words and s4-s7 <- the odd-numbered
   * words of the lo and hi 16-byte blocks of OUT. */
  145. .ifnc "\head",""
  146. vldr s0, [OUT, #out_lo_head*4]
  147. vldr s1, [OUT, #out_lo_head*4 + 8]
  148. vldr s2, [OUT, #out_hi_head*4]
  149. vldr s3, [OUT, #out_hi_head*4 + 8]
  150. vldr s4, [OUT, #out_lo_head*4 + 4]
  151. vldr s5, [OUT, #out_lo_head*4 + 12]
  152. vldr s6, [OUT, #out_hi_head*4 + 4]
  153. vldr s7, [OUT, #out_hi_head*4 + 12]
  154. .endif
  /* tail: store s8-s11 to the even-numbered words, in the same positions the
   * head loaded s0-s3 from. */
  155. .ifnc "\tail",""
  156. vstr s8, [OUT, #out_lo_tail*4]
  157. vstr s9, [OUT, #out_lo_tail*4 + 8]
  158. vstr s10, [OUT, #out_hi_tail*4]
  159. vstr s11, [OUT, #out_hi_tail*4 + 8]
  160. .endif
  /* head: s8.. = s4.. * s16..  Placed after the tail stores of s8-s11
   * because it overwrites those registers. */
  161. .ifnc "\head",""
  162. vmul.f s8, s4, s16 @ vector operation
  163. .endif
  /* tail: store s12-s15 to the odd-numbered words in mirrored order: hi
   * block before lo block, and offset +12 before +4 within each block. */
  164. .ifnc "\tail",""
  165. vstr s12, [OUT, #out_hi_tail*4 + 12]
  166. vstr s13, [OUT, #out_hi_tail*4 + 4]
  167. vstr s14, [OUT, #out_lo_tail*4 + 12]
  168. vstr s15, [OUT, #out_lo_tail*4 + 4]
  169. .endif
  /* head: s12.. = s0.. * s16.., then load the high TCOS pair to complete the
   * four TCOS values the next tail will use. */
  170. .ifnc "\head",""
  171. vmul.f s12, s0, s16 @ vector operation
  172. vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
  173. .endif
  /* Drop the per-expansion aliases so the next expansion can redefine them. */
  174. .unreq TCOS_D0_HEAD
  175. .unreq TCOS_D1_HEAD
  176. .unreq TCOS_S0_TAIL
  177. .ifnc "\head",""
  178. .set k, k + 2
  179. .endif
  180. .endm
  /* postrotation_innerloop_rolled: loop-body form of the software-pipelined
   * post-rotation step.
   *
   * Arguments:
   *   tail, head        - as in postrotation_innerloop: a non-empty value
   *                       enables that half of the pipeline.
   *   tcos_s0_head,
   *   tcos_s1_head      - the two S registers that receive the TCOS pair
   *                       loaded through the ascending TCOS pointer.
   *   tcos_s2_head,
   *   tcos_s3_head      - the two S registers that receive the TCOS pair
   *                       loaded through the descending TCOS_HI pointer.
   *   tcos_s0_tail      - first S register of the TCOS values the tail
   *                       multiplies by.
   *   out_offset_head   - byte offset applied to OUT (added) and OUT_HI
   *                       (subtracted) for the head loads.
   *   out_offset_tail   - the same for the tail stores.
   *
   * The head advances TSIN and TCOS by 8 bytes and moves TSIN_HI and TCOS_HI
   * down by 8 bytes. OUT and OUT_HI are not modified here; the caller steps
   * them. Requires FPSCR to be in short-vector mode for the instructions
   * tagged "vector operation".
   */
  181. .macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
  /* tail: s8.. -= s0.. * tcos..  This must precede the head loads further
   * down, which overwrite s0-s7. */
  182. .ifnc "\tail",""
  183. vmls.f s8, s0, \tcos_s0_tail @ vector operation
  184. .endif
  /* head: TSIN pairs into s16-s19 and the ascending TCOS pair into the
   * caller-chosen registers. */
  185. .ifnc "\head",""
  186. vldmia TSIN!, {s16,s17}
  187. vldmdb TSIN_HI!, {s18,s19}
  188. vldmia TCOS!, {\tcos_s0_head,\tcos_s1_head}
  189. .endif
  /* tail: s12.. += s4.. * tcos.. */
  190. .ifnc "\tail",""
  191. vmla.f s12, s4, \tcos_s0_tail @ vector operation
  192. .endif
  /* head: load one 16-byte block above OUT and one below OUT_HI; the
   * even-numbered words go to s0-s3, the odd-numbered words to s4-s7. */
  193. .ifnc "\head",""
  194. vldr s0, [OUT, #+\out_offset_head+0]
  195. vldr s1, [OUT, #+\out_offset_head+8]
  196. vldr s2, [OUT_HI, #-\out_offset_head-16]
  197. vldr s3, [OUT_HI, #-\out_offset_head-8]
  198. vldr s4, [OUT, #+\out_offset_head+4]
  199. vldr s5, [OUT, #+\out_offset_head+12]
  200. vldr s6, [OUT_HI, #-\out_offset_head-12]
  201. vldr s7, [OUT_HI, #-\out_offset_head-4]
  202. .endif
  /* tail: store s8-s11 to the even-numbered words. */
  203. .ifnc "\tail",""
  204. vstr s8, [OUT, #+\out_offset_tail+0]
  205. vstr s9, [OUT, #+\out_offset_tail+8]
  206. vstr s10, [OUT_HI, #-\out_offset_tail-16]
  207. vstr s11, [OUT_HI, #-\out_offset_tail-8]
  208. .endif
  /* head: s8.. = s4.. * s16..  Placed after the tail stores of s8-s11
   * because it overwrites those registers. */
  209. .ifnc "\head",""
  210. vmul.f s8, s4, s16 @ vector operation
  211. .endif
  /* tail: store s12-s15 to the odd-numbered words in mirrored order: the
   * OUT_HI block first, highest address first. */
  212. .ifnc "\tail",""
  213. vstr s12, [OUT_HI, #-\out_offset_tail-4]
  214. vstr s13, [OUT_HI, #-\out_offset_tail-12]
  215. vstr s14, [OUT, #+\out_offset_tail+12]
  216. vstr s15, [OUT, #+\out_offset_tail+4]
  217. .endif
  /* head: s12.. = s0.. * s16.., then load the descending TCOS pair. */
  218. .ifnc "\head",""
  219. vmul.f s12, s0, s16 @ vector operation
  220. vldmdb TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
  221. .endif
  222. .endm
  /* Half-length inverse MDCT using VFP short-vector arithmetic.
   *
   * Structure: a pre-rotation pass that scatters rotated input into the
   * output buffer through REVTAB, an in-place FFT call on the output buffer,
   * then an in-place post-rotation pass. Two code paths exist:
   *   - mdct_bits == 6: fully unrolled at assembly time, calls ff_fft16_vfp.
   *   - otherwise (label 10): run-time loops, calls the function pointer
   *     loaded from CONTEXT word 9.
   * FPSCR is switched to the short-vector setting only around the rotation
   * passes and is restored to its entry value before each call and before
   * returning. Callee-saved core registers and s16-s27 are saved on entry
   * and restored on exit in both paths.
   */
  223. /* void ff_imdct_half_vfp(FFTContext *s,
  224. * FFTSample *output,
  225. * const FFTSample *input)
  226. */
  227. function ff_imdct_half_vfp, export=1
  228. ldr ip, [CONTEXT, #5*4] @ mdct_bits
  229. teq ip, #6
  230. bne 10f
  /* ---- Unrolled path: n = 64, so n4 = 16 and n8 = 8. ---- */
  231. .set n, 1<<6
  232. .set n2, n/2
  233. .set n4, n/4
  234. .set n8, n/8
  235. push {v1-v5,lr}
  236. vpush {s16-s27}
  237. fmrx OLDFPSCR, FPSCR
  238. ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
  239. fmxr FPSCR, lr
  /* ORIGOUT shares a2 with J0, which the prerotation macro overwrites, so
   * the output pointer is moved to OUT first. */
  240. mov OUT, ORIGOUT
  /* Table pointers come from CONTEXT words 2, 6 and 7. */
  241. ldr REVTAB, [CONTEXT, #2*4]
  242. ldr TCOS, [CONTEXT, #6*4]
  243. ldr TSIN, [CONTEXT, #7*4]
  /* n8/2 = 4 expansions of the pre-rotation step, k = 0, 2, 4, 6. */
  244. .set k, 0
  245. .rept n8/2
  246. prerotation_innerloop
  247. .endr
  /* Restore the entry FPSCR for the call, then re-enable short vectors. */
  248. fmxr FPSCR, OLDFPSCR
  249. mov a1, OUT
  250. bl X(ff_fft16_vfp)
  251. ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
  252. fmxr FPSCR, lr
  /* Pipelined post-rotation: one head, n8/2 - 1 = 3 combined tail+head
   * expansions, one final tail; four steps in total. */
  253. .set k, 0
  254. postrotation_innerloop , head
  255. .rept n8/2 - 1
  256. postrotation_innerloop tail, head
  257. .endr
  258. postrotation_innerloop tail
  259. fmxr FPSCR, OLDFPSCR
  260. vpop {s16-s27}
  261. pop {v1-v5,pc}
  /* ---- Generic path: sizes are computed at run time from mdct_bits. ---- */
  262. 10:
  /* v6, sl and fp are saved in addition because IN_HI/OUT_HI, TCOS_HI and
   * TSIN_HI live in them. */
  263. push {v1-v6,sl,fp,lr}
  264. vpush {s16-s27}
  265. fmrx OLDFPSCR, FPSCR
  266. ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
  267. fmxr FPSCR, lr
  268. mov lr, #1
  269. mov OUT, ORIGOUT
  270. ldr REVTAB, [CONTEXT, #2*4]
  271. ldr TCOS, [CONTEXT, #6*4]
  272. ldr TSIN, [CONTEXT, #7*4]
  /* lr = 1 << mdct_bits (ip still holds the value loaded at entry). */
  273. mov lr, lr, lsl ip
  /* OLDFPSCR shares v5 with REVTAB_HI, so it is parked on the stack before
   * REVTAB_HI is computed. CONTEXT is saved alongside it and is restored by
   * the pop after the blx below.
   * NOTE(review): at that blx, 9 core registers + 12 single-precision
   * registers + 2 core registers (92 bytes) have been pushed, which is not
   * a multiple of 8. Confirm whether the fft_calc callee depends on the
   * 8-byte stack alignment the AAPCS specifies at public interfaces. */
  274. push {CONTEXT,OLDFPSCR}
  /* End pointers, in bytes from the respective start: input + 2*lr,
   * REVTAB + lr/2, trig tables + lr. */
  275. add IN_HI, IN, lr, lsl #1
  276. add REVTAB_HI, REVTAB, lr, lsr #1
  277. add TCOS_HI, TCOS, lr
  278. add TSIN_HI, TSIN, lr
  /* IN and IN_HI close in on each other by 16 bytes each per iteration; the
   * loop ends when they meet. */
  279. 0: prerotation_innerloop_rolled
  280. teq IN, IN_HI
  281. bne 0b
  /* Reload without popping: OLDFPSCR is needed now, and the stacked copy of
   * CONTEXT is needed again after the call. */
  282. ldmia sp, {CONTEXT,OLDFPSCR}
  /* Second argument for the call is the output buffer. */
  283. mov ORIGOUT, OUT
  284. fmxr FPSCR, OLDFPSCR
  285. ldr ip, [CONTEXT, #9*4]
  286. blx ip @ s->fft_calc(s, output)
  287. pop {CONTEXT,OLDFPSCR}
  288. ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
  289. ldr ip, [CONTEXT, #5*4] @ mdct_bits
  290. fmxr FPSCR, lr
  291. mov lr, #1
  292. mov lr, lr, lsl ip
  /* The prerotation loop ran lr/16 times and advanced TCOS and TSIN by 8
   * bytes each time, i.e. lr/2 bytes in total; step them back to the table
   * starts and rebuild the end pointers. */
  293. sub TCOS, TCOS, lr, lsr #1
  294. sub TSIN, TSIN, lr, lsr #1
  295. add OUT_HI, OUT, lr, lsl #1
  296. add TCOS_HI, TCOS, lr
  297. add TSIN_HI, TSIN, lr
  /* Pipelined post-rotation. Each trip round the loop performs two steps so
   * that the TCOS register sets s20-s23 and s24-s27 can alternate, as in the
   * unrolled macro. The prologue issues the first head and enters the loop
   * at its second half. */
  298. postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
  299. b 1f
  /* First half: step OUT and OUT_HI inwards by 32 bytes, then finish the
   * step whose data now lies at offset -16 and start the one at offset 0. */
  300. 0: add OUT, OUT, #32
  301. sub OUT_HI, OUT_HI, #32
  302. postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
  /* Second half: finish the step at offset 0 and start the one at 16. */
  303. 1: postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
  /* The trig pointers meet once every pair has been loaded. */
  304. teq TSIN, TSIN_HI
  305. bne 0b
  /* Epilogue: finish the last step started above (offset 16, TCOS values in
   * s24 onwards). */
  306. postrotation_innerloop_rolled tail,,,,,, s24,, 16
  307. fmxr FPSCR, OLDFPSCR
  308. vpop {s16-s27}
  309. pop {v1-v6,sl,fp,pc}
  310. endfunc
  /* Release every register alias defined with .req at the top of this file,
   * in the same order, so that the names are no longer bound to registers
   * for anything assembled after this point. */
  311. .unreq CONTEXT
  312. .unreq ORIGOUT
  313. .unreq IN
  314. .unreq OUT
  315. .unreq REVTAB
  316. .unreq TCOS
  317. .unreq TSIN
  318. .unreq OLDFPSCR
  319. .unreq J0
  320. .unreq J1
  321. .unreq J2
  322. .unreq J3
  323. .unreq REVTAB_HI
  324. .unreq IN_HI
  325. .unreq OUT_HI
  326. .unreq TCOS_HI
  327. .unreq TSIN_HI