You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

299 lines
9.9KB

  1. /*
  2. * Copyright (c) 2013 RISC OS Open Ltd
  3. * Author: Ben Avison <bavison@riscosopen.org>
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "libavutil/arm/asm.S"
  22. @ TODO: * FFTs wider than 16
  23. @ * dispatch code
  @ fft4_vfp: 4-point FFT (going by the name), computed in place on
  @ single-precision complex data.
  @   In:       a1 = z, pointer to 4 complex values, each an 8-byte {re, im}
  @             pair of floats. z[0]..z[3] are loaded, then overwritten.
  @   Clobbers: s0-s15 (d0-d7). Nothing is saved or restored.
  @   FPSCR is not modified here. The per-instruction comments describe one
  @   result each, which only holds when the vector length is 1 on entry.
  @   NOTE(review): confirm every caller enters in scalar mode.
  @   The "@ stall" lines are the original author's pipeline scheduling notes.
  24. function fft4_vfp
  25. vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
  26. vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
  27. vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
  28. vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
  29. @ stall
  @ First stage: sums and differences of the pairs (z[0], z[1]) and (z[2], z[3]).
  30. vadd.f s12, s0, s8 @ i0 = z[0].re + z[1].re
  31. vadd.f s13, s1, s9 @ i1 = z[0].im + z[1].im
  32. vadd.f s14, s2, s10 @ i2 = z[2].re + z[3].re
  33. vadd.f s15, s3, s11 @ i3 = z[2].im + z[3].im
  34. vsub.f s8, s0, s8 @ i4 = z[0].re - z[1].re
  35. vsub.f s9, s1, s9 @ i5 = z[0].im - z[1].im
  36. vsub.f s10, s2, s10 @ i6 = z[2].re - z[3].re
  37. vsub.f s11, s3, s11 @ i7 = z[2].im - z[3].im
  38. @ stall
  39. @ stall
  @ Second stage: results are built in s0-s7 so that d0..d3 hold z[0]..z[3].
  40. vadd.f s0, s12, s14 @ z[0].re = i0 + i2
  41. vsub.f s4, s12, s14 @ z[2].re = i0 - i2
  42. vadd.f s1, s13, s15 @ z[0].im = i1 + i3
  43. vsub.f s5, s13, s15 @ z[2].im = i1 - i3
  44. vadd.f s7, s9, s10 @ z[3].im = i5 + i6
  45. vsub.f s3, s9, s10 @ z[1].im = i5 - i6
  46. vadd.f s2, s8, s11 @ z[1].re = i4 + i7
  47. vsub.f s6, s8, s11 @ z[3].re = i4 - i7
  48. @ stall
  49. @ stall
  @ z[0] and z[2] are written first: their components were issued first above.
  50. vstr d0, [a1, #0*2*4] @ z[0] = s0,s1
  51. vstr d2, [a1, #2*2*4] @ z[2] = s4,s5
  52. @ stall
  53. @ stall
  54. vstr d1, [a1, #1*2*4] @ z[1] = s2,s3
  55. vstr d3, [a1, #3*2*4] @ z[3] = s6,s7
  56. bx lr
  57. endfunc
  @ macro_fft8_head: first part of the in-place 8-point FFT on z[0..7].
  @   Expects: a1 = z (8-byte {re, im} single-precision pairs), and FPSCR
  @            already set to vector length 4, stride 1 by the invoking
  @            function (both users in this file do so before expanding it).
  @   In that mode an instruction whose destination is in s8-s31 operates on
  @   four consecutive registers ("vector op"); if its last operand is in
  @   s0-s7 that operand is a scalar ("vector x scalar op"). Instructions
  @   writing s0-s7 stay scalar.
  @   Uses s0-s31; the invoking function must preserve s16-s31 itself.
  @   z[0..3] double as scratch memory for moving values between banks.
  @   On exit z[0], z[2], z[4], z[5], z[6], z[7] have been stored; the values
  @   for z[1] and z[3] are left in d8/d9 (s16-s19) for macro_fft8_tail.
  @   The interleaving of loads, vector ops and scalar ops is deliberate
  @   scheduling; re-check register lifetimes before reordering anything.
  58. .macro macro_fft8_head
  59. @ FFT4
  60. vldr d4, [a1, #0 * 2*4] @ s8,s9 = z[0]
  61. vldr d6, [a1, #1 * 2*4] @ s12,s13 = z[1]
  62. vldr d5, [a1, #2 * 2*4] @ s10,s11 = z[2]
  63. vldr d7, [a1, #3 * 2*4] @ s14,s15 = z[3]
  64. @ BF
  65. vldr d12, [a1, #4 * 2*4] @ s24,s25 = z[4]
  66. vadd.f s16, s8, s12 @ vector op: s16-s19 = s8-s11 + s12-s15
  67. vldr d14, [a1, #5 * 2*4] @ s28,s29 = z[5]
  68. vldr d13, [a1, #6 * 2*4] @ s26,s27 = z[6]
  69. vldr d15, [a1, #7 * 2*4] @ s30,s31 = z[7]
  70. vsub.f s20, s8, s12 @ vector op: s20-s23 = s8-s11 - s12-s15
  @ Scalar second stage of the FFT4 on z[0..3]; results land in s0-s7.
  71. vadd.f s0, s16, s18
  72. vsub.f s2, s16, s18
  73. vadd.f s1, s17, s19
  74. vsub.f s3, s17, s19
  75. vadd.f s7, s21, s22
  76. vsub.f s5, s21, s22
  77. vadd.f s4, s20, s23
  78. vsub.f s6, s20, s23
  79. vsub.f s20, s24, s28 @ vector op: s20-s23 = s24-s27 - s28-s31
  80. vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
  81. vstr d1, [a1, #1 * 2*4]
  82. vldr s0, cos1pi4 @ s0 may be reused: d0 was stored above
  83. vadd.f s16, s24, s28 @ vector op: s16-s19 = s24-s27 + s28-s31
  84. vstr d2, [a1, #2 * 2*4]
  85. vstr d3, [a1, #3 * 2*4]
  86. vldr d12, [a1, #0 * 2*4] @ s24,s25 = former d0
  87. @ TRANSFORM
  88. vmul.f s20, s20, s0 @ vector x scalar op: s20-s23 *= cos1pi4
  89. vldr d13, [a1, #1 * 2*4] @ s26,s27 = former d1
  90. vldr d14, [a1, #2 * 2*4] @ s28,s29 = former d2
  91. vldr d15, [a1, #3 * 2*4] @ s30,s31 = former d3
  92. @ BUTTERFLIES
  93. vadd.f s0, s18, s16
  94. vadd.f s1, s17, s19
  95. vsub.f s2, s17, s19
  96. vsub.f s3, s18, s16
  97. vadd.f s4, s21, s20
  98. vsub.f s5, s21, s20
  99. vadd.f s6, s22, s23
  100. vsub.f s7, s22, s23
  101. vadd.f s8, s0, s24 @ vector op: s8-s11 = s0-s3 + s24-s27
  102. vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
  103. vstr d1, [a1, #1 * 2*4]
  104. vldr d6, [a1, #0 * 2*4]
  105. vldr d7, [a1, #1 * 2*4]
  106. vadd.f s1, s5, s6
  107. vadd.f s0, s7, s4
  108. vsub.f s2, s5, s6
  109. vsub.f s3, s7, s4
  110. vsub.f s12, s24, s12 @ vector op: s12-s15 = s24-s27 - s12-s15
  111. vsub.f s5, s29, s1
  112. vsub.f s4, s28, s0
  113. vsub.f s6, s30, s2
  114. vsub.f s7, s31, s3
  115. vadd.f s16, s0, s28 @ vector op: s16-s19 = s0-s3 + s28-s31 (stored by macro_fft8_tail)
  @ Final stores; z[1] and z[3] (d8/d9) are deferred to macro_fft8_tail.
  116. vstr d6, [a1, #4 * 2*4]
  117. vstr d7, [a1, #6 * 2*4]
  118. vstr d4, [a1, #0 * 2*4]
  119. vstr d5, [a1, #2 * 2*4]
  120. vstr d2, [a1, #5 * 2*4]
  121. vstr d3, [a1, #7 * 2*4]
  122. .endm
  @ macro_fft8_tail: completes the 8-point FFT started by macro_fft8_head by
  @ storing the two results that macro left in registers.
  @   Expects: a1 = z; d8/d9 (s16-s19) still hold the values written by the
  @            last vector op of macro_fft8_head.
  @   Kept separate from the head so that ff_fft16_vfp can issue loads for
  @   its next stage between the two macros.
  123. .macro macro_fft8_tail
  124. vstr d8, [a1, #1 * 2*4] @ z[1] = s16,s17
  125. vstr d9, [a1, #3 * 2*4] @ z[3] = s18,s19
  126. .endm
  @ fft8_vfp: in-place 8-point FFT (going by the name) on single-precision
  @ complex data, built from macro_fft8_head + macro_fft8_tail.
  @   In:       a1 = z, pointer to 8 complex values ({re, im} floats, 8 bytes
  @             each).
  @   Clobbers: a2 (holds the caller's FPSCR for the whole body), a3,
  @             s0-s15. s16-s31 are pushed on entry and popped before return.
  @   FPSCR is switched to short-vector mode for the body and restored from
  @   a2 on exit; the macros use only a1, so a2 survives.
  @   Declared without export=1, unlike ff_fft16_vfp.
  127. function fft8_vfp
  128. ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
  129. fmrx a2, FPSCR @ a2 = caller's FPSCR, restored before returning
  130. fmxr FPSCR, a3 @ enter vector mode required by the macros
  131. vpush {s16-s31} @ the macros overwrite s16-s31
  132. macro_fft8_head
  133. macro_fft8_tail
  134. vpop {s16-s31}
  135. fmxr FPSCR, a2 @ put the caller's FPSCR back
  136. bx lr
  137. endfunc
  @ Single-precision twiddle constants, read with label-relative vldr.
  @ cos1pi4 and cos1pi8 must stay adjacent and in this order: ff_fft16_vfp
  @ fetches both with one doubleword load (vldr d1, cos1pi4).
  138. .align 3
  139. cos1pi4: @ cos(1*pi/4) = sqrt(2)/2
  140. .float 0.707106769084930419921875
  141. cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
  142. .float 0.92387950420379638671875
  143. cos3pi8: @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
  144. .float 0.3826834261417388916015625
  @ ff_fft16_vfp: in-place 16-point FFT (going by the name) on
  @ single-precision complex data. Exported (export=1).
  @   In:       a1 = z, pointer to 16 complex values ({re, im} floats, 8 bytes
  @             each); offsets 0..15 are all read and written.
  @   Clobbers: a2 (holds the caller's FPSCR for the whole body), a3,
  @             s0-s15. s16-s31 are pushed on entry and popped before return.
  @   FPSCR is switched to vector length 4, stride 1 for the body and
  @   restored on exit. In that mode an instruction whose destination is in
  @   s8-s31 works on four consecutive registers; if its last operand is in
  @   s0-s7, that operand is a scalar. Instructions writing s0-s7 are scalar.
  @   Several z[] slots are used as scratch to move s0-s3 into other banks;
  @   every slot gets its final value before return.
  @   The instruction order is hand-scheduled; re-check register lifetimes
  @   before reordering anything.
  145. function ff_fft16_vfp, export=1
  146. ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
  147. fmrx a2, FPSCR @ a2 = caller's FPSCR, restored before returning
  148. fmxr FPSCR, a3
  149. vpush {s16-s31}
  150. macro_fft8_head
  151. @ FFT4(z+8)
  152. vldr d10, [a1, #8 * 2*4] @ s20,s21 = z[8]
  153. vldr d12, [a1, #9 * 2*4] @ s24,s25 = z[9]
  154. vldr d11, [a1, #10 * 2*4] @ s22,s23 = z[10]
  155. vldr d13, [a1, #11 * 2*4] @ s26,s27 = z[11]
  156. macro_fft8_tail
  157. vadd.f s16, s20, s24 @ vector op: s16-s19 = s20-s23 + s24-s27
  158. @ FFT4(z+12)
  159. vldr d4, [a1, #12 * 2*4] @ s8,s9 = z[12]
  160. vldr d6, [a1, #13 * 2*4] @ s12,s13 = z[13]
  161. vldr d5, [a1, #14 * 2*4] @ s10,s11 = z[14]
  162. vsub.f s20, s20, s24 @ vector op: s20-s23 = s20-s23 - s24-s27
  163. vldr d7, [a1, #15 * 2*4] @ s14,s15 = z[15]
  @ Scalar second stage of FFT4(z+8), same register layout as fft4_vfp.
  164. vadd.f s0, s16, s18 @ z[8].re
  165. vsub.f s4, s16, s18 @ z[10].re
  166. vadd.f s1, s17, s19 @ z[8].im
  167. vsub.f s5, s17, s19 @ z[10].im
  168. vadd.f s7, s21, s22 @ z[11].im
  169. vsub.f s3, s21, s22 @ z[9].im
  170. vadd.f s2, s20, s23 @ z[9].re
  171. vsub.f s6, s20, s23 @ z[11].re
  172. vadd.f s16, s8, s12 @ vector op: s16-s19 = s8-s11 + s12-s15
  173. vstr d0, [a1, #8 * 2*4]
  174. vstr d2, [a1, #10 * 2*4]
  175. vstr d1, [a1, #9 * 2*4]
  176. vsub.f s20, s8, s12 @ vector op too (destination s20): s20-s23 = s8-s11 - s12-s15
  177. vstr d3, [a1, #11 * 2*4]
  178. @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
  179. vldr d12, [a1, #10 * 2*4] @ s24,s25 = z[10] as stored above
  @ Scalar second stage of FFT4(z+12): d0 and d1 go to z[12] and z[13], d2 to
  @ z[15]; s6/s7 are consumed from registers without being stored first.
  180. vadd.f s0, s16, s18
  181. vadd.f s1, s17, s19
  182. vsub.f s6, s16, s18
  183. vsub.f s7, s17, s19
  184. vsub.f s3, s21, s22
  185. vadd.f s2, s20, s23
  186. vadd.f s5, s21, s22
  187. vsub.f s4, s20, s23
  188. vstr d0, [a1, #12 * 2*4]
  189. vmov s0, s6 @ copy: s6 is overwritten below while its old value is still needed
  190. @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
  191. vldr d6, [a1, #9 * 2*4] @ s12,s13 = z[9] as stored above
  192. vstr d1, [a1, #13 * 2*4]
  193. vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8 (relies on the two constants being adjacent)
  194. vstr d2, [a1, #15 * 2*4]
  195. vldr d7, [a1, #13 * 2*4] @ s14,s15 = z[13] as stored above
  196. vadd.f s4, s25, s24
  197. vsub.f s5, s25, s24
  198. vsub.f s6, s0, s7 @ s0 = old s6 (see vmov above)
  199. vadd.f s7, s0, s7
  200. vmul.f s20, s12, s3 @ vector op (vector * scalar): s20-s23 = s12-s15 * cos1pi8
  201. @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
  202. vldr d4, [a1, #11 * 2*4] @ s8,s9 = z[11] as stored above
  203. vldr d5, [a1, #15 * 2*4] @ s10,s11 = z[15] as stored above
  204. vldr s1, cos3pi8
  205. vmul.f s24, s4, s2 @ vector * scalar op: s24-s27 = s4-s7 * cos1pi4
  206. vmul.f s28, s12, s1 @ vector * scalar op: s28-s31 = s12-s15 * cos3pi8
  207. vmul.f s12, s8, s1 @ vector * scalar op: s12-s15 = s8-s11 * cos3pi8
  208. vadd.f s4, s20, s29
  209. vsub.f s5, s21, s28
  210. vsub.f s6, s22, s31
  211. vadd.f s7, s23, s30
  212. vmul.f s8, s8, s3 @ vector * scalar op: s8-s11 = s8-s11 * cos1pi8
  213. vldr d8, [a1, #1 * 2*4] @ s16,s17 = z[1]
  214. vldr d9, [a1, #5 * 2*4] @ s18,s19 = z[5]
  215. vldr d10, [a1, #3 * 2*4] @ s20,s21 = z[3]
  216. vldr d11, [a1, #7 * 2*4] @ s22,s23 = z[7]
  217. vldr d14, [a1, #2 * 2*4] @ s28,s29 = z[2]
  218. vadd.f s0, s6, s4
  219. vadd.f s1, s5, s7
  220. vsub.f s2, s5, s7
  221. vsub.f s3, s6, s4
  222. vadd.f s4, s12, s9
  223. vsub.f s5, s13, s8
  224. vsub.f s6, s14, s11
  225. vadd.f s7, s15, s10
  226. vadd.f s12, s0, s16 @ vector op: s12-s15 = s0-s3 + s16-s19
  227. vstr d0, [a1, #1 * 2*4] @ transfer s0-s3 to s8-s11 via memory (z[1], z[5] as scratch)
  228. vstr d1, [a1, #5 * 2*4]
  229. vldr d4, [a1, #1 * 2*4]
  230. vldr d5, [a1, #5 * 2*4]
  231. vadd.f s0, s6, s4
  232. vadd.f s1, s5, s7
  233. vsub.f s2, s5, s7
  234. vsub.f s3, s6, s4
  235. vsub.f s8, s16, s8 @ vector op: s8-s11 = s16-s19 - s8-s11
  236. vstr d6, [a1, #1 * 2*4] @ final z[1]
  237. vstr d7, [a1, #5 * 2*4] @ final z[5]
  238. vldr d15, [a1, #6 * 2*4] @ s30,s31 = z[6]
  239. vsub.f s4, s20, s0
  240. vsub.f s5, s21, s1
  241. vsub.f s6, s22, s2
  242. vsub.f s7, s23, s3
  243. vadd.f s20, s0, s20 @ vector op: s20-s23 = s0-s3 + s20-s23
  244. vstr d4, [a1, #9 * 2*4] @ final z[9]
  245. @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
  246. vldr d6, [a1, #8 * 2*4] @ s12,s13 = z[8]
  247. vstr d5, [a1, #13 * 2*4] @ final z[13]
  248. vldr d7, [a1, #12 * 2*4] @ s14,s15 = z[12]
  249. vstr d2, [a1, #11 * 2*4] @ final z[11]
  250. vldr d8, [a1, #0 * 2*4] @ s16,s17 = z[0]
  251. vstr d3, [a1, #15 * 2*4] @ final z[15]
  252. vldr d9, [a1, #4 * 2*4] @ s18,s19 = z[4]
  253. vadd.f s0, s26, s24
  254. vadd.f s1, s25, s27
  255. vsub.f s2, s25, s27
  256. vsub.f s3, s26, s24
  257. vadd.f s4, s14, s12
  258. vadd.f s5, s13, s15
  259. vsub.f s6, s13, s15
  260. vsub.f s7, s14, s12
  261. vadd.f s8, s0, s28 @ vector op: s8-s11 = s0-s3 + s28-s31
  262. vstr d0, [a1, #3 * 2*4] @ transfer s0-s3 to s12-s15 via memory (z[3], z[7] as scratch)
  263. vstr d1, [a1, #7 * 2*4]
  264. vldr d6, [a1, #3 * 2*4]
  265. vldr d7, [a1, #7 * 2*4]
  266. vsub.f s0, s16, s4
  267. vsub.f s1, s17, s5
  268. vsub.f s2, s18, s6
  269. vsub.f s3, s19, s7
  270. vsub.f s12, s28, s12 @ vector op: s12-s15 = s28-s31 - s12-s15
  271. vadd.f s16, s4, s16 @ vector op: s16-s19 = s4-s7 + s16-s19
  @ Remaining final stores.
  272. vstr d10, [a1, #3 * 2*4]
  273. vstr d11, [a1, #7 * 2*4]
  274. vstr d4, [a1, #2 * 2*4]
  275. vstr d5, [a1, #6 * 2*4]
  276. vstr d0, [a1, #8 * 2*4]
  277. vstr d1, [a1, #12 * 2*4]
  278. vstr d6, [a1, #10 * 2*4]
  279. vstr d7, [a1, #14 * 2*4]
  280. vstr d8, [a1, #0 * 2*4]
  281. vstr d9, [a1, #4 * 2*4]
  282. vpop {s16-s31}
  283. fmxr FPSCR, a2 @ put the caller's FPSCR back
  284. bx lr
  285. endfunc