You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

443 lines
18KB

  1. /*
  2. * ARM NEON optimised FFT
  3. *
  4. * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
  5. * Copyright (c) 2009 Naotoshi Nojiri
  6. * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
  7. *
  8. * This algorithm (though not any of the implementation details) is
  9. * based on libdjbfft by D. J. Bernstein.
  10. *
  11. * This file is part of Libav.
  12. *
  13. * Libav is free software; you can redistribute it and/or
  14. * modify it under the terms of the GNU Lesser General Public
  15. * License as published by the Free Software Foundation; either
  16. * version 2.1 of the License, or (at your option) any later version.
  17. *
  18. * Libav is distributed in the hope that it will be useful,
  19. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  21. * Lesser General Public License for more details.
  22. *
  23. * You should have received a copy of the GNU Lesser General Public
  24. * License along with Libav; if not, write to the Free Software
  25. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  26. */
  27. #include "libavutil/aarch64/asm.S"
  28. #define M_SQRT1_2 0.70710678118654752440
  // transpose d0, d1, s0, s1
  //   d0 = trn1(s0, s1)
  //   d1 = trn2(s0, s1)
  // trn1 is emitted first, so d0 must not name the same register as s0
  // or s1 when trn2 is meant to read the unmodified sources.
  .macro  transpose       d0, d1, s0, s1
          trn1            \d0, \s0, \s1
          trn2            \d1, \s0, \s1
  .endm
  // fft4_neon: 4-point transform, in place.
  //   In:     x0 = z, four interleaved {re, im} float pairs (32 bytes)
  //   Out:    the same 32 bytes are overwritten; x0 is left unchanged
  //   Writes: v0-v7, v16, v17
  // Lane comments list lane 0 first, then lane 1.
  function fft4_neon
          ld1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]   // v0..v3 = z[0]..z[3]

          fadd            v4.2s, v0.2s, v1.2s               // r0+r1, i0+i1
          fsub            v6.2s, v0.2s, v1.2s               // r0-r1, i0-i1

          ext             v16.8b, v2.8b, v3.8b, #4          // i2, r3
          ext             v17.8b, v3.8b, v2.8b, #4          // i3, r2

          fadd            v5.2s, v2.2s, v3.2s               // r2+r3, i2+i3
          fsub            v7.2s, v16.2s, v17.2s             // i2-i3, r3-r2

          fadd            v0.2s, v4.2s, v5.2s               // z[0]
          fsub            v2.2s, v4.2s, v5.2s               // z[2]
          fadd            v1.2s, v6.2s, v7.2s               // z[1]
          fsub            v3.2s, v6.2s, v7.2s               // z[3]

          st1             {v0.2s,v1.2s,v2.2s,v3.2s}, [x0]
          ret
  endfunc
  // fft8_neon: 8-point transform, in place.
  //   In:      x0 = z, eight interleaved {re, im} float pairs (64 bytes)
  //   Expects: v28 = the mppm constant {-w, w, w, -w}, w = M_SQRT1_2
  //            (ff_fft_calc_neon loads it before dispatching here)
  //   Out:     z[0..3] stored through x1 (the entry value of x0),
  //            z[4..7] stored through x0, which is left at z + 32 bytes
  //   Writes:  x0, x1, v0-v7, v16-v27
  // The instruction order is the original interleaving and is kept as is.
  function fft8_neon
          mov             x1, x0                            // keep &z[0] for the final store
          ld1             {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32   // z[0..3]
          ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]       // z[4..7]
          ext             v22.8b, v2.8b, v3.8b, #4          // i2, r3
          ext             v23.8b, v3.8b, v2.8b, #4          // i3, r2
          fadd            v4.2s, v16.2s, v17.2s             // r4+r5,i4+i5
          fadd            v5.2s, v18.2s, v19.2s             // r6+r7,i6+i7
          fsub            v17.2s, v16.2s, v17.2s            // r4-r5,i4-i5
          fsub            v19.2s, v18.2s, v19.2s            // r6-r7,i6-i7
          rev64           v27.2s, v28.2s                    // low half of v28 swapped: {w,-w}
          fadd            v20.2s, v0.2s, v1.2s              // r0+r1,i0+i1
          fadd            v21.2s, v2.2s, v3.2s              // r2+r3,i2+i3
          fmul            v26.2s, v17.2s, v28.2s            // -a2r*w,a2i*w
          ext             v6.8b, v4.8b, v5.8b, #4
          ext             v7.8b, v5.8b, v4.8b, #4
          fmul            v27.2s, v19.2s, v27.2s            // a3r*w,-a3i*w
          fsub            v23.2s, v22.2s, v23.2s            // i2-i3,r3-r2
          fsub            v22.2s, v0.2s, v1.2s              // r0-r1,i0-i1
          fmul            v24.2s, v17.2s, v28.s[1]          // a2r*w,a2i*w
          fmul            v25.2s, v19.2s, v28.s[1]          // a3r*w,a3i*w
          fadd            v0.2s, v20.2s, v21.2s
          fsub            v2.2s, v20.2s, v21.2s
          fadd            v1.2s, v22.2s, v23.2s
          rev64           v26.2s, v26.2s
          rev64           v27.2s, v27.2s
          fsub            v3.2s, v22.2s, v23.2s
          fsub            v6.2s, v6.2s, v7.2s
          fadd            v24.2s, v24.2s, v26.2s            // a2r+a2i,a2i-a2r   t1,t2
          fadd            v25.2s, v25.2s, v27.2s            // a3r-a3i,a3i+a3r   t5,t6
          fadd            v7.2s, v4.2s, v5.2s
          fsub            v18.2s, v2.2s, v6.2s
          ext             v26.8b, v24.8b, v25.8b, #4
          ext             v27.8b, v25.8b, v24.8b, #4
          fadd            v2.2s, v2.2s, v6.2s
          fsub            v16.2s, v0.2s, v7.2s
          fadd            v5.2s, v25.2s, v24.2s
          fsub            v4.2s, v26.2s, v27.2s
          fadd            v0.2s, v0.2s, v7.2s
          fsub            v17.2s, v1.2s, v5.2s
          fsub            v19.2s, v3.2s, v4.2s
          fadd            v3.2s, v3.2s, v4.2s
          fadd            v1.2s, v1.2s, v5.2s

          st1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0]       // z[4..7]
          st1             {v0.2s, v1.2s, v2.2s, v3.2s}, [x1]        // z[0..3]
          ret
  endfunc
  // fft16_neon: 16-point transform, in place.
  //   In:      x0 = z, sixteen interleaved {re, im} float pairs (128 bytes)
  //   Expects (all set up by ff_fft_calc_neon before dispatch):
  //            x14 = address of ff_cos_16
  //            v28 = mppm, v29 = pmmp
  //            v30 = trans4_float, v31 = trans8_float (tbl index vectors)
  //   Out:     all 128 bytes rewritten through x1; x0 and x1 both end at
  //            z + 128 bytes
  //   Writes:  x0, x1, v0-v7, v16-v27
  // The first part is the fft8_neon sequence on z[0..7] with the loads of
  // z[8..15] and the two 4-point transforms interleaved into it.  The
  // instruction order is the original interleaving and is kept as is.
  function fft16_neon
          mov             x1, x0                            // keep &z[0] for the stores
          ld1             {v0.2s, v1.2s, v2.2s, v3.2s}, [x0], #32   // z[0..3]
          ld1             {v16.2s,v17.2s,v18.2s,v19.2s}, [x0], #32  // z[4..7]
          ext             v22.8b, v2.8b, v3.8b, #4          // i2, r3
          ext             v23.8b, v3.8b, v2.8b, #4          // i3, r2
          fadd            v4.2s, v16.2s, v17.2s             // r4+r5,i4+i5
          fadd            v5.2s, v18.2s, v19.2s             // r6+r7,i6+i7
          fsub            v17.2s, v16.2s, v17.2s            // r4-r5,i4-i5
          fsub            v19.2s, v18.2s, v19.2s            // r6-r7,i6-i7
          rev64           v27.2s, v28.2s                    // low half of v28 swapped: {w,-w}
          fadd            v20.2s, v0.2s, v1.2s              // r0+r1,i0+i1
          fadd            v21.2s, v2.2s, v3.2s              // r2+r3,i2+i3
          fmul            v26.2s, v17.2s, v28.2s            // -a2r*w,a2i*w
          ext             v6.8b, v4.8b, v5.8b, #4
          ext             v7.8b, v5.8b, v4.8b, #4
          fmul            v27.2s, v19.2s, v27.2s            // a3r*w,-a3i*w
          fsub            v23.2s, v22.2s, v23.2s            // i2-i3,r3-r2
          fsub            v22.2s, v0.2s, v1.2s              // r0-r1,i0-i1
          fmul            v24.2s, v17.2s, v28.s[1]          // a2r*w,a2i*w
          fmul            v25.2s, v19.2s, v28.s[1]          // a3r*w,a3i*w
          fadd            v0.2s, v20.2s, v21.2s
          fsub            v2.2s, v20.2s, v21.2s
          fadd            v1.2s, v22.2s, v23.2s
          rev64           v26.2s, v26.2s
          rev64           v27.2s, v27.2s
          fsub            v3.2s, v22.2s, v23.2s
          fsub            v6.2s, v6.2s, v7.2s
          fadd            v24.2s, v24.2s, v26.2s            // a2r+a2i,a2i-a2r   t1,t2
          fadd            v25.2s, v25.2s, v27.2s            // a3r-a3i,a3i+a3r   t5,t6
          fadd            v7.2s, v4.2s, v5.2s
          fsub            v18.2s, v2.2s, v6.2s
          ld1             {v20.4s,v21.4s}, [x0], #32        // z[8..11]
          ld1             {v22.4s,v23.4s}, [x0], #32        // z[12..15]
          ext             v26.8b, v24.8b, v25.8b, #4
          ext             v27.8b, v25.8b, v24.8b, #4
          fadd            v2.2s, v2.2s, v6.2s
          fsub            v16.2s, v0.2s, v7.2s
          fadd            v5.2s, v25.2s, v24.2s
          fsub            v4.2s, v26.2s, v27.2s
          transpose       v24.2d, v25.2d, v20.2d, v22.2d
          transpose       v26.2d, v27.2d, v21.2d, v23.2d
          fadd            v0.2s, v0.2s, v7.2s
          fsub            v17.2s, v1.2s, v5.2s
          fsub            v19.2s, v3.2s, v4.2s
          fadd            v3.2s, v3.2s, v4.2s
          fadd            v1.2s, v1.2s, v5.2s
          ext             v20.16b, v21.16b, v21.16b, #4
          ext             v21.16b, v23.16b, v23.16b, #4

          zip1            v0.2d, v0.2d, v1.2d               // {z[0],   z[1]}
          zip1            v1.2d, v2.2d, v3.2d               // {z[2],   z[3]}
          zip1            v2.2d, v16.2d, v17.2d             // {z[o1],  z[o1+1]}
          zip1            v3.2d, v18.2d, v19.2d             // {z[o1+2],z[o1+3]}

          // 2 x fft4
          transpose       v22.2d, v23.2d, v20.2d, v21.2d

          fadd            v4.4s, v24.4s, v25.4s
          fadd            v5.4s, v26.4s, v27.4s

          fsub            v6.4s, v24.4s, v25.4s
          fsub            v7.4s, v22.4s, v23.4s

          ld1             {v23.4s}, [x14]                   // first four floats of ff_cos_16

          fadd            v24.4s, v4.4s, v5.4s              // {z[o2+0],z[o2+1]}
          fsub            v26.4s, v4.4s, v5.4s              // {z[o2+2],z[o2+3]}
          fadd            v25.4s, v6.4s, v7.4s              // {z[o3+0],z[o3+1]}
          fsub            v27.4s, v6.4s, v7.4s              // {z[o3+2],z[o3+3]}

          // fft_pass_neon_16, first half
          rev64           v7.4s, v25.4s
          fmul            v25.4s, v25.4s, v23.s[1]
          fmul            v7.4s, v7.4s, v29.4s
          fmla            v25.4s, v7.4s, v23.s[3]           // {t1a,t2a,t5a,t6a}

          zip1            v20.4s, v24.4s, v25.4s
          zip2            v21.4s, v24.4s, v25.4s
          fneg            v22.4s, v20.4s
          fadd            v4.4s, v21.4s, v20.4s

          fsub            v6.4s, v20.4s, v21.4s             // just the second half
          fadd            v5.4s, v21.4s, v22.4s             // just the first half

          tbl             v4.16b, {v4.16b}, v30.16b         // trans4_float
          tbl             v5.16b, {v5.16b,v6.16b}, v31.16b  // trans8_float

          fsub            v20.4s, v0.4s, v4.4s              // {z[o2],z[o2+1]}
          fadd            v16.4s, v0.4s, v4.4s              // {z[0], z[1]}
          fsub            v22.4s, v2.4s, v5.4s              // {z[o3],z[o3+1]}
          fadd            v18.4s, v2.4s, v5.4s              // {z[o1],z[o1+1]}

          // second half
          rev64           v6.4s, v26.4s
          fmul            v26.4s, v26.4s, v23.s[2]
          rev64           v7.4s, v27.4s
          fmul            v27.4s, v27.4s, v23.s[3]
          fmul            v6.4s, v6.4s, v29.4s
          fmul            v7.4s, v7.4s, v29.4s
          fmla            v26.4s, v6.4s, v23.s[2]           // {t1,t2,t5,t6}
          fmla            v27.4s, v7.4s, v23.s[1]           // {t1a,t2a,t5a,t6a}

          zip1            v24.4s, v26.4s, v27.4s
          zip2            v25.4s, v26.4s, v27.4s
          fneg            v26.4s, v24.4s
          fadd            v4.4s, v25.4s, v24.4s
          fsub            v6.4s, v24.4s, v25.4s             // just the second half
          fadd            v5.4s, v25.4s, v26.4s             // just the first half

          tbl             v4.16b, {v4.16b}, v30.16b         // trans4_float
          tbl             v5.16b, {v5.16b,v6.16b}, v31.16b  // trans8_float

          fadd            v17.4s, v1.4s, v4.4s              // {z[2], z[3]}
          fsub            v21.4s, v1.4s, v4.4s              // {z[o2+2],z[o2+3]}
          fadd            v19.4s, v3.4s, v5.4s              // {z[o1+2],z[o1+3]}
          fsub            v23.4s, v3.4s, v5.4s              // {z[o3+2],z[o3+3]}

          st1             {v16.4s,v17.4s}, [x1], #32
          st1             {v18.4s,v19.4s}, [x1], #32
          st1             {v20.4s,v21.4s}, [x1], #32
          st1             {v22.4s,v23.4s}, [x1], #32

          ret
  endfunc
  // Byte indices for a one-register tbl (loaded into v30 by
  // ff_fft_calc_neon).  Treating the source as four 32-bit lanes, the
  // result is {lane 0, lane 2, lane 1, lane 3}.
  const   trans4_float, align=4
          .byte            0,  1,  2,  3,    8,  9, 10, 11
          .byte            4,  5,  6,  7,   12, 13, 14, 15
  endconst
  // Byte indices for a two-register tbl (loaded into v31 by
  // ff_fft_calc_neon).  Indices 0-15 address the first table register and
  // 16-31 the second, so as 32-bit lanes the result is
  // {second[2], first[0], second[3], first[1]}.
  const   trans8_float, align=4
          .byte           24, 25, 26, 27,    0,  1,  2,  3
          .byte           28, 29, 30, 31,    4,  5,  6,  7
  endconst
  // fft_pass_neon: combining pass over four quarter blocks of z, in place.
  //   In:      x0 = z
  //            x2 = n (iteration count; each iteration handles two complex
  //                 values from each of z, z+o1, z+o2, z+o3)
  //            x4 = wre, walked forwards 8 bytes per iteration
  //   Expects (set up by ff_fft_calc_neon):
  //            x7  = -8, post-index step that walks wim backwards
  //            v29 = pmmp
  //            v30 = trans4_float, v31 = trans8_float (tbl index vectors)
  //   Writes:  x0-x6, v4-v7, v16, v17, v20-v27, flags
  // The first iteration is peeled because it loads only wim[-1]; the loop
  // then runs until the n - 1 counter in x6 reaches zero.
  // NOTE(review): the loop is bottom-tested, so n == 1 would not terminate
  // after one pass; the callers visible in this file pass n >= 4.
  // The instruction order is the original interleaving and is kept as is.
  function fft_pass_neon
          sub             x6, x2, #1                        // n - 1, loop counter
          lsl             x5, x2, #3                        // 2 * n * sizeof FFTSample
          lsl             x1, x2, #4                        // 2 * n * sizeof FFTComplex
          add             x5, x4, x5                        // wim
          add             x3, x1, x2, lsl #5                // 4 * n * sizeof FFTComplex
          add             x2, x0, x2, lsl #5                // &z[o2]
          add             x3, x0, x3                        // &z[o3]
          add             x1, x0, x1                        // &z[o1]

          // peeled first iteration
          ld1             {v20.4s}, [x2]                    // {z[o2],z[o2+1]}
          ld1             {v22.4s}, [x3]                    // {z[o3],z[o3+1]}
          ld1             {v4.2s}, [x4], #8                 // {wre[0],wre[1]}
          trn2            v25.2d, v20.2d, v22.2d
          sub             x5, x5, #4                        // wim--
          trn1            v24.2d, v20.2d, v22.2d
          ld1             {v5.s}[0], [x5], x7               // v5.s[0] = wim[-1]
          rev64           v7.4s, v25.4s
          fmul            v25.4s, v25.4s, v4.s[1]
          ld1             {v16.4s}, [x0]                    // {z[0],z[1]}
          fmul            v7.4s, v7.4s, v29.4s
          ld1             {v17.4s}, [x1]                    // {z[o1],z[o1+1]}
          prfm            pldl1keep, [x2, #16]
          prfm            pldl1keep, [x3, #16]
          fmla            v25.4s, v7.4s, v5.s[0]            // {t1a,t2a,t5a,t6a}
          prfm            pldl1keep, [x0, #16]
          prfm            pldl1keep, [x1, #16]

          zip1            v20.4s, v24.4s, v25.4s
          zip2            v21.4s, v24.4s, v25.4s
          fneg            v22.4s, v20.4s
          fadd            v4.4s, v21.4s, v20.4s
          fsub            v6.4s, v20.4s, v21.4s             // just the second half
          fadd            v5.4s, v21.4s, v22.4s             // just the first half

          tbl             v4.16b, {v4.16b}, v30.16b         // trans4_float
          tbl             v5.16b, {v5.16b,v6.16b}, v31.16b  // trans8_float

          fadd            v20.4s, v16.4s, v4.4s
          fsub            v22.4s, v16.4s, v4.4s
          fadd            v21.4s, v17.4s, v5.4s
          st1             {v20.4s}, [x0], #16               // {z[0], z[1]}
          fsub            v23.4s, v17.4s, v5.4s

          st1             {v21.4s}, [x1], #16               // {z[o1],z[o1+1]}
          st1             {v22.4s}, [x2], #16               // {z[o2],z[o2+1]}
          st1             {v23.4s}, [x3], #16               // {z[o3],z[o3+1]}
  1:
          ld1             {v20.4s}, [x2]                    // {z[o2],z[o2+1]}
          ld1             {v22.4s}, [x3]                    // {z[o3],z[o3+1]}
          ld1             {v4.2s}, [x4], #8                 // {wre[0],wre[1]}
          transpose       v26.2d, v27.2d, v20.2d, v22.2d
          ld1             {v5.2s}, [x5], x7                 // {wim[-1],wim[0]}
          rev64           v6.4s, v26.4s
          fmul            v26.4s, v26.4s, v4.s[0]
          rev64           v7.4s, v27.4s
          fmul            v27.4s, v27.4s, v4.s[1]
          fmul            v6.4s, v6.4s, v29.4s
          fmul            v7.4s, v7.4s, v29.4s
          ld1             {v16.4s}, [x0]                    // {z[0],z[1]}
          fmla            v26.4s, v6.4s, v5.s[1]            // {t1,t2,t5,t6}
          fmla            v27.4s, v7.4s, v5.s[0]            // {t1a,t2a,t5a,t6a}
          ld1             {v17.4s}, [x1]                    // {z[o1],z[o1+1]}

          subs            x6, x6, #1                        // n--; flags consumed by b.ne below

          zip1            v20.4s, v26.4s, v27.4s
          zip2            v21.4s, v26.4s, v27.4s
          fneg            v22.4s, v20.4s
          fadd            v4.4s, v21.4s, v20.4s
          fsub            v6.4s, v20.4s, v21.4s             // just the second half
          fadd            v5.4s, v21.4s, v22.4s             // just the first half

          tbl             v4.16b, {v4.16b}, v30.16b         // trans4_float
          tbl             v5.16b, {v5.16b,v6.16b}, v31.16b  // trans8_float

          fadd            v20.4s, v16.4s, v4.4s
          fsub            v22.4s, v16.4s, v4.4s
          fadd            v21.4s, v17.4s, v5.4s
          st1             {v20.4s}, [x0], #16               // {z[0], z[1]}
          fsub            v23.4s, v17.4s, v5.4s

          st1             {v21.4s}, [x1], #16               // {z[o1],z[o1+1]}
          st1             {v22.4s}, [x2], #16               // {z[o2],z[o2+1]}
          st1             {v23.4s}, [x3], #16               // {z[o3],z[o3+1]}
          b.ne            1b

          ret
  endfunc
  // def_fft n, n2, n4
  // Defines fft<n>_neon (64-byte aligned entry) as:
  //   1. fft<n2>_neon on z
  //   2. fft<n4>_neon on z + n4*2 complex values
  //   3. fft<n4>_neon on z + n4*3 complex values
  //   4. tail call to fft_pass_neon with x0 = z, x4 = ff_cos_<n>, x2 = n4/2
  // Byte offsets use 8 bytes per complex value.  x28 carries z + n4*2*8
  // across the calls; x28 and x30 are saved on a 16-byte stack slot and
  // restored before the tail call.
  .macro  def_fft         n, n2, n4
  function fft\n\()_neon, align=6
          stp             x28, x30, [sp, #-16]!
          add             x28, x0, #\n4*2*8
          bl              fft\n2\()_neon
          mov             x0, x28
          bl              fft\n4\()_neon
          add             x0, x28, #\n4*1*8
          bl              fft\n4\()_neon
          sub             x0, x28, #\n4*2*8
          ldp             x28, x30, [sp], #16
          movrel          x4, X(ff_cos_\n)
          mov             x2, #\n4>>1
          b               fft_pass_neon
  endfunc
  .endm
  // Instantiate fft32_neon .. fft65536_neon; each size is built from the
  // half-size and quarter-size routines named by its 2nd and 3rd argument.
          def_fft            32,    16,     8
          def_fft            64,    32,    16
          def_fft           128,    64,    32
          def_fft           256,   128,    64
          def_fft           512,   256,   128
          def_fft          1024,   512,   256
          def_fft          2048,  1024,   512
          def_fft          4096,  2048,  1024
          def_fft          8192,  4096,  2048
          def_fft         16384,  8192,  4096
          def_fft         32768, 16384,  8192
          def_fft         65536, 32768, 16384
  // ff_fft_calc_neon (exported): load the shared constants, then tail-jump
  // to the size-specific routine.
  //   In:   x0 = context pointer; its first 32-bit word is used as the
  //              table index after subtracting 2
  //         x1 = z, handed to the selected routine in x0
  //   Sets up for the callee:
  //         x7  = -8
  //         x14 = address of ff_cos_16
  //         v28 = mppm, v29 = pmmp
  //         v30 = trans4_float, v31 = trans8_float
  //   Writes: x0, x2, x3, x7, x10-x14, v28-v31
  function ff_fft_calc_neon, export=1
          prfm            pldl1keep, [x1]

          // dispatch target = fft_tab_neon[word at x0 - 2]
          ldr             w2, [x0]
          movrel          x3, fft_tab_neon
          sub             w2, w2, #2
          ldr             x3, [x3, x2, lsl #3]

          // tbl index vectors
          movrel          x10, trans4_float
          movrel          x11, trans8_float
          ld1             {v30.16b}, [x10]
          ld1             {v31.16b}, [x11]

          // sign and scale constants
          movrel          x12, pmmp
          movrel          x13, mppm
          ld1             {v29.4s}, [x12]                   // pmmp
          ld1             {v28.4s}, [x13]                   // mppm

          movrel          x14, X(ff_cos_16)
          mov             x7, #-8
          mov             x0, x1                            // callee takes z in x0
          br              x3
  endfunc
  // ff_fft_permute_neon (exported): scatter z into tmp_buf through revtab,
  // then copy tmp_buf back over z.
  //   In:   x0 = context pointer; reads nbits at +0 (32-bit), revtab at
  //              +8 and tmp_buf at +16 (64-bit pointers)
  //         x1 = z, 1 << nbits complex values of 8 bytes each
  //   Writes: x0-x6, v0, v1, flags
  // revtab is consumed as 16-bit entries, two per 32-bit load.
  // NOTE(review): the copy-back loop moves 32 bytes per step, so this
  // presumably relies on nbits >= 2 - confirm against callers.
  function ff_fft_permute_neon, export=1
          mov             x6, #1
          ldr             w2, [x0]                          // nbits
          ldr             x3, [x0, #16]                     // tmp_buf
          ldr             x0, [x0, #8]                      // revtab
          lsl             x6, x6, x2                        // n = 1 << nbits
          mov             x2, x6                            // second copy of n for the copy-back
  1:
          ld1             {v0.2s,v1.2s}, [x1], #16          // two complex values from z
          ldr             w4, [x0], #4                      // two 16-bit revtab entries
          uxth            w5, w4                            // low halfword
          lsr             w4, w4, #16                       // high halfword
          add             x5, x3, x5, lsl #3                // tmp_buf + 8 * low entry
          add             x4, x3, x4, lsl #3                // tmp_buf + 8 * high entry
          st1             {v0.2s}, [x5]
          st1             {v1.2s}, [x4]
          subs            x6, x6, #2
          b.gt            1b

          sub             x1, x1, x2, lsl #3                // rewind x1 to the start of z
  2:
          ld1             {v0.4s,v1.4s}, [x3], #32
          st1             {v0.4s,v1.4s}, [x1], #32
          subs            x2, x2, #4
          b.gt            2b

          ret
  endfunc
  // Dispatch table read by ff_fft_calc_neon: entry i is the routine for
  // 4 << i points, from fft4_neon (index 0) to fft65536_neon (index 14).
  const   fft_tab_neon, relocate=1
          .quad           fft4_neon,     fft8_neon,     fft16_neon
          .quad           fft32_neon,    fft64_neon,    fft128_neon
          .quad           fft256_neon,   fft512_neon,   fft1024_neon
          .quad           fft2048_neon,  fft4096_neon,  fft8192_neon
          .quad           fft16384_neon, fft32768_neon, fft65536_neon
  endconst
  // Sign pattern {+1, -1, -1, +1}; ff_fft_calc_neon loads it into v29,
  // where it multiplies the rev64-swapped operands in fft16_neon and
  // fft_pass_neon.
  const   pmmp, align=4
          .float          +1.0, -1.0
          .float          -1.0, +1.0
  endconst
  // {-w, w, w, -w} with w = M_SQRT1_2; ff_fft_calc_neon loads it into v28,
  // which fft8_neon and fft16_neon use as their scale factor.
  const   mppm, align=4
          .float          -M_SQRT1_2, M_SQRT1_2
          .float          M_SQRT1_2, -M_SQRT1_2
  endconst