You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

531 lines
19KB

  1. /*
  2. * Copyright (c) 2013 RISC OS Open Ltd
  3. * Author: Ben Avison <bavison@riscosopen.org>
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "libavutil/arm/asm.S"
@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
@ all single-precision VFP registers may be corrupted on exit. The a2
@ register may not be clobbered in these functions, as it holds the
@ stored original FPSCR.
  27. function ff_fft_calc_vfp, export=1
  28. ldr ip, [a1, #0] @ nbits
  29. mov a1, a2
  30. movrel a2, (fft_tab_vfp - 8)
  31. ldr pc, [a2, ip, lsl #2]
  32. endfunc
  33. const fft_tab_vfp, relocate=1
  34. .word fft4_vfp
  35. .word fft8_vfp
  36. .word X(ff_fft16_vfp) @ this one alone is exported
  37. .word fft32_vfp
  38. .word fft64_vfp
  39. .word fft128_vfp
  40. .word fft256_vfp
  41. .word fft512_vfp
  42. .word fft1024_vfp
  43. .word fft2048_vfp
  44. .word fft4096_vfp
  45. .word fft8192_vfp
  46. .word fft16384_vfp
  47. .word fft32768_vfp
  48. .word fft65536_vfp
  49. endconst
  50. function fft4_vfp
  51. vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
  52. vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
  53. vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
  54. vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
  55. @ stall
  56. vadd.f s12, s0, s8 @ i0
  57. vadd.f s13, s1, s9 @ i1
  58. vadd.f s14, s2, s10 @ i2
  59. vadd.f s15, s3, s11 @ i3
  60. vsub.f s8, s0, s8 @ i4
  61. vsub.f s9, s1, s9 @ i5
  62. vsub.f s10, s2, s10 @ i6
  63. vsub.f s11, s3, s11 @ i7
  64. @ stall
  65. @ stall
  66. vadd.f s0, s12, s14 @ z[0].re
  67. vsub.f s4, s12, s14 @ z[2].re
  68. vadd.f s1, s13, s15 @ z[0].im
  69. vsub.f s5, s13, s15 @ z[2].im
  70. vadd.f s7, s9, s10 @ z[3].im
  71. vsub.f s3, s9, s10 @ z[1].im
  72. vadd.f s2, s8, s11 @ z[1].re
  73. vsub.f s6, s8, s11 @ z[3].re
  74. @ stall
  75. @ stall
  76. vstr d0, [a1, #0*2*4]
  77. vstr d2, [a1, #2*2*4]
  78. @ stall
  79. @ stall
  80. vstr d1, [a1, #1*2*4]
  81. vstr d3, [a1, #3*2*4]
  82. bx lr
  83. endfunc
  84. .macro macro_fft8_head
  85. @ FFT4
  86. vldr d4, [a1, #0 * 2*4]
  87. vldr d6, [a1, #1 * 2*4]
  88. vldr d5, [a1, #2 * 2*4]
  89. vldr d7, [a1, #3 * 2*4]
  90. @ BF
  91. vldr d12, [a1, #4 * 2*4]
  92. vadd.f s16, s8, s12 @ vector op
  93. vldr d14, [a1, #5 * 2*4]
  94. vldr d13, [a1, #6 * 2*4]
  95. vldr d15, [a1, #7 * 2*4]
  96. vsub.f s20, s8, s12 @ vector op
  97. vadd.f s0, s16, s18
  98. vsub.f s2, s16, s18
  99. vadd.f s1, s17, s19
  100. vsub.f s3, s17, s19
  101. vadd.f s7, s21, s22
  102. vsub.f s5, s21, s22
  103. vadd.f s4, s20, s23
  104. vsub.f s6, s20, s23
  105. vsub.f s20, s24, s28 @ vector op
  106. vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
  107. vstr d1, [a1, #1 * 2*4]
  108. vldr s0, cos1pi4
  109. vadd.f s16, s24, s28 @ vector op
  110. vstr d2, [a1, #2 * 2*4]
  111. vstr d3, [a1, #3 * 2*4]
  112. vldr d12, [a1, #0 * 2*4]
  113. @ TRANSFORM
  114. vmul.f s20, s20, s0 @ vector x scalar op
  115. vldr d13, [a1, #1 * 2*4]
  116. vldr d14, [a1, #2 * 2*4]
  117. vldr d15, [a1, #3 * 2*4]
  118. @ BUTTERFLIES
  119. vadd.f s0, s18, s16
  120. vadd.f s1, s17, s19
  121. vsub.f s2, s17, s19
  122. vsub.f s3, s18, s16
  123. vadd.f s4, s21, s20
  124. vsub.f s5, s21, s20
  125. vadd.f s6, s22, s23
  126. vsub.f s7, s22, s23
  127. vadd.f s8, s0, s24 @ vector op
  128. vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
  129. vstr d1, [a1, #1 * 2*4]
  130. vldr d6, [a1, #0 * 2*4]
  131. vldr d7, [a1, #1 * 2*4]
  132. vadd.f s1, s5, s6
  133. vadd.f s0, s7, s4
  134. vsub.f s2, s5, s6
  135. vsub.f s3, s7, s4
  136. vsub.f s12, s24, s12 @ vector op
  137. vsub.f s5, s29, s1
  138. vsub.f s4, s28, s0
  139. vsub.f s6, s30, s2
  140. vsub.f s7, s31, s3
  141. vadd.f s16, s0, s28 @ vector op
  142. vstr d6, [a1, #4 * 2*4]
  143. vstr d7, [a1, #6 * 2*4]
  144. vstr d4, [a1, #0 * 2*4]
  145. vstr d5, [a1, #2 * 2*4]
  146. vstr d2, [a1, #5 * 2*4]
  147. vstr d3, [a1, #7 * 2*4]
  148. .endm
  149. .macro macro_fft8_tail
  150. vstr d8, [a1, #1 * 2*4]
  151. vstr d9, [a1, #3 * 2*4]
  152. .endm
  153. function .Lfft8_internal_vfp
  154. macro_fft8_head
  155. macro_fft8_tail
  156. bx lr
  157. endfunc
  158. function fft8_vfp
  159. ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
  160. fmrx a2, FPSCR
  161. fmxr FPSCR, a3
  162. vpush {s16-s31}
  163. mov ip, lr
  164. bl .Lfft8_internal_vfp
  165. vpop {s16-s31}
  166. fmxr FPSCR, a2
  167. bx ip
  168. endfunc
  169. .align 3
  170. cos1pi4: @ cos(1*pi/4) = sqrt(2)
  171. .float 0.707106769084930419921875
  172. cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
  173. .float 0.92387950420379638671875
  174. cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2
  175. .float 0.3826834261417388916015625
  176. function .Lfft16_internal_vfp
  177. macro_fft8_head
  178. @ FFT4(z+8)
  179. vldr d10, [a1, #8 * 2*4]
  180. vldr d12, [a1, #9 * 2*4]
  181. vldr d11, [a1, #10 * 2*4]
  182. vldr d13, [a1, #11 * 2*4]
  183. macro_fft8_tail
  184. vadd.f s16, s20, s24 @ vector op
  185. @ FFT4(z+12)
  186. vldr d4, [a1, #12 * 2*4]
  187. vldr d6, [a1, #13 * 2*4]
  188. vldr d5, [a1, #14 * 2*4]
  189. vsub.f s20, s20, s24 @ vector op
  190. vldr d7, [a1, #15 * 2*4]
  191. vadd.f s0, s16, s18
  192. vsub.f s4, s16, s18
  193. vadd.f s1, s17, s19
  194. vsub.f s5, s17, s19
  195. vadd.f s7, s21, s22
  196. vsub.f s3, s21, s22
  197. vadd.f s2, s20, s23
  198. vsub.f s6, s20, s23
  199. vadd.f s16, s8, s12 @ vector op
  200. vstr d0, [a1, #8 * 2*4]
  201. vstr d2, [a1, #10 * 2*4]
  202. vstr d1, [a1, #9 * 2*4]
  203. vsub.f s20, s8, s12
  204. vstr d3, [a1, #11 * 2*4]
  205. @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
  206. vldr d12, [a1, #10 * 2*4]
  207. vadd.f s0, s16, s18
  208. vadd.f s1, s17, s19
  209. vsub.f s6, s16, s18
  210. vsub.f s7, s17, s19
  211. vsub.f s3, s21, s22
  212. vadd.f s2, s20, s23
  213. vadd.f s5, s21, s22
  214. vsub.f s4, s20, s23
  215. vstr d0, [a1, #12 * 2*4]
  216. vmov s0, s6
  217. @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
  218. vldr d6, [a1, #9 * 2*4]
  219. vstr d1, [a1, #13 * 2*4]
  220. vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
  221. vstr d2, [a1, #15 * 2*4]
  222. vldr d7, [a1, #13 * 2*4]
  223. vadd.f s4, s25, s24
  224. vsub.f s5, s25, s24
  225. vsub.f s6, s0, s7
  226. vadd.f s7, s0, s7
  227. vmul.f s20, s12, s3 @ vector op
  228. @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
  229. vldr d4, [a1, #11 * 2*4]
  230. vldr d5, [a1, #15 * 2*4]
  231. vldr s1, cos3pi8
  232. vmul.f s24, s4, s2 @ vector * scalar op
  233. vmul.f s28, s12, s1 @ vector * scalar op
  234. vmul.f s12, s8, s1 @ vector * scalar op
  235. vadd.f s4, s20, s29
  236. vsub.f s5, s21, s28
  237. vsub.f s6, s22, s31
  238. vadd.f s7, s23, s30
  239. vmul.f s8, s8, s3 @ vector * scalar op
  240. vldr d8, [a1, #1 * 2*4]
  241. vldr d9, [a1, #5 * 2*4]
  242. vldr d10, [a1, #3 * 2*4]
  243. vldr d11, [a1, #7 * 2*4]
  244. vldr d14, [a1, #2 * 2*4]
  245. vadd.f s0, s6, s4
  246. vadd.f s1, s5, s7
  247. vsub.f s2, s5, s7
  248. vsub.f s3, s6, s4
  249. vadd.f s4, s12, s9
  250. vsub.f s5, s13, s8
  251. vsub.f s6, s14, s11
  252. vadd.f s7, s15, s10
  253. vadd.f s12, s0, s16 @ vector op
  254. vstr d0, [a1, #1 * 2*4]
  255. vstr d1, [a1, #5 * 2*4]
  256. vldr d4, [a1, #1 * 2*4]
  257. vldr d5, [a1, #5 * 2*4]
  258. vadd.f s0, s6, s4
  259. vadd.f s1, s5, s7
  260. vsub.f s2, s5, s7
  261. vsub.f s3, s6, s4
  262. vsub.f s8, s16, s8 @ vector op
  263. vstr d6, [a1, #1 * 2*4]
  264. vstr d7, [a1, #5 * 2*4]
  265. vldr d15, [a1, #6 * 2*4]
  266. vsub.f s4, s20, s0
  267. vsub.f s5, s21, s1
  268. vsub.f s6, s22, s2
  269. vsub.f s7, s23, s3
  270. vadd.f s20, s0, s20 @ vector op
  271. vstr d4, [a1, #9 * 2*4]
  272. @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
  273. vldr d6, [a1, #8 * 2*4]
  274. vstr d5, [a1, #13 * 2*4]
  275. vldr d7, [a1, #12 * 2*4]
  276. vstr d2, [a1, #11 * 2*4]
  277. vldr d8, [a1, #0 * 2*4]
  278. vstr d3, [a1, #15 * 2*4]
  279. vldr d9, [a1, #4 * 2*4]
  280. vadd.f s0, s26, s24
  281. vadd.f s1, s25, s27
  282. vsub.f s2, s25, s27
  283. vsub.f s3, s26, s24
  284. vadd.f s4, s14, s12
  285. vadd.f s5, s13, s15
  286. vsub.f s6, s13, s15
  287. vsub.f s7, s14, s12
  288. vadd.f s8, s0, s28 @ vector op
  289. vstr d0, [a1, #3 * 2*4]
  290. vstr d1, [a1, #7 * 2*4]
  291. vldr d6, [a1, #3 * 2*4]
  292. vldr d7, [a1, #7 * 2*4]
  293. vsub.f s0, s16, s4
  294. vsub.f s1, s17, s5
  295. vsub.f s2, s18, s6
  296. vsub.f s3, s19, s7
  297. vsub.f s12, s28, s12 @ vector op
  298. vadd.f s16, s4, s16 @ vector op
  299. vstr d10, [a1, #3 * 2*4]
  300. vstr d11, [a1, #7 * 2*4]
  301. vstr d4, [a1, #2 * 2*4]
  302. vstr d5, [a1, #6 * 2*4]
  303. vstr d0, [a1, #8 * 2*4]
  304. vstr d1, [a1, #12 * 2*4]
  305. vstr d6, [a1, #10 * 2*4]
  306. vstr d7, [a1, #14 * 2*4]
  307. vstr d8, [a1, #0 * 2*4]
  308. vstr d9, [a1, #4 * 2*4]
  309. bx lr
  310. endfunc
  311. function ff_fft16_vfp, export=1
  312. ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
  313. fmrx a2, FPSCR
  314. fmxr FPSCR, a3
  315. vpush {s16-s31}
  316. mov ip, lr
  317. bl .Lfft16_internal_vfp
  318. vpop {s16-s31}
  319. fmxr FPSCR, a2
  320. bx ip
  321. endfunc
  322. .macro pass n, z0, z1, z2, z3
  323. add v6, v5, #4*2*\n
  324. @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
  325. @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
  326. @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
  327. @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
  328. vldr d8, [\z2, #8*(o2+1)] @ s16,s17
  329. vldmdb v6!, {s2}
  330. vldr d9, [\z3, #8*(o3+1)] @ s18,s19
  331. vldmia v5!, {s0,s1} @ s0 is unused
  332. vldr s7, [\z2, #8*o2] @ t1
  333. vmul.f s20, s16, s2 @ vector * scalar
  334. vldr s0, [\z3, #8*o3] @ t5
  335. vldr s6, [\z2, #8*o2+4] @ t2
  336. vldr s3, [\z3, #8*o3+4] @ t6
  337. vmul.f s16, s16, s1 @ vector * scalar
  338. ldr a4, =\n-1
  339. 1: add \z0, \z0, #8*2
  340. .if \n*4*2 >= 512
  341. add \z1, \z1, #8*2
  342. .endif
  343. .if \n*4*2 >= 256
  344. add \z2, \z2, #8*2
  345. .endif
  346. .if \n*4*2 >= 512
  347. add \z3, \z3, #8*2
  348. .endif
  349. @ up to 2 stalls (VFP vector issuing / waiting for s0)
  350. @ depending upon whether this is the first iteration and
  351. @ how many add instructions are inserted above
  352. vadd.f s4, s0, s7 @ t5
  353. vadd.f s5, s6, s3 @ t6
  354. vsub.f s6, s6, s3 @ t4
  355. vsub.f s7, s0, s7 @ t3
  356. vldr d6, [\z0, #8*0-8*2] @ s12,s13
  357. vadd.f s0, s16, s21 @ t1
  358. vldr d7, [\z1, #8*o1-8*2] @ s14,s15
  359. vsub.f s1, s18, s23 @ t5
  360. vadd.f s8, s4, s12 @ vector + vector
  361. @ stall (VFP vector issuing)
  362. @ stall (VFP vector issuing)
  363. @ stall (VFP vector issuing)
  364. vsub.f s4, s12, s4
  365. vsub.f s5, s13, s5
  366. vsub.f s6, s14, s6
  367. vsub.f s7, s15, s7
  368. vsub.f s2, s17, s20 @ t2
  369. vadd.f s3, s19, s22 @ t6
  370. vstr d4, [\z0, #8*0-8*2] @ s8,s9
  371. vstr d5, [\z1, #8*o1-8*2] @ s10,s11
  372. @ stall (waiting for s5)
  373. vstr d2, [\z2, #8*o2-8*2] @ s4,s5
  374. vadd.f s4, s1, s0 @ t5
  375. vstr d3, [\z3, #8*o3-8*2] @ s6,s7
  376. vsub.f s7, s1, s0 @ t3
  377. vadd.f s5, s2, s3 @ t6
  378. vsub.f s6, s2, s3 @ t4
  379. vldr d6, [\z0, #8*1-8*2] @ s12,s13
  380. vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15
  381. vldr d4, [\z2, #8*o2] @ s8,s9
  382. vldmdb v6!, {s2,s3}
  383. vldr d5, [\z3, #8*o3] @ s10,s11
  384. vadd.f s20, s4, s12 @ vector + vector
  385. vldmia v5!, {s0,s1}
  386. vldr d8, [\z2, #8*(o2+1)] @ s16,s17
  387. @ stall (VFP vector issuing)
  388. vsub.f s4, s12, s4
  389. vsub.f s5, s13, s5
  390. vsub.f s6, s14, s6
  391. vsub.f s7, s15, s7
  392. vmul.f s12, s8, s3 @ vector * scalar
  393. vstr d10, [\z0, #8*1-8*2] @ s20,s21
  394. vldr d9, [\z3, #8*(o3+1)] @ s18,s19
  395. vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23
  396. vmul.f s8, s8, s0 @ vector * scalar
  397. vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5
  398. @ stall (waiting for s7)
  399. vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7
  400. vmul.f s20, s16, s2 @ vector * scalar
  401. @ stall (VFP vector issuing)
  402. @ stall (VFP vector issuing)
  403. @ stall (VFP vector issuing)
  404. vadd.f s7, s8, s13 @ t1
  405. vsub.f s6, s9, s12 @ t2
  406. vsub.f s0, s10, s15 @ t5
  407. vadd.f s3, s11, s14 @ t6
  408. vmul.f s16, s16, s1 @ vector * scalar
  409. subs a4, a4, #1
  410. bne 1b
  411. @ What remains is identical to the first two indentations of
  412. @ the above, but without the increment of z
  413. vadd.f s4, s0, s7 @ t5
  414. vadd.f s5, s6, s3 @ t6
  415. vsub.f s6, s6, s3 @ t4
  416. vsub.f s7, s0, s7 @ t3
  417. vldr d6, [\z0, #8*0] @ s12,s13
  418. vadd.f s0, s16, s21 @ t1
  419. vldr d7, [\z1, #8*o1] @ s14,s15
  420. vsub.f s1, s18, s23 @ t5
  421. vadd.f s8, s4, s12 @ vector + vector
  422. vsub.f s4, s12, s4
  423. vsub.f s5, s13, s5
  424. vsub.f s6, s14, s6
  425. vsub.f s7, s15, s7
  426. vsub.f s2, s17, s20 @ t2
  427. vadd.f s3, s19, s22 @ t6
  428. vstr d4, [\z0, #8*0] @ s8,s9
  429. vstr d5, [\z1, #8*o1] @ s10,s11
  430. vstr d2, [\z2, #8*o2] @ s4,s5
  431. vadd.f s4, s1, s0 @ t5
  432. vstr d3, [\z3, #8*o3] @ s6,s7
  433. vsub.f s7, s1, s0 @ t3
  434. vadd.f s5, s2, s3 @ t6
  435. vsub.f s6, s2, s3 @ t4
  436. vldr d6, [\z0, #8*1] @ s12,s13
  437. vldr d7, [\z1, #8*(o1+1)] @ s14,s15
  438. vadd.f s20, s4, s12 @ vector + vector
  439. vsub.f s4, s12, s4
  440. vsub.f s5, s13, s5
  441. vsub.f s6, s14, s6
  442. vsub.f s7, s15, s7
  443. vstr d10, [\z0, #8*1] @ s20,s21
  444. vstr d11, [\z1, #8*(o1+1)] @ s22,s23
  445. vstr d2, [\z2, #8*(o2+1)] @ s4,s5
  446. vstr d3, [\z3, #8*(o3+1)] @ s6,s7
  447. .endm
  448. .macro def_fft n, n2, n4
  449. function .Lfft\n\()_internal_vfp
  450. .if \n >= 512
  451. push {v1-v6,lr}
  452. .elseif \n >= 256
  453. push {v1-v2,v5-v6,lr}
  454. .else
  455. push {v1,v5-v6,lr}
  456. .endif
  457. mov v1, a1
  458. bl .Lfft\n2\()_internal_vfp
  459. add a1, v1, #8*(\n/4)*2
  460. bl .Lfft\n4\()_internal_vfp
  461. movrelx v5, X(ff_cos_\n), a1
  462. add a1, v1, #8*(\n/4)*3
  463. bl .Lfft\n4\()_internal_vfp
  464. .if \n >= 512
  465. .set o1, 0*(\n/4/2)
  466. .set o2, 0*(\n/4/2)
  467. .set o3, 0*(\n/4/2)
  468. add v2, v1, #8*2*(\n/4/2)
  469. add v3, v1, #8*4*(\n/4/2)
  470. add v4, v1, #8*6*(\n/4/2)
  471. pass (\n/4/2), v1, v2, v3, v4
  472. pop {v1-v6,pc}
  473. .elseif \n >= 256
  474. .set o1, 2*(\n/4/2)
  475. .set o2, 0*(\n/4/2)
  476. .set o3, 2*(\n/4/2)
  477. add v2, v1, #8*4*(\n/4/2)
  478. pass (\n/4/2), v1, v1, v2, v2
  479. pop {v1-v2,v5-v6,pc}
  480. .else
  481. .set o1, 2*(\n/4/2)
  482. .set o2, 4*(\n/4/2)
  483. .set o3, 6*(\n/4/2)
  484. pass (\n/4/2), v1, v1, v1, v1
  485. pop {v1,v5-v6,pc}
  486. .endif
  487. endfunc
  488. function fft\n\()_vfp
  489. ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */
  490. fmrx a2, FPSCR
  491. fmxr FPSCR, a3
  492. vpush {s16-s31}
  493. mov ip, lr
  494. bl .Lfft\n\()_internal_vfp
  495. vpop {s16-s31}
  496. fmxr FPSCR, a2
  497. bx ip
  498. endfunc
  499. .ltorg
  500. .endm
  501. def_fft 32, 16, 8
  502. def_fft 64, 32, 16
  503. def_fft 128, 64, 32
  504. def_fft 256, 128, 64
  505. def_fft 512, 256, 128
  506. def_fft 1024, 512, 256
  507. def_fft 2048, 1024, 512
  508. def_fft 4096, 2048, 1024
  509. def_fft 8192, 4096, 2048
  510. def_fft 16384, 8192, 4096
  511. def_fft 32768, 16384, 8192
  512. def_fft 65536, 32768, 16384