/*
 * FFT transform with Altivec optimizations
 * Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein.
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/*
 * These functions are not individually interchangeable with the C versions.
 * While C takes arrays of FFTComplex, Altivec leaves intermediate results
 * in blocks as convenient to the vector size,
 * i.e. {4x real, 4x imaginary, 4x real, ...}
 *
 * These functions do not follow the standard calling convention.
 * Instead, the following registers are treated as global constants:
 * v14: zero
 * v15..v18: cosines
 * v19..v29: permutations
 * r9: 16
 * r12: ff_cos_tabs
 * and the rest are free for local use.
 */
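/*
 * For example, for n = 8 the C code sees
 *     FFTComplex z[8] = { {r0,i0}, {r1,i1}, ..., {r7,i7} }
 * while these routines keep the same 64 bytes grouped one 4-float vector
 * at a time:
 *     { r0,r1,r2,r3 }, { i0,i1,i2,i3 }, { r4,r5,r6,r7 }, { i4,i5,i6,i7 }
 * The "_interleave" variants restore the FFTComplex ordering before the
 * final store.
 */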
#include "config.h"

#if HAVE_GNU_AS && HAVE_ALTIVEC

#include "asm.S"

.text
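/*
 * addi2 adds a full 32-bit immediate in at most two instructions: addi
 * supplies the low 16 bits (sign-extended), addis the high 16; \imm@ha is
 * pre-biased by 0x8000 so the addis half compensates for that sign
 * extension. An illustrative expansion:
 *     addi2 r3, 0x1234abcd
 * becomes
 *     addi  r3, r3, 0xabcd    // adds 0xffffabcd after sign extension
 *     addis r3, r3, 0x1235    // 0x12350000 + 0xffffabcd = 0x1234abcd
 */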
.macro addi2 ra, imm // add 32-bit immediate
.if \imm & 0xffff
    addi \ra, \ra, \imm@l
.endif
.if (\imm+0x8000)>>16
    addis \ra, \ra, \imm@ha
.endif
.endm
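/*
 * FFT4 is a 4-point FFT of the interleaved complex inputs in \a0/\a1,
 * leaving planar results in \a2 ({r0,r1,r2,r3}) and \a3 ({i0,i1,i2,i3}).
 * FFT4x2 below is two independent copies of the same sequence, interleaved
 * presumably so the two dependency chains can overlap in the pipeline.
 */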
.macro FFT4 a0, a1, a2, a3 // in:0-1 out:2-3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
.endm
.macro FFT4x2 a0, a1, b0, b1, a2, a3, b2, b3
    vperm   \a2,\a0,\a1,v20 // vcprm(0,1,s2,s1) // {r0,i0,r3,i2}
    vperm   \a3,\a0,\a1,v21 // vcprm(2,3,s0,s3) // {r1,i1,r2,i3}
    vperm   \b2,\b0,\b1,v20
    vperm   \b3,\b0,\b1,v21
    vaddfp  \a0,\a2,\a3                         // {t1,t2,t6,t5}
    vsubfp  \a1,\a2,\a3                         // {t3,t4,t8,t7}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vmrghw  \a2,\a0,\a1     // vcprm(0,s0,1,s1) // {t1,t3,t2,t4}
    vperm   \a3,\a0,\a1,v22 // vcprm(2,s3,3,s2) // {t6,t7,t5,t8}
    vmrghw  \b2,\b0,\b1
    vperm   \b3,\b0,\b1,v22
    vaddfp  \a0,\a2,\a3                         // {r0,r1,i0,i1}
    vsubfp  \a1,\a2,\a3                         // {r2,r3,i2,i3}
    vaddfp  \b0,\b2,\b3
    vsubfp  \b1,\b2,\b3
    vperm   \a2,\a0,\a1,v23 // vcprm(0,1,s0,s1) // {r0,r1,r2,r3}
    vperm   \a3,\a0,\a1,v24 // vcprm(2,3,s2,s3) // {i0,i1,i2,i3}
    vperm   \b2,\b0,\b1,v23
    vperm   \b3,\b0,\b1,v24
.endm
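/*
 * FFT8 is an 8-point FFT: \a0/\a1 hold z[0..3] and \b0/\b1 hold z[4..7]
 * on input, and the same four registers hold the planar result on output.
 * The second-half butterflies need the +/-1/sqrt(2) twiddle factors kept
 * in v17/v18, applied as multiply-accumulates against the zero vector v14.
 */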
.macro FFT8 a0, a1, b0, b1, a2, a3, b2, b3, b4 // in,out:a0-b1
    vmrghw   \b2,\b0,\b1     // vcprm(0,s0,1,s1) // {r4,r6,i4,i6}
    vmrglw   \b3,\b0,\b1     // vcprm(2,s2,3,s3) // {r5,r7,i5,i7}
    vperm    \a2,\a0,\a1,v20 // FFT4 ...
    vperm    \a3,\a0,\a1,v21
    vaddfp   \b0,\b2,\b3                         // {t1,t3,t2,t4}
    vsubfp   \b1,\b2,\b3                         // {r5,r7,i5,i7}
    vperm    \b4,\b1,\b1,v25 // vcprm(2,3,0,1)   // {i5,i7,r5,r7}
    vaddfp   \a0,\a2,\a3
    vsubfp   \a1,\a2,\a3
    vmaddfp  \b1,\b1,v17,v14 // * {-1,1,1,-1}/sqrt(2)
    vmaddfp  \b1,\b4,v18,\b1 // * { 1,1,1,1 }/sqrt(2) // {t8,ta,t7,t9}
    vmrghw   \a2,\a0,\a1
    vperm    \a3,\a0,\a1,v22
    vperm    \b2,\b0,\b1,v26 // vcprm(1,2,s3,s0) // {t3,t2,t9,t8}
    vperm    \b3,\b0,\b1,v27 // vcprm(0,3,s2,s1) // {t1,t4,t7,ta}
    vaddfp   \a0,\a2,\a3
    vsubfp   \a1,\a2,\a3
    vaddfp   \b0,\b2,\b3                         // {t1,t2,t9,ta}
    vsubfp   \b1,\b2,\b3                         // {t6,t5,tc,tb}
    vperm    \a2,\a0,\a1,v23
    vperm    \a3,\a0,\a1,v24
    vperm    \b2,\b0,\b1,v28 // vcprm(0,2,s1,s3) // {t1,t9,t5,tb}
    vperm    \b3,\b0,\b1,v29 // vcprm(1,3,s0,s2) // {t2,ta,t6,tc}
    vsubfp   \b0,\a2,\b2                         // {r4,r5,r6,r7}
    vsubfp   \b1,\a3,\b3                         // {i4,i5,i6,i7}
    vaddfp   \a0,\a2,\b2                         // {r0,r1,r2,r3}
    vaddfp   \a1,\a3,\b3                         // {i0,i1,i2,i3}
.endm
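/*
 * BF is a butterfly: \d0 = \s0 + \s1, \d1 = \s0 - \s1. The subtract is
 * emitted first so that \d0 may alias one of the sources, as the callers
 * below rely on. zip re-interleaves a planar {reals},{imags} pair back
 * into {r,i,r,i} order via the merge-high/merge-low instructions.
 */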
.macro BF d0,d1,s0,s1
    vsubfp  \d1,\s0,\s1
    vaddfp  \d0,\s0,\s1
.endm

.macro zip d0,d1,s0,s1
    vmrghw  \d0,\s0,\s1
    vmrglw  \d1,\s0,\s1
.endm
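/*
 * Leaf transforms. Each def_fft* emits one entry point taking the data
 * pointer in r3 (r9 holds the constant 16, per the note at the top).
 * When \interleave is non-blank, the planar result is zipped back into
 * FFTComplex order before being stored.
 */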
.macro def_fft4 interleave
fft4\interleave\()_altivec:
    lvx   v0, 0,r3
    lvx   v1,r9,r3
    FFT4  v0,v1,v2,v3
.ifnb \interleave
    zip   v0,v1,v2,v3
    stvx  v0, 0,r3
    stvx  v1,r9,r3
.else
    stvx  v2, 0,r3
    stvx  v3,r9,r3
.endif
    blr
.endm

.macro def_fft8 interleave
fft8\interleave\()_altivec:
    addi  r4,r3,32
    lvx   v0, 0,r3
    lvx   v1,r9,r3
    lvx   v2, 0,r4
    lvx   v3,r9,r4
    FFT8  v0,v1,v2,v3,v4,v5,v6,v7,v8
.ifnb \interleave
    zip   v4,v5,v0,v1
    zip   v6,v7,v2,v3
    stvx  v4, 0,r3
    stvx  v5,r9,r3
    stvx  v6, 0,r4
    stvx  v7,r9,r4
.else
    stvx  v0, 0,r3
    stvx  v1,r9,r3
    stvx  v2, 0,r4
    stvx  v3,r9,r4
.endif
    blr
.endm
.macro def_fft16 interleave
fft16\interleave\()_altivec:
    addi      r5,r3,64
    addi      r6,r3,96
    addi      r4,r3,32
    lvx       v0, 0,r5
    lvx       v1,r9,r5
    lvx       v2, 0,r6
    lvx       v3,r9,r6
    FFT4x2    v0,v1,v2,v3,v4,v5,v6,v7
    lvx       v0, 0,r3
    lvx       v1,r9,r3
    lvx       v2, 0,r4
    lvx       v3,r9,r4
    FFT8      v0,v1,v2,v3,v8,v9,v10,v11,v12
    vmaddfp   v8,v4,v15,v14   // r2*wre
    vmaddfp   v9,v5,v15,v14   // i2*wre
    vmaddfp   v10,v6,v15,v14  // r3*wre
    vmaddfp   v11,v7,v15,v14  // i3*wre
    vmaddfp   v8,v5,v16,v8    // i2*wim
    vnmsubfp  v9,v4,v16,v9    // r2*wim
    vnmsubfp  v10,v7,v16,v10  // i3*wim
    vmaddfp   v11,v6,v16,v11  // r3*wim
    BF        v10,v12,v10,v8
    BF        v11,v13,v9,v11
    BF        v0,v4,v0,v10
    BF        v3,v7,v3,v12
    BF        v1,v5,v1,v11
    BF        v2,v6,v2,v13
.ifnb \interleave
    zip       v8, v9,v0,v1
    zip       v10,v11,v2,v3
    zip       v12,v13,v4,v5
    zip       v14,v15,v6,v7
    stvx      v8, 0,r3
    stvx      v9,r9,r3
    stvx      v10, 0,r4
    stvx      v11,r9,r4
    stvx      v12, 0,r5
    stvx      v13,r9,r5
    stvx      v14, 0,r6
    stvx      v15,r9,r6
.else
    stvx      v0, 0,r3
    stvx      v4, 0,r5
    stvx      v3,r9,r4
    stvx      v7,r9,r6
    stvx      v1,r9,r3
    stvx      v5,r9,r5
    stvx      v2, 0,r4
    stvx      v6, 0,r6
.endif
    blr
.endm
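/*
 * The generic combining pass of the split-radix recursion: it merges one
 * n/2 transform at z with two n/4 transforms at z+o2 and z+o3, handling
 * four complex values per quarter per loop iteration. On entry r3 = z,
 * r4 = wre (the cosine table) and r5 = the iteration count (n/16, set by
 * DECL_FFT below); wim is read backwards from the other end of the table.
 */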
// void pass(float *z, float *wre, int n)
.macro PASS interleave, suffix
fft_pass\suffix\()_altivec:
    mtctr     r5
    slwi      r0,r5,4
    slwi      r7,r5,6       // o2
    slwi      r5,r5,5       // o1
    add       r10,r5,r7     // o3
    add       r0,r4,r0      // wim
    addi      r6,r5,16      // o1+16
    addi      r8,r7,16      // o2+16
    addi      r11,r10,16    // o3+16
1:
    lvx       v8, 0,r4      // wre
    lvx       v10, 0,r0     // wim
    sub       r0,r0,r9
    lvx       v9, 0,r0
    vperm     v9,v9,v10,v19 // vcprm(s0,3,2,1) => wim[0 .. -3]
    lvx       v4,r3,r7      // r2 = z[o2]
    lvx       v5,r3,r8      // i2 = z[o2+16]
    lvx       v6,r3,r10     // r3 = z[o3]
    lvx       v7,r3,r11     // i3 = z[o3+16]
    vmaddfp   v10,v4,v8,v14 // r2*wre
    vmaddfp   v11,v5,v8,v14 // i2*wre
    vmaddfp   v12,v6,v8,v14 // r3*wre
    vmaddfp   v13,v7,v8,v14 // i3*wre
    lvx       v0, 0,r3      // r0 = z[0]
    lvx       v3,r3,r6      // i1 = z[o1+16]
    vmaddfp   v10,v5,v9,v10 // i2*wim
    vnmsubfp  v11,v4,v9,v11 // r2*wim
    vnmsubfp  v12,v7,v9,v12 // i3*wim
    vmaddfp   v13,v6,v9,v13 // r3*wim
    lvx       v1,r3,r9      // i0 = z[16]
    lvx       v2,r3,r5      // r1 = z[o1]
    BF        v12,v8,v12,v10
    BF        v13,v9,v11,v13
    BF        v0,v4,v0,v12
    BF        v3,v7,v3,v8
.if !\interleave
    stvx      v0, 0,r3
    stvx      v4,r3,r7
    stvx      v3,r3,r6
    stvx      v7,r3,r11
.endif
    BF        v1,v5,v1,v13
    BF        v2,v6,v2,v9
.if !\interleave
    stvx      v1,r3,r9
    stvx      v2,r3,r5
    stvx      v5,r3,r8
    stvx      v6,r3,r10
.else
    vmrghw    v8,v0,v1
    vmrglw    v9,v0,v1
    stvx      v8, 0,r3
    stvx      v9,r3,r9
    vmrghw    v8,v2,v3
    vmrglw    v9,v2,v3
    stvx      v8,r3,r5
    stvx      v9,r3,r6
    vmrghw    v8,v4,v5
    vmrglw    v9,v4,v5
    stvx      v8,r3,r7
    stvx      v9,r3,r8
    vmrghw    v8,v6,v7
    vmrglw    v9,v6,v7
    stvx      v8,r3,r10
    stvx      v9,r3,r11
.endif
    addi      r3,r3,32
    addi      r4,r4,16
    bdnz 1b
    sub       r3,r3,r5
    blr
.endm
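/*
 * vcprm(a,b,c,d) assembles a 16-byte vperm control vector that selects
 * 32-bit words: 0-3 name words of the first source operand, s0-s3 words
 * of the second. For example, a control vector built by vcprm(0,1,s2,s1)
 * makes "vperm d,a,b,ctrl" produce d = { a[0], a[1], b[2], b[1] }.
 */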
#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */

#define WORD_0 0x00,0x01,0x02,0x03
#define WORD_1 0x04,0x05,0x06,0x07
#define WORD_2 0x08,0x09,0x0a,0x0b
#define WORD_3 0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

#define vcprm(a, b, c, d) .byte WORD_##a, WORD_##b, WORD_##c, WORD_##d
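/*
 * The sixteen vectors below are loaded into v14..v29 by fft_calc, in this
 * order: the zero vector (v14), the wre/wim twiddles used by fft16
 * (v15/v16), the signed and unsigned 1/sqrt(2) constants used by FFT8
 * (v17/v18), and the eleven permutation masks (v19..v29) referenced
 * throughout the macros above.
 */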
.rodata
.align 4
fft_data:
    .float 0, 0, 0, 0
    .float 1, 0.92387953, M_SQRT1_2, 0.38268343
    .float 0, 0.38268343, M_SQRT1_2, 0.92387953
    .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2,-M_SQRT1_2
    .float  M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
    vcprm(s0,3,2,1)
    vcprm(0,1,s2,s1)
    vcprm(2,3,s0,s3)
    vcprm(2,s3,3,s2)
    vcprm(0,1,s0,s1)
    vcprm(2,3,s2,s3)
    vcprm(2,3,0,1)
    vcprm(1,2,s3,s0)
    vcprm(0,3,s2,s1)
    vcprm(0,2,s1,s3)
    vcprm(1,3,s0,s2)
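/*
 * lvm/stvm load or store a run of vector registers from/to consecutive
 * 16-byte slots at \b, recursing over the argument list and advancing \b
 * as they go; \b is left pointing past the last slot.
 */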
.macro lvm b, r, regs:vararg
    lvx     \r, 0, \b
    addi    \b, \b, 16
.ifnb \regs
    lvm     \b, \regs
.endif
.endm

.macro stvm b, r, regs:vararg
    stvx    \r, 0, \b
    addi    \b, \b, 16
.ifnb \regs
    stvm    \b, \regs
.endif
.endm
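/*
 * The public entry point, ff_fft_calc(_interleave)_altivec(FFTContext *s,
 * FFTComplex *z). It saves the nonvolatile vector registers it clobbers
 * (v20-v29) plus VRSAVE, loads the global constants described at the top
 * of the file, then indirect-jumps through the dispatch table using the
 * size exponent read from the first word of the context (nbits in Libav's
 * FFTContext; fft4, i.e. nbits == 2, is entry 0).
 */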
.macro fft_calc interleave
extfunc ff_fft_calc\interleave\()_altivec
    mflr     r0
    stp      r0, 2*PS(r1)
    stpu     r1, -(160+16*PS)(r1)
    get_got  r11
    addi     r6, r1, 16*PS
    stvm     r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    mfvrsave r0
    stw      r0, 15*PS(r1)
    li       r6, 0xfffffffc
    mtvrsave r6
    movrel   r6, fft_data, r11
    lvm      r6, v14, v15, v16, v17, v18, v19, v20, v21
    lvm      r6, v22, v23, v24, v25, v26, v27, v28, v29
    li       r9, 16
    movrel   r12, X(ff_cos_tabs), r11
    movrel   r6, fft_dispatch_tab\interleave\()_altivec, r11
    lwz      r3, 0(r3)
    subi     r3, r3, 2
    slwi     r3, r3, 2+ARCH_PPC64
    lpx      r3, r3, r6
    mtctr    r3
    mr       r3, r4
    bctrl
    addi     r6, r1, 16*PS
    lvm      r6, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
    lwz      r6, 15*PS(r1)
    mtvrsave r6
    lp       r1, 0(r1)
    lp       r0, 2*PS(r1)
    mtlr     r0
    blr
.endm
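/*
 * DECL_FFT builds each power-of-two size out of smaller ones, following
 * the split-radix recursion: fft(n) = fft(n/2) on z, then fft(n/4) on
 * z+n/2 and on z+3n/4 (in complex elements; the addi2 offsets are bytes),
 * finished by one combining pass with the twiddles from ff_cos_tabs[\bits].
 * Each recursion depth gets its own LR save slot in the frame set up by
 * fft_calc, and the pass is reached with a tail call.
 */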
.macro DECL_FFT suffix, bits, n, n2, n4
fft\n\suffix\()_altivec:
    mflr  r0
    stp   r0,PS*(\bits-3)(r1)
    bl    fft\n2\()_altivec
    addi2 r3,\n*4
    bl    fft\n4\()_altivec
    addi2 r3,\n*2
    bl    fft\n4\()_altivec
    addi2 r3,\n*-6
    lp    r0,PS*(\bits-3)(r1)
    lp    r4,\bits*PS(r12)
    mtlr  r0
    li    r5,\n/16
    b     fft_pass\suffix\()_altivec
.endm
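/*
 * DECL_FFTS instantiates the whole family for one output convention and
 * emits the matching dispatch table, indexed by nbits-2 (so entry 0 is
 * fft4). It is invoked twice at the bottom of the file: once plain and
 * once with the _interleave suffix.
 */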
.macro DECL_FFTS interleave, suffix
    .text
    def_fft4  \suffix
    def_fft8  \suffix
    def_fft16 \suffix
    PASS \interleave, \suffix
    DECL_FFT \suffix, 5,   32,   16,    8
    DECL_FFT \suffix, 6,   64,   32,   16
    DECL_FFT \suffix, 7,  128,   64,   32
    DECL_FFT \suffix, 8,  256,  128,   64
    DECL_FFT \suffix, 9,  512,  256,  128
    DECL_FFT \suffix,10, 1024,  512,  256
    DECL_FFT \suffix,11, 2048, 1024,  512
    DECL_FFT \suffix,12, 4096, 2048, 1024
    DECL_FFT \suffix,13, 8192, 4096, 2048
    DECL_FFT \suffix,14,16384, 8192, 4096
    DECL_FFT \suffix,15,32768,16384, 8192
    DECL_FFT \suffix,16,65536,32768,16384
    fft_calc \suffix
    .rodata
    .align 3
fft_dispatch_tab\suffix\()_altivec:
    PTR fft4\suffix\()_altivec
    PTR fft8\suffix\()_altivec
    PTR fft16\suffix\()_altivec
    PTR fft32\suffix\()_altivec
    PTR fft64\suffix\()_altivec
    PTR fft128\suffix\()_altivec
    PTR fft256\suffix\()_altivec
    PTR fft512\suffix\()_altivec
    PTR fft1024\suffix\()_altivec
    PTR fft2048\suffix\()_altivec
    PTR fft4096\suffix\()_altivec
    PTR fft8192\suffix\()_altivec
    PTR fft16384\suffix\()_altivec
    PTR fft32768\suffix\()_altivec
    PTR fft65536\suffix\()_altivec
.endm
DECL_FFTS 0
DECL_FFTS 1, _interleave

#endif /* HAVE_GNU_AS && HAVE_ALTIVEC */