You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1088 lines
25KB

  1. ;******************************************************************************
  2. ;* FFT transform with SSE/3DNow optimizations
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2011 Vitor Sessak
  5. ;*
  6. ;* This algorithm (though not any of the implementation details) is
  7. ;* based on libdjbfft by D. J. Bernstein.
  8. ;*
  9. ;* This file is part of Libav.
  10. ;*
  11. ;* Libav is free software; you can redistribute it and/or
  12. ;* modify it under the terms of the GNU Lesser General Public
  13. ;* License as published by the Free Software Foundation; either
  14. ;* version 2.1 of the License, or (at your option) any later version.
  15. ;*
  16. ;* Libav is distributed in the hope that it will be useful,
  17. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  19. ;* Lesser General Public License for more details.
  20. ;*
  21. ;* You should have received a copy of the GNU Lesser General Public
  22. ;* License along with Libav; if not, write to the Free Software
  23. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24. ;******************************************************************************
  25. ; These functions are not individually interchangeable with the C versions.
  26. ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
  27. ; in blocks as convenient to the vector size.
  28. ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
  29. %include "libavutil/x86/x86util.asm"
  ; Storage directive used for the pointer-sized fields of the struc below:
  ; one qword when ARCH_X86_64 is set, one dword otherwise.
  30. %if ARCH_X86_64
  31. %define pointer resq
  32. %else
  33. %define pointer resd
  34. %endif
  ; Field offsets of the FFT context, referenced below as FFTContext.<field>.
  ; NOTE(review): presumably this has to match the layout of the C-side
  ; context structure, which is not visible in this file -- confirm.
  35. struc FFTContext
  36. .nbits: resd 1 ; read as dispatch-table index and as shift count (1 << nbits)
  37. .reverse: resd 1 ; not referenced in this file
  38. .revtab: pointer 1 ; table read as 16-bit words (movzx ..., word [...])
  39. .tmpbuf: pointer 1 ; scratch destination of the fft_permute scatter
  40. .mdctsize: resd 1 ; added as a byte offset to the imdct buffers
  41. .mdctbits: resd 1 ; not referenced in this file
  42. .tcos: pointer 1 ; first twiddle table used by imdct_half
  43. .tsin: pointer 1 ; second twiddle table used by imdct_half
  44. .fftperm: pointer 1 ; not referenced in this file
  45. .fftcalc: pointer 1 ; not referenced in this file
  46. .imdctcalc:pointer 1 ; not referenced in this file
  47. .imdcthalf:pointer 1 ; function pointer called by imdct_calc
  48. endstruc
  ; Read-only constant tables.
  ; Entries written as 1<<31 are IEEE sign-bit masks: the code applies them
  ; with xorps/vxorps/pxor, which negates the matching float lanes.
  49. SECTION_RODATA
  ; cos(pi/4), cos(pi/8) and cos(3*pi/8) as assembly-time float literals.
  50. %define M_SQRT1_2 0.70710678118654752440
  51. %define M_COS_PI_1_8 0.923879532511287
  52. %define M_COS_PI_3_8 0.38268343236509
  ; 32-byte alignment: the 8-dword tables are loaded whole as ymm vectors.
  53. align 32
  ; Factor pair multiplied into the upper half of the data in fft16_avx.
  54. ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
  55. ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
  ; sqrt(1/2) broadcast, and the same magnitude with signs -,+,+,- (T8_SSE).
  56. ps_root2: times 8 dd M_SQRT1_2
  57. ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
  ; Sign mask used by T4_SSE: negates lane 2 of every group of four.
  58. ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
  ; Lane-index vectors consumed by vpermilps in T8_AVX.
  59. perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
  60. perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
  ; Multiplier (T8_AVX): signs +,+,-,+ in the low lane, sqrt(1/2) in the high lane.
  61. ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
  ; Sign mask (T8_AVX): negates lanes 0,1,3,5,6,7.
  62. ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
  ; Sign mask negating all four lanes (imdct_calc; imdct_half loads its low dword).
  63. ps_m1m1m1m1: times 4 dd 1<<31
  ; Sign mask for one 3DNow! register: negates the low float only.
  64. ps_m1p1: dd 1<<31, 0
  ; Declare the external tables cos_16, cos_32, ..., cos_65536:
  ; i starts at 16 and doubles on each of the 13 repetitions.
  65. %assign i 16
  66. %rep 13
  67. cextern cos_ %+ i
  68. %assign i i<<1
  69. %endrep
  ; From here on `pointer` emits one initialized pointer-sized datum (dq/dd)
  ; instead of reserving struc space; it is used for the dispatch_tab entries.
  70. %if ARCH_X86_64
  71. %define pointer dq
  72. %else
  73. %define pointer dd
  74. %endif
  ; Conditional-emit helpers, invoked as "IF%1 <instruction>":
  ; IF0 discards the rest of the line, IF1 emits it unchanged.
  75. %macro IF0 1+
  76. %endmacro
  77. %macro IF1 1+
  78. %1
  79. %endmacro
  80. SECTION .text
  ; T2_3DNOW z0, z1, mem0, mem1
  ; Packed-float butterfly: z0 = mem0 + mem1, z1 = mem0 - mem1.
  ; mem0 is read once, mem1 twice.
  81. %macro T2_3DNOW 4 ; z0, z1, mem0, mem1
  82. mova %1, %3 ; z0 = mem0
  83. mova %2, %1 ; z1 = copy of mem0
  84. pfadd %1, %4 ; z0 = mem0 + mem1
  85. pfsub %2, %4 ; z1 = mem0 - mem1
  86. %endmacro
  ; T4_3DNOW z0, z1, z2, z3, tmp0, tmp1
  ; Combines four packed pairs into the values annotated {r0,i0}..{r3,i3}.
  ; On exit: %1 = {r0,i0}, %2 = {r1,i1}, %4 = {r3,i3}, and the name passed as
  ; z2 refers to {r2,i2} (see SWAP at the end).
  ; Side effect: the dword at [r0+12] is overwritten; it serves as a bounce
  ; slot for swapping the two halves of %3.
  87. %macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
  88. mova %5, %3
  89. pfsub %3, %4
  90. pfadd %5, %4 ; {t6,t5}
  91. pxor %3, [ps_m1p1] ; {t8,t7}
  92. mova %6, %1
  93. movd [r0+12], %3 ; spill the low dword of %3 ...
  94. punpckhdq %3, [r0+8] ; ... and read it back as the high half: %3 = {old high, old low}
  95. pfadd %1, %5 ; {r0,i0}
  96. pfsub %6, %5 ; {r2,i2}
  97. mova %4, %2
  98. pfadd %2, %3 ; {r1,i1}
  99. pfsub %4, %3 ; {r3,i3}
  ; SWAP is an assembly-time register renaming (defined outside this file):
  ; afterwards the z2 name denotes the register holding {r2,i2}.
  100. SWAP %3, %6
  101. %endmacro
  ; T8_AVX: one 8-point transform held entirely in two ymm registers.
  102. ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
  103. ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
  104. ; %3, %4, %5 tmp
  105. ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
  106. ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
  ; Reads the constants ps_p1p1m1p1root2, perm1, perm2 and ps_m1m1p1m1p1m1m1m1.
  107. %macro T8_AVX 5
  108. vsubps %5, %1, %2 ; v = %1 - %2
  109. vaddps %3, %1, %2 ; w = %1 + %2
  110. vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
  111. vpermilps %2, %2, [perm1]
  112. vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
  113. vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
  114. vsubps %4, %5, %1 ; s = r - q
  115. vaddps %1, %5, %1 ; u = r + q
  116. vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
  117. vshufps %5, %4, %1, 0xbb
  118. vshufps %3, %4, %1, 0xee
  119. vperm2f128 %3, %3, %5, 0x13
  120. vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {-1,-1,1,-1,1,-1,-1,-1} (sign-bit XOR)
  121. vshufps %2, %1, %4, 0xdd
  122. vshufps %1, %1, %4, 0x88
  123. vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
  124. vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
  125. vsubps %5, %1, %3
  126. vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
  127. vsubps %2, %4, %1 ; %2 = v - w
  128. vaddps %1, %4, %1 ; %1 = v + w
  129. %endmacro
  130. ; In SSE mode do one fft4 transform
  131. ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
  132. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
  133. ;
  134. ; In AVX mode do two fft4 transforms
  135. ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
  136. ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
  ; %3 is a scratch register. The instructions are written in three-operand
  ; form for both modes.
  ; NOTE(review): the SSE expansion of the three-operand forms presumably
  ; comes from the included x86util/x86inc layer -- not visible here.
  137. %macro T4_SSE 3
  138. subps %3, %1, %2 ; {t3,t4,-t8,t7}
  139. addps %1, %1, %2 ; {t1,t2,t6,t5}
  140. xorps %3, %3, [ps_p1p1m1p1] ; flip the sign of lane 2: -t8 -> t8
  141. shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
  142. shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
  143. subps %3, %1, %2 ; {r2,i2,r3,i3}
  144. addps %1, %1, %2 ; {r0,i0,r1,i1}
  145. shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
  146. shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
  147. %endmacro
  148. ; In SSE mode do one FFT8
  149. ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
  150. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
  151. ;
  152. ; In AVX mode do two FFT8
  153. ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
  154. ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
  155. ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
  156. ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
  ; %5 and %6 are scratch registers.
  157. %macro T8_SSE 6
  158. addps %6, %3, %4 ; {t1,t2,t3,t4}
  159. subps %3, %3, %4 ; {r5,i5,r7,i7}
  160. shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
  161. mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
  162. mulps %4, %4, [ps_root2]
  163. addps %3, %3, %4 ; {t8,t7,ta,t9}
  164. shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
  165. shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
  166. subps %3, %6, %4 ; {t6,t5,tc,tb}
  167. addps %6, %6, %4 ; {t1,t2,t9,ta}
  168. shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
  169. shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
  170. subps %3, %1, %6 ; {r4,r5,r6,r7}
  171. addps %1, %1, %6 ; {r0,r1,r2,r3}
  172. subps %4, %2, %5 ; {i4,i5,i6,i7}
  173. addps %2, %2, %5 ; {i0,i1,i2,i3}
  174. %endmacro
  ; PASS_SMALL load, wre, wim
  ; One combining step over the eight vectors Z(0)..Z(5), Z2(6), Z2(7).
  ;   %1 = 1: m4-m7 are loaded from Z(4), Z(5), Z2(6), Z2(7)
  ;   %1 = 0: the caller has already placed those four vectors in m4-m7
  ;   %2, %3 = memory operands holding the wre / wim factor vectors
  ; All eight outputs are written back to the Z slots; m0-m7 are clobbered.
  175. ; scheduled for cpu-bound sizes
  176. %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
  177. IF%1 mova m4, Z(4)
  178. IF%1 mova m5, Z(5)
  179. mova m0, %2 ; wre
  180. mova m1, %3 ; wim
  181. mulps m2, m4, m0 ; r2*wre
  182. IF%1 mova m6, Z2(6)
  183. mulps m3, m5, m1 ; i2*wim
  184. IF%1 mova m7, Z2(7)
  185. mulps m4, m4, m1 ; r2*wim
  186. mulps m5, m5, m0 ; i2*wre
  187. addps m2, m2, m3 ; r2*wre + i2*wim
  188. mulps m3, m1, m7 ; i3*wim
  189. subps m5, m5, m4 ; i2*wre - r2*wim
  190. mulps m1, m1, m6 ; r3*wim
  191. mulps m4, m0, m6 ; r3*wre
  192. mulps m0, m0, m7 ; i3*wre
  193. subps m4, m4, m3 ; r3*wre - i3*wim
  194. mova m3, Z(0)
  195. addps m0, m0, m1 ; i3*wre + r3*wim
  196. subps m1, m4, m2 ; t3
  197. addps m4, m4, m2 ; t5
  198. subps m3, m3, m4 ; r2
  199. addps m4, m4, Z(0) ; r0
  200. mova m6, Z(2)
  201. mova Z(4), m3
  202. mova Z(0), m4
  203. subps m3, m5, m0 ; t4
  204. subps m4, m6, m3 ; r3
  205. addps m3, m3, m6 ; r1
  206. mova Z2(6), m4
  207. mova Z(2), m3
  208. mova m2, Z(3)
  209. addps m3, m5, m0 ; t6
  210. subps m2, m2, m1 ; i3
  211. mova m7, Z(1)
  212. addps m1, m1, Z(3) ; i1
  213. mova Z2(7), m2
  214. mova Z(3), m1
  215. subps m4, m7, m3 ; i2
  216. addps m3, m3, m7 ; i0
  217. mova Z(5), m4
  218. mova Z(1), m3
  219. %endmacro
  ; PASS_BIG store_planar
  ; Same arithmetic as PASS_SMALL, with the factors always read from [wq] and
  ; [wq+o1q] and with a different instruction order.
  ;   %1 = 1: r1/r3, i1/i3, i0/i2 are stored directly (r0 and r2 always are)
  ;   %1 = 0: those stores are skipped and every real/imaginary pair is written
  ;           through INTERL instead, which must be %defined by then
  ; Expects the zcq/wq/o1q/o3q names set up by DECL_PASS; clobbers m0-m7.
  220. ; scheduled to avoid store->load aliasing
  221. %macro PASS_BIG 1 ; (!interleave)
  222. mova m4, Z(4) ; r2
  223. mova m5, Z(5) ; i2
  224. mova m0, [wq] ; wre
  225. mova m1, [wq+o1q] ; wim
  226. mulps m2, m4, m0 ; r2*wre
  227. mova m6, Z2(6) ; r3
  228. mulps m3, m5, m1 ; i2*wim
  229. mova m7, Z2(7) ; i3
  230. mulps m4, m4, m1 ; r2*wim
  231. mulps m5, m5, m0 ; i2*wre
  232. addps m2, m2, m3 ; r2*wre + i2*wim
  233. mulps m3, m1, m7 ; i3*wim
  234. mulps m1, m1, m6 ; r3*wim
  235. subps m5, m5, m4 ; i2*wre - r2*wim
  236. mulps m4, m0, m6 ; r3*wre
  237. mulps m0, m0, m7 ; i3*wre
  238. subps m4, m4, m3 ; r3*wre - i3*wim
  239. mova m3, Z(0)
  240. addps m0, m0, m1 ; i3*wre + r3*wim
  241. subps m1, m4, m2 ; t3
  242. addps m4, m4, m2 ; t5
  243. subps m3, m3, m4 ; r2
  244. addps m4, m4, Z(0) ; r0
  245. mova m6, Z(2)
  246. mova Z(4), m3
  247. mova Z(0), m4
  248. subps m3, m5, m0 ; t4
  249. subps m4, m6, m3 ; r3
  250. addps m3, m3, m6 ; r1
  251. IF%1 mova Z2(6), m4
  252. IF%1 mova Z(2), m3
  253. mova m2, Z(3)
  254. addps m5, m5, m0 ; t6
  255. subps m2, m2, m1 ; i3
  256. mova m7, Z(1)
  257. addps m1, m1, Z(3) ; i1
  258. IF%1 mova Z2(7), m2
  259. IF%1 mova Z(3), m1
  260. subps m6, m7, m5 ; i2
  261. addps m5, m5, m7 ; i0
  262. IF%1 mova Z(5), m6
  263. IF%1 mova Z(1), m5
  ; Interleaving tail. At this point m3=r1, m1=i1, m4=r3, m2=i3, m5=i0, m6=i2;
  ; r0 and r2 were already stored, so they are reloaded from Z(0) / Z(4).
  264. %if %1==0
  265. INTERL m1, m3, m7, Z, 2
  266. INTERL m2, m4, m0, Z2, 6
  267. mova m1, Z(0)
  268. mova m2, Z(4)
  269. INTERL m5, m1, m3, Z, 0
  270. INTERL m6, m2, m7, Z, 4
  271. %endif
  272. %endmacro
  ; PUNPCK a, b, tmp
  ; Dword interleave of a with b: a = low dwords {a.lo, b.lo},
  ; tmp = high dwords {a.hi, b.hi}. b may be a register or a memory operand.
  273. %macro PUNPCK 3
  274. mova %3, %1
  275. punpckldq %1, %2
  276. punpckhdq %3, %2
  277. %endmacro
  ; Addressing helpers for the fixed-size transforms below: vector slot x of
  ; the contiguous buffer at r0. Z and Z2 are identical here; ZH(x) is the
  ; upper half of slot x.
  278. %define Z(x) [r0+mmsize*x]
  279. %define Z2(x) [r0+mmsize*x]
  280. %define ZH(x) [r0+mmsize*x+mmsize/2]
  ; fft8_avx: in-place 8-point transform on the 64 bytes at r0.
  ; In: r0 = data pointer. Clobbers m0-m4. Data layout as described at T8_AVX.
  281. INIT_YMM avx
  282. align 16
  283. fft8_avx:
  284. mova m0, Z(0)
  285. mova m1, Z(1)
  286. T8_AVX m0, m1, m2, m3, m4
  287. mova Z(0), m0
  288. mova Z(1), m1
  289. ret
  ; fft16_avx: in-place 16-point transform on the 128 bytes at r0.
  ; In: r0 = data pointer. Clobbers m0-m7.
  290. align 16
  291. fft16_avx:
  292. mova m2, Z(2)
  293. mova m3, Z(3)
  294. T4_SSE m2, m3, m7 ; two fft4 on the upper 64 bytes (ymm mode)
  295. mova m0, Z(0)
  296. mova m1, Z(1)
  297. T8_AVX m0, m1, m4, m5, m7 ; fft8 on the lower 64 bytes
  298. mova m4, [ps_cos16_1]
  299. mova m5, [ps_cos16_2]
  300. vmulps m6, m2, m4
  301. vmulps m7, m3, m5
  302. vaddps m7, m7, m6 ; m7 = m2*cos16_1 + m3*cos16_2
  303. vmulps m2, m2, m5
  304. vmulps m3, m3, m4
  305. vsubps m3, m3, m2 ; m3 = m3*cos16_1 - m2*cos16_2
  306. vblendps m2, m7, m3, 0xf0 ; low lane from m7, high lane from m3
  307. vperm2f128 m3, m7, m3, 0x21 ; low lane = m7 high, high lane = m3 low
  308. vaddps m4, m2, m3
  309. vsubps m2, m3, m2
  310. vperm2f128 m2, m2, m2, 0x01 ; swap the two 128-bit lanes
  311. vsubps m3, m1, m2
  312. vaddps m1, m1, m2
  313. vsubps m5, m0, m4
  314. vaddps m0, m0, m4
  ; Store the results as 128-bit pieces; memory order is
  ; m0.lo, m1.lo, m0.hi, m1.hi, m5.lo, m3.lo, m5.hi, m3.hi.
  315. vextractf128 Z(0), m0, 0
  316. vextractf128 ZH(0), m1, 0
  317. vextractf128 Z(1), m0, 1
  318. vextractf128 ZH(1), m1, 1
  319. vextractf128 Z(2), m5, 0
  320. vextractf128 ZH(2), m3, 0
  321. vextractf128 Z(3), m5, 1
  322. vextractf128 ZH(3), m3, 1
  323. ret
  ; fft32_avx: in-place 32-point transform on the 256 bytes at r0.
  ; Runs fft16_avx on the first 128 bytes, transforms the remaining 128 bytes
  ; in registers, then merges everything with PASS_SMALL and the cos_32 table.
  ; In: r0 = data pointer. Clobbers m0-m7.
  324. align 16
  325. fft32_avx:
  326. call fft16_avx
  327. mova m0, Z(4)
  328. mova m1, Z(5)
  329. T4_SSE m0, m1, m4
  330. mova m2, Z(6)
  331. mova m3, Z(7)
  332. T8_SSE m0, m1, m2, m3, m4, m6
  333. ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
  334. ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
  ; Regroup by 128-bit lane: 0x20 pairs the low lanes of both sources,
  ; 0x31 pairs their high lanes.
  335. vperm2f128 m4, m0, m2, 0x20
  336. vperm2f128 m5, m1, m3, 0x20
  337. vperm2f128 m6, m0, m2, 0x31
  338. vperm2f128 m7, m1, m3, 0x31
  ; First argument 0: m4-m7 are already loaded, so PASS_SMALL skips its loads.
  339. PASS_SMALL 0, [cos_32], [cos_32+32]
  340. ret
  ; fft32_interleave_avx: fft32_avx followed by an in-place interleave of each
  ; pair of 32-byte vectors (unpack low/high of Z(0) with Z(1)).
  ; In: r0 = data pointer. r0 is NOT restored: it is advanced by 2*mmsize per
  ; iteration. r2d counts 32 down in steps of mmsize/4.
  ; Clobbers r2 and m0-m3 in addition to what fft32_avx clobbers.
  341. fft32_interleave_avx:
  342. call fft32_avx
  343. mov r2d, 32
  344. .deint_loop:
  345. mova m2, Z(0)
  346. mova m3, Z(1)
  347. vunpcklps m0, m2, m3
  348. vunpckhps m1, m2, m3
  349. vextractf128 Z(0), m0, 0
  350. vextractf128 ZH(0), m1, 0
  351. vextractf128 Z(1), m0, 1
  352. vextractf128 ZH(1), m1, 1
  353. add r0, mmsize*2
  354. sub r2d, mmsize/4
  355. jg .deint_loop
  356. ret
  ; fft4_sse: in-place 4-point transform on the 32 bytes at r0 (see T4_SSE for
  ; the data layout). fft4_avx is a second label on the same xmm code, which
  ; lets the AVX dispatch tables reference an fft4 entry.
  ; In: r0 = data pointer. Clobbers m0-m2.
  357. INIT_XMM sse
  358. align 16
  359. fft4_avx:
  360. fft4_sse:
  361. mova m0, Z(0)
  362. mova m1, Z(1)
  363. T4_SSE m0, m1, m2
  364. mova Z(0), m0
  365. mova Z(1), m1
  366. ret
  ; fft8_sse: in-place 8-point transform on the 64 bytes at r0:
  ; an fft4 on Z(0)/Z(1), then T8_SSE merges it with Z(2)/Z(3).
  ; In: r0 = data pointer. Clobbers m0-m5.
  367. align 16
  368. fft8_sse:
  369. mova m0, Z(0)
  370. mova m1, Z(1)
  371. T4_SSE m0, m1, m2
  372. mova m2, Z(2)
  373. mova m3, Z(3)
  374. T8_SSE m0, m1, m2, m3, m4, m5
  375. mova Z(0), m0
  376. mova Z(1), m1
  377. mova Z(2), m2
  378. mova Z(3), m3
  379. ret
  ; fft16_sse: in-place 16-point transform on the 128 bytes at r0.
  ; An fft8 on Z(0)..Z(3) is stored back, two fft4 are computed on Z(4)/Z(5)
  ; and Z2(6)/Z2(7) and kept in m4-m7, then PASS_SMALL merges them using the
  ; cos_16 table.
  ; In: r0 = data pointer. Clobbers m0-m7.
  380. align 16
  381. fft16_sse:
  382. mova m0, Z(0)
  383. mova m1, Z(1)
  384. T4_SSE m0, m1, m2
  385. mova m2, Z(2)
  386. mova m3, Z(3)
  387. T8_SSE m0, m1, m2, m3, m4, m5
  388. mova m4, Z(4)
  389. mova m5, Z(5)
  390. mova Z(0), m0
  391. mova Z(1), m1
  392. mova Z(2), m2
  393. mova Z(3), m3
  394. T4_SSE m4, m5, m6
  395. mova m6, Z2(6)
  396. mova m7, Z2(7)
  397. T4_SSE m6, m7, m0
  ; First argument 0: m4-m7 already hold the inputs PASS_SMALL would load.
  398. PASS_SMALL 0, [cos_16], [cos_16+16]
  399. ret
  ; FFT48_3DNOW: emits fft4<SUFFIX> and fft8<SUFFIX> for the current MMX-width
  ; instruction set (instantiated below for 3dnowext and 3dnow, 32-bit only).
  ; Both routines work in place on the buffer at r0, use 8-byte vector slots
  ; and do not execute femms themselves.
  ; Note that T4_3DNOW writes the dword at [r0+12] as a temporary.
  400. %macro FFT48_3DNOW 0
  401. align 16
  402. fft4 %+ SUFFIX:
  403. T2_3DNOW m0, m1, Z(0), Z(1)
  404. mova m2, Z(2)
  405. mova m3, Z(3)
  406. T4_3DNOW m0, m1, m2, m3, m4, m5
  407. PUNPCK m0, m1, m4 ; interleave dwords of m0 with m1: low pair in m0, high pair in m4
  408. PUNPCK m2, m3, m5 ; same for m2 with m3: low pair in m2, high pair in m5
  409. mova Z(0), m0
  410. mova Z(1), m4
  411. mova Z(2), m2
  412. mova Z(3), m5
  413. ret
  414. align 16
  415. fft8 %+ SUFFIX:
  416. T2_3DNOW m0, m1, Z(0), Z(1)
  417. mova m2, Z(2)
  418. mova m3, Z(3)
  419. T4_3DNOW m0, m1, m2, m3, m4, m5
  420. mova Z(0), m0 ; park two results; m0 and m2 are needed as scratch below
  421. mova Z(2), m2
  422. T2_3DNOW m4, m5, Z(4), Z(5)
  423. T2_3DNOW m6, m7, Z2(6), Z2(7)
  424. PSWAPD m0, m5 ; m0 = m5 with its two dwords exchanged
  425. PSWAPD m2, m7
  426. pxor m0, [ps_m1p1] ; negate the low float
  427. pxor m2, [ps_m1p1]
  428. pfsub m5, m0
  429. pfadd m7, m2
  430. pfmul m5, [ps_root2] ; scale by sqrt(1/2)
  431. pfmul m7, [ps_root2]
  432. T4_3DNOW m1, m3, m5, m7, m0, m2
  433. mova Z(5), m5 ; spill: every MMX register is in use
  434. mova Z2(7), m7
  435. mova m0, Z(0) ; reload the two parked results
  436. mova m2, Z(2)
  437. T4_3DNOW m0, m2, m4, m6, m5, m7
  438. PUNPCK m0, m1, m5
  439. PUNPCK m2, m3, m7
  440. mova Z(0), m0
  441. mova Z(1), m5
  442. mova Z(2), m2
  443. mova Z(3), m7
  444. PUNPCK m4, Z(5), m5 ; second operand comes straight from the spill slot
  445. PUNPCK m6, Z2(7), m7
  446. mova Z(4), m4
  447. mova Z(5), m5
  448. mova Z2(6), m6
  449. mova Z2(7), m7
  450. ret
  451. %endmacro
  452. %if ARCH_X86_32
  453. INIT_MMX 3dnowext
  454. FFT48_3DNOW
  455. INIT_MMX 3dnow
  456. FFT48_3DNOW
  457. %endif
  ; Addressing helpers for the pass loops (registers named by DECL_PASS).
  ; Slot x lives at zcq + o1q*(x&6) + mmsize*(x&1). For x = 6 and 7 the
  ; multiplier would be 6, which is not an encodable index scale, so Z2/Z2H
  ; use o3q instead (DECL_PASS sets o3q = 6*o1q). The *H forms address the
  ; upper half of a slot.
  458. %define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
  459. %define Z2(x) [zcq + o3q + mmsize*(x&1)]
  460. %define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
  461. %define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
  ; DECL_PASS name, payload
  ; Emits the loop routine <name> that runs <payload> (a PASS_* invocation)
  ; once per iteration.
  ; In (register names assigned by DEFINE_ARGS): zc = data pointer,
  ; w = factor-table pointer, n = iteration budget.
  ; Sets o1 = n*8 and o3 = n*48 (byte strides used by Z/Z2), then per
  ; iteration advances zc by 2*mmsize and w by mmsize and subtracts mmsize/8
  ; from n until n <= 0.
  ; The routine is entered with jmp from the fftN stubs, so its ret returns to
  ; the stub's caller.
  462. %macro DECL_PASS 2+ ; name, payload
  463. align 16
  464. %1:
  465. DEFINE_ARGS zc, w, n, o1, o3
  466. lea o3q, [nq*3]
  467. lea o1q, [nq*8]
  468. shl o3q, 4
  469. .loop:
  470. %2
  471. add zcq, mmsize*2
  472. add wq, mmsize
  473. sub nd, mmsize/8
  474. jg .loop
  475. rep ret
  476. %endmacro
  ; FFT_DISPATCH tabsuffix, nbits_reg
  ; Indirect call to entry (nbits - 2) of dispatch_tab<tabsuffix>.
  ;   %1 = suffix of the dispatch table name
  ;   %2 = register name (without size suffix) holding nbits
  ; When PIC is defined the table holds section-relative offsets (see
  ; SECTION_REL), so the section base $$ is added before the call.
  ; Overwrites r2 (and r3 under PIC) here; the called routines clobber more,
  ; as stated in the original note on the macro line.
  477. %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
  478. lea r2, [dispatch_tab%1]
  479. mov r2, [r2 + (%2q-2)*gprsize]
  480. %ifdef PIC
  481. lea r3, [$$]
  482. add r2, r3
  483. %endif
  484. call r2
  485. %endmacro ; FFT_DISPATCH
  ; INTERL_AVX a, b, tmp, Zname, idx
  ; Interleaves the floats of b with those of a (b element first) and stores
  ; the 64-byte result as four 128-bit pieces into slots idx and idx+1 of the
  ; accessor Zname (Z or Z2) and its upper-half form (ZH or Z2H).
  ; b and tmp are overwritten; a is preserved.
  486. INIT_YMM avx
  487. %macro INTERL_AVX 5
  488. vunpckhps %3, %2, %1
  489. vunpcklps %2, %2, %1
  490. vextractf128 %4(%5), %2, 0
  491. vextractf128 %4 %+ H(%5), %3, 0
  492. vextractf128 %4(%5 + 1), %2, 1
  493. vextractf128 %4 %+ H(%5 + 1), %3, 1
  494. %endmacro
  ; Select the AVX interleaver for the PASS_BIG 0 expansion below, then emit
  ; the planar (PASS_BIG 1) and interleaving (PASS_BIG 0) AVX pass loops.
  495. %define INTERL INTERL_AVX
  496. DECL_PASS pass_avx, PASS_BIG 1
  497. DECL_PASS pass_interleave_avx, PASS_BIG 0
  ; fft_calc (AVX build of the entry point; still under INIT_YMM avx).
  ; In: r0 = pointer to the FFT context, r1 = data pointer.
  ; Loads nbits, moves the data pointer to r0 and nbits to r1, then calls the
  ; matching entry of the _interleave dispatch table.
  ; Unlike FFT_CALC_FUNC there is no interleave fix-up loop for small sizes.
  ; NOTE(review): presumably the caller only selects this version for sizes
  ; whose table entries interleave by themselves -- confirm in the C init code.
  498. cglobal fft_calc, 2,5,8
  499. mov r3d, [r0 + FFTContext.nbits]
  500. mov r0, r1
  501. mov r1, r3
  502. FFT_DISPATCH _interleave %+ SUFFIX, r1
  503. REP_RET
  ; INTERL_SSE a, b, tmp, Zname, idx
  ; Interleaves the elements of b with those of a (b element first): the low
  ; half of the result goes to slot idx, the high half to slot idx+1 of the
  ; accessor Zname. b and tmp are overwritten; a is preserved.
  504. INIT_XMM sse
  505. %macro INTERL_SSE 5
  506. mova %3, %2
  507. unpcklps %2, %1
  508. unpckhps %3, %1
  509. mova %4(%5), %2
  510. mova %4(%5+1), %3
  511. %endmacro
  ; Select the SSE interleaver and emit the planar and interleaving SSE pass
  ; loops. INTERL stays defined as INTERL_SSE for the 3DNow! pass declared
  ; further down.
  512. %define INTERL INTERL_SSE
  513. DECL_PASS pass_sse, PASS_BIG 1
  514. DECL_PASS pass_interleave_sse, PASS_BIG 0
  ; FFT_CALC_FUNC: emits the fft_calc entry point for the current instruction
  ; set (instantiated below for 3dnow, 3dnowext and sse).
  ; In: r0 = pointer to the FFT context, r1 = data pointer.
  ; Flow:
  ;   1. save the data pointer and nbits on the stack, dispatch the transform;
  ;   2. if nbits <= 3 + mmsize/16, run a fix-up loop over the whole buffer
  ;      that interleaves the output, since the small transforms called for
  ;      those sizes do not interleave it themselves;
  ;   3. execute femms before returning on the 3DNow! builds.
  515. %macro FFT_CALC_FUNC 0
  516. cglobal fft_calc, 2,5,8
  517. mov r3d, [r0 + FFTContext.nbits]
  518. PUSH r1
  519. PUSH r3
  520. mov r0, r1
  521. mov r1, r3
  522. FFT_DISPATCH _interleave %+ SUFFIX, r1
  523. POP rcx ; rcx = nbits (needed in cl for the shift below)
  524. POP r4 ; r4 = data pointer
  525. cmp rcx, 3+(mmsize/16)
  526. jg .end ; larger sizes need no fix-up
  527. mov r2, -1
  528. add rcx, 3
  529. shl r2, cl ; r2 = -(8 << nbits): negative buffer size in bytes
  530. sub r4, r2 ; r4 = end of buffer; r2 runs from -size up to 0
  531. .loop:
  532. %if mmsize == 8
  533. PSWAPD m0, [r4 + r2 + 4] ; exchange the 2nd and 3rd dword of each 16-byte group
  534. mova [r4 + r2 + 4], m0
  535. %else
  536. movaps xmm0, [r4 + r2]
  537. movaps xmm1, xmm0
  538. unpcklps xmm0, [r4 + r2 + 16] ; interleave two adjacent 16-byte vectors
  539. unpckhps xmm1, [r4 + r2 + 16]
  540. movaps [r4 + r2], xmm0
  541. movaps [r4 + r2 + 16], xmm1
  542. %endif
  543. add r2, mmsize*2
  544. jl .loop
  545. .end:
  546. %if cpuflag(3dnow)
  547. femms
  548. RET
  549. %else
  550. REP_RET
  551. %endif
  552. %endmacro
  553. %if ARCH_X86_32
  554. INIT_MMX 3dnow
  555. FFT_CALC_FUNC
  556. INIT_MMX 3dnowext
  557. FFT_CALC_FUNC
  558. %endif
  559. INIT_XMM sse
  560. FFT_CALC_FUNC
  ; fft_permute (SSE): reorders the 1 << nbits eight-byte elements of the data
  ; buffer according to the 16-bit index table revtab.
  ; In: r0 = pointer to the FFT context, r1 = data pointer.
  ; Step 1 scatters data[i] to tmpbuf[revtab[i]], two elements per iteration.
  ; Step 2 copies tmpbuf back over the data buffer, 32 bytes per iteration.
  ; NOTE(review): the cglobal line declares 1 XMM register, but the copy loop
  ; uses both xmm0 and xmm1 -- confirm whether the count should be 2.
  561. cglobal fft_permute, 2,7,1
  562. mov r4, [r0 + FFTContext.revtab]
  563. mov r5, [r0 + FFTContext.tmpbuf]
  564. mov ecx, [r0 + FFTContext.nbits] ; shift count must be in cl
  565. mov r2, 1
  566. shl r2, cl ; r2 = number of elements
  567. xor r0, r0 ; r0 = element index
  ; Reload the data pointer from its argument slot on 32-bit builds.
  ; NOTE(review): presumably because r1 aliases ecx there and was just
  ; overwritten by the nbits load -- the register mapping is defined outside
  ; this file.
  568. %if ARCH_X86_32
  569. mov r1, r1m
  570. %endif
  571. .loop:
  572. movaps xmm0, [r1 + 8*r0] ; two consecutive elements
  573. movzx r6, word [r4 + 2*r0] ; destination index of the first
  574. movzx r3, word [r4 + 2*r0 + 2] ; destination index of the second
  575. movlps [r5 + 8*r6], xmm0
  576. movhps [r5 + 8*r3], xmm0
  577. add r0, 2
  578. cmp r0, r2
  579. jl .loop
  580. shl r2, 3 ; element count -> byte count
  581. add r1, r2 ; both pointers now address the end of their buffer
  582. add r5, r2
  583. neg r2 ; r2 = negative byte offset, counts up to 0
  584. ; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
  585. .loopcopy:
  586. movaps xmm0, [r5 + r2]
  587. movaps xmm1, [r5 + r2 + 16]
  588. movaps [r1 + r2], xmm0
  589. movaps [r1 + r2 + 16], xmm1
  590. add r2, 32
  591. jl .loopcopy
  592. REP_RET
  ; 3DNow! pass loops (32-bit builds only). PASS_SMALL and PASS_BIG are written
  ; with SSE mnemonics, so those mnemonics are first mapped onto their 3DNow!
  ; and MMX counterparts. The mappings stay in force until the %undef lines
  ; after the imdct_calc instantiations.
  ; The 3dnowext names are plain aliases of the 3dnow routines.
  593. %if ARCH_X86_32
  594. INIT_MMX 3dnow
  595. %define mulps pfmul
  596. %define addps pfadd
  597. %define subps pfsub
  598. %define unpcklps punpckldq
  599. %define unpckhps punpckhdq
  600. DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
  601. DECL_PASS pass_interleave_3dnow, PASS_BIG 0
  602. %define pass_3dnowext pass_3dnow
  603. %define pass_interleave_3dnowext pass_interleave_3dnow
  604. %endif
  ; SECTION_REL is appended to every dispatch-table entry. With PIC defined it
  ; turns each entry into an offset from the section start ($$), which
  ; FFT_DISPATCH converts back into an address at run time; otherwise it
  ; expands to nothing and the table holds absolute addresses.
  605. %ifdef PIC
  606. %define SECTION_REL - $$
  607. %else
  608. %define SECTION_REL
  609. %endif
  ; DECL_FFT nbits [, suffix]
  ; Generates the composite transforms fft<n><suffix><SUFFIX> for
  ; n = 1 << nbits up to 65536 (17 - nbits sizes) and the table
  ; dispatch_tab<suffix><SUFFIX>, whose entries are in order fft4, fft8,
  ; fft16 (if nbits >= 5), fft32 (if nbits >= 6), then the generated sizes.
  ; The smaller sizes must already exist as labels.
  ; Each generated routine calls the n/2 transform and twice the n/4 transform
  ; (the non-suffixed variants), then tail-jumps into pass<suffix><SUFFIX>
  ; with r1 = cos_<n> and r2d = n/8.
  ; The term (x & (-2<<nbits)) in the r0 adjustments is x when x >= 2<<nbits
  ; and 0 otherwise, x being a power of two.
  ; NOTE(review): it presumably accounts for the r0 advance left behind by the
  ; pass loop of a callee that is itself a generated routine -- confirm.
  610. %macro DECL_FFT 1-2 ; nbits, suffix
  611. %ifidn %0, 1
  612. %xdefine fullsuffix SUFFIX
  613. %else
  614. %xdefine fullsuffix %2 %+ SUFFIX
  615. %endif
  616. %xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
  617. %if %1>=5
  618. %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
  619. %endif
  620. %if %1>=6
  621. %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
  622. %endif
  623. %assign n 1<<%1
  624. %rep 17-%1
  625. %assign n2 n/2
  626. %assign n4 n/4
  627. %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
  628. align 16
  629. fft %+ n %+ fullsuffix:
  630. call fft %+ n2 %+ SUFFIX
  631. add r0, n*4 - (n&(-2<<%1))
  632. call fft %+ n4 %+ SUFFIX
  633. add r0, n*2 - (n2&(-2<<%1))
  634. call fft %+ n4 %+ SUFFIX
  635. sub r0, n*6 + (n2&(-2<<%1))
  636. lea r1, [cos_ %+ n]
  637. mov r2d, n4/2
  638. jmp pass %+ fullsuffix
  639. %assign n n*2
  640. %endrep
  641. %undef n
  642. align 8
  643. dispatch_tab %+ fullsuffix: pointer list_of_fft
  644. %endmacro ; DECL_FFT
  ; Instantiations: each instruction set gets a planar and an _interleave
  ; family; the first argument is the smallest generated size, as log2.
  645. INIT_YMM avx
  646. DECL_FFT 6
  647. DECL_FFT 6, _interleave
  648. INIT_XMM sse
  649. DECL_FFT 5
  650. DECL_FFT 5, _interleave
  651. %if ARCH_X86_32
  652. INIT_MMX 3dnow
  653. DECL_FFT 4
  654. DECL_FFT 4, _interleave
  655. INIT_MMX 3dnowext
  656. DECL_FFT 4
  657. DECL_FFT 4, _interleave
  658. %endif
  659. %if CONFIG_MDCT
  ; IMDCT_CALC_FUNC: emits imdct_calc for the current instruction set.
  ; In: r0 = pointer to the FFT context, r1 = output pointer, r2 = input pointer.
  ; Step 1: call the function stored in the context's imdcthalf field with
  ;         (context, output + mdctsize, input).
  ; Step 2: walk outwards from the two ends of that half-size result. Each
  ;         loaded vector is element-reversed and the two are stored to the
  ;         mirrored positions; the one taken from the low side is also negated.
  660. %macro IMDCT_CALC_FUNC 0
  661. cglobal imdct_calc, 3,5,3
  662. mov r3d, [r0 + FFTContext.mdctsize]
  663. mov r4, [r0 + FFTContext.imdcthalf]
  664. add r1, r3 ; r1 = output + mdctsize, passed on as the callee's output
  665. PUSH r3 ; keep mdctsize and the adjusted pointer across the call
  666. PUSH r1
  667. %if ARCH_X86_32
  ; 32-bit: pass the three arguments on the stack, last argument first.
  668. push r2
  669. push r1
  670. push r0
  671. %else
  672. sub rsp, 8+32*WIN64 ; allocate win64 shadow space
  673. %endif
  674. call r4
  675. %if ARCH_X86_32
  676. add esp, 12 ; drop the three pushed arguments
  677. %else
  678. add rsp, 8+32*WIN64
  679. %endif
  680. POP r1
  681. POP r3
  682. lea r0, [r1 + 2*r3] ; r0 = r1 + 2*mdctsize
  683. mov r2, r3
  684. sub r3, mmsize ; r3 = descending offset, starts at mdctsize - mmsize
  685. neg r2 ; r2 = ascending offset, starts at -mdctsize
  686. mova m2, [ps_m1m1m1m1] ; sign-bit mask
  687. .loop:
  688. %if mmsize == 8
  689. PSWAPD m0, [r1 + r3] ; load with the two dwords exchanged
  690. PSWAPD m1, [r0 + r2]
  691. pxor m0, m2 ; negate
  692. %else
  693. mova m0, [r1 + r3]
  694. mova m1, [r0 + r2]
  695. shufps m0, m0, 0x1b ; reverse the four elements
  696. shufps m1, m1, 0x1b
  697. xorps m0, m2 ; negate
  698. %endif
  699. mova [r0 + r3], m1
  700. mova [r1 + r2], m0
  701. sub r3, mmsize
  702. add r2, mmsize
  703. jl .loop ; until the ascending offset reaches 0
  704. %if cpuflag(3dnow)
  705. femms
  706. RET
  707. %else
  708. REP_RET
  709. %endif
  710. %endmacro
  711. %if ARCH_X86_32
  712. INIT_MMX 3dnow
  713. IMDCT_CALC_FUNC
  714. INIT_MMX 3dnowext
  715. IMDCT_CALC_FUNC
  716. %endif
  717. INIT_XMM sse
  718. IMDCT_CALC_FUNC
  ; Remove the SSE-to-3DNow! mnemonic aliases set up for the 3DNow! pass
  ; declarations, so the code below uses the real SSE/AVX instructions again.
  719. INIT_XMM sse
  720. %undef mulps
  721. %undef addps
  722. %undef subps
  723. %undef unpcklps
  724. %undef unpckhps
  ; PREROTATER neg_idx, pos_idx, input, tcos, tsin
  ; Pre-rotation step of imdct_half: takes input samples from both ends of the
  ; current index range, multiplies them by the matching tcos/tsin entries and
  ; leaves the products in registers for the caller to scatter.
  ;   mmsize == 8 (3DNow!): results in m0 and m2; m1, m3-m6 are scratch; on
  ;                         plain 3dnow m7 must hold the sign mask loaded by
  ;                         the caller (pfpnacc is emulated)
  ;   otherwise           : results in xmm0 and xmm1; xmm2-xmm5 are scratch
  ; The non-MMX branch uses xmm registers explicitly, also in the AVX build.
  725. %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
  726. %if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
  727. PSWAPD m0, [%3+%2*4]
  728. movq m2, [%3+%1*4-8]
  729. movq m3, m0
  730. punpckldq m0, m2
  731. punpckhdq m2, m3
  732. movd m1, [%4+%1*2-4] ; tcos[j]
  733. movd m3, [%4+%2*2] ; tcos[n4-j-1]
  734. punpckldq m1, [%5+%1*2-4] ; tsin[j]
  735. punpckldq m3, [%5+%2*2] ; tsin[n4-j-1]
  736. mova m4, m0
  737. PSWAPD m5, m1
  738. pfmul m0, m1
  739. pfmul m4, m5
  740. mova m6, m2
  741. PSWAPD m5, m3
  742. pfmul m2, m3
  743. pfmul m6, m5
  744. %if cpuflag(3dnowext)
  745. pfpnacc m0, m4
  746. pfpnacc m2, m6
  747. %else
  ; Plain 3DNow! has no pfpnacc: regroup the products, flip the sign of the
  ; lanes selected by m7 and add.
  748. SBUTTERFLY dq, 0, 4, 1
  749. SBUTTERFLY dq, 2, 6, 3
  750. pxor m4, m7
  751. pxor m6, m7
  752. pfadd m0, m4
  753. pfadd m2, m6
  754. %endif
  755. %else
  756. movaps xmm0, [%3+%2*4]
  757. movaps xmm1, [%3+%1*4-0x10]
  758. movaps xmm2, xmm0
  759. shufps xmm0, xmm1, 0x88 ; even elements of both loads
  760. shufps xmm1, xmm2, 0x77 ; odd elements of both loads, each pair swapped
  761. movlps xmm4, [%4+%2*2]
  762. movlps xmm5, [%5+%2*2+0x0]
  763. movhps xmm4, [%4+%1*2-0x8]
  764. movhps xmm5, [%5+%1*2-0x8]
  765. movaps xmm2, xmm0
  766. movaps xmm3, xmm1
  767. mulps xmm0, xmm5
  768. mulps xmm1, xmm4
  769. mulps xmm2, xmm4
  770. mulps xmm3, xmm5
  771. subps xmm1, xmm0
  772. addps xmm2, xmm3
  773. movaps xmm0, xmm1
  774. unpcklps xmm1, xmm2 ; interleave the two result vectors
  775. unpckhps xmm0, xmm2
  776. %endif
  777. %endmacro
  ; CMUL j, a, b, unused, tab1, tab2
  ; Element-wise rotation of the vector pair (a, b) by the factors at
  ; [tab1+j] and [tab2+j]:
  ;   a = a*[tab2+j] - b*[tab1+j]
  ;   b = b*[tab2+j] + a_old*[tab1+j]
  ; The 4th argument is not used. Clobbers m6 and m7.
  778. %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
  779. mulps m6, %3, [%5+%1]
  780. mulps m7, %2, [%5+%1]
  781. mulps %2, %2, [%6+%1]
  782. mulps %3, %3, [%6+%1]
  783. subps %2, %2, m6
  784. addps %3, %3, m7
  785. %endmacro
  ; POSROTATESHUF_AVX j, k, z, tcos, tsin
  ; Post-rotation loop of imdct_half, AVX flavour. Each iteration handles 64
  ; bytes at z+2*j and 64 bytes at z+2*k: both blocks are rotated with CMUL,
  ; one vector of each is fully reversed, and the two blocks are interleaved
  ; with each other before being stored back.
  ; j is increased and k decreased by 0x20 per iteration; the loop runs while
  ; j stays negative. Clobbers ymm0-ymm7.
  786. %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
  787. .post:
  788. vmovaps ymm1, [%3+%1*2]
  789. vmovaps ymm0, [%3+%1*2+0x20]
  790. vmovaps ymm3, [%3+%2*2]
  791. vmovaps ymm2, [%3+%2*2+0x20]
  792. CMUL %1, ymm0, ymm1, %3, %4, %5
  793. CMUL %2, ymm2, ymm3, %3, %4, %5
  794. vshufps ymm1, ymm1, ymm1, 0x1b ; reverse within each 128-bit lane ...
  795. vshufps ymm3, ymm3, ymm3, 0x1b
  796. vperm2f128 ymm1, ymm1, ymm1, 0x01 ; ... then swap the lanes: full 8-element reversal
  797. vperm2f128 ymm3, ymm3, ymm3, 0x01
  798. vunpcklps ymm6, ymm2, ymm1
  799. vunpckhps ymm4, ymm2, ymm1
  800. vunpcklps ymm7, ymm0, ymm3
  801. vunpckhps ymm5, ymm0, ymm3
  802. vextractf128 [%3+%1*2], ymm7, 0
  803. vextractf128 [%3+%1*2+0x10], ymm5, 0
  804. vextractf128 [%3+%1*2+0x20], ymm7, 1
  805. vextractf128 [%3+%1*2+0x30], ymm5, 1
  806. vextractf128 [%3+%2*2], ymm6, 0
  807. vextractf128 [%3+%2*2+0x10], ymm4, 0
  808. vextractf128 [%3+%2*2+0x20], ymm6, 1
  809. vextractf128 [%3+%2*2+0x30], ymm4, 1
  810. sub %2, 0x20
  811. add %1, 0x20
  812. jl .post
  813. %endmacro
  ; POSROTATESHUF j, k, z, tcos, tsin
  ; Post-rotation loop of imdct_half, SSE flavour. Each iteration handles 32
  ; bytes at z+2*j and 32 bytes at z+2*k: both blocks are rotated with CMUL,
  ; one vector of each is reversed, and the two blocks are interleaved with
  ; each other before being stored back.
  ; j is increased and k decreased by 0x10 per iteration; the loop runs while
  ; j stays negative. Clobbers xmm0-xmm7.
  814. %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
  815. .post:
  816. movaps xmm1, [%3+%1*2]
  817. movaps xmm0, [%3+%1*2+0x10]
  818. CMUL %1, xmm0, xmm1, %3, %4, %5
  819. movaps xmm5, [%3+%2*2]
  820. movaps xmm4, [%3+%2*2+0x10]
  821. CMUL %2, xmm4, xmm5, %3, %4, %5
  822. shufps xmm1, xmm1, 0x1b ; reverse the four elements
  823. shufps xmm5, xmm5, 0x1b
  824. movaps xmm6, xmm4
  825. unpckhps xmm4, xmm1
  826. unpcklps xmm6, xmm1
  827. movaps xmm2, xmm0
  828. unpcklps xmm0, xmm5
  829. unpckhps xmm2, xmm5
  830. movaps [%3+%2*2], xmm6
  831. movaps [%3+%2*2+0x10], xmm4
  832. movaps [%3+%1*2], xmm0
  833. movaps [%3+%1*2+0x10], xmm2
  834. sub %2, 0x10
  835. add %1, 0x10
  836. jl .post
  837. %endmacro
  ; CMUL_3DNOW z, j, out0, out1, tab1, tab2
  ; Loads x = [z+2*j] and y = [z+2*j+8] and rotates them by the factors at
  ; [tab1+j] and [tab2+j]:
  ;   out0 = y*[tab2+j] - x*[tab1+j]
  ;   out1 = x*[tab2+j] + y*[tab1+j]
  ; Clobbers m6 and m7.
  838. %macro CMUL_3DNOW 6
  839. mova m6, [%1+%2*2]
  840. mova %3, [%1+%2*2+8]
  841. mova %4, m6
  842. mova m7, %3
  843. pfmul m6, [%5+%2]
  844. pfmul %3, [%6+%2]
  845. pfmul %4, [%6+%2]
  846. pfmul m7, [%5+%2]
  847. pfsub %3, m6
  848. pfadd %4, m7
  849. %endmacro
  ; POSROTATESHUF_3DNOW j, k, z, tcos, tsin
  ; Post-rotation loop of imdct_half, 3DNow! flavour. Each iteration rotates
  ; 16 bytes at z+2*j and 16 bytes at z+2*k with CMUL_3DNOW and writes the
  ; eight resulting floats back one dword at a time, crossing over between
  ; the two positions.
  ; j is increased and k decreased by 8 per iteration; the loop runs while
  ; j stays negative. Clobbers m0-m3, m6 and m7.
  850. %macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
  851. .post:
  852. CMUL_3DNOW %3, %1, m0, m1, %4, %5
  853. CMUL_3DNOW %3, %2, m2, m3, %4, %5
  ; Low dwords first ...
  854. movd [%3+%1*2+ 0], m0
  855. movd [%3+%2*2+12], m1
  856. movd [%3+%2*2+ 0], m2
  857. movd [%3+%1*2+12], m3
  ; ... then shift the high dwords down and store them.
  858. psrlq m0, 32
  859. psrlq m1, 32
  860. psrlq m2, 32
  861. psrlq m3, 32
  862. movd [%3+%1*2+ 8], m0
  863. movd [%3+%2*2+ 4], m1
  864. movd [%3+%2*2+ 8], m2
  865. movd [%3+%1*2+ 4], m3
  866. sub %2, 8
  867. add %1, 8
  868. jl .post
  869. %endmacro
  ; DECL_IMDCT postrotate_macro
  ; Emits imdct_half for the current instruction set; %1 names the
  ; POSROTATESHUF* macro used for the final stage.
  ; Stages:
  ;   1. .pre loop: PREROTATER rotates input samples taken from both ends of
  ;      the input, and the results are scattered into the output buffer at
  ;      the positions given by revtab;
  ;   2. FFT_DISPATCH runs the in-place transform on the output buffer;
  ;   3. %1 performs the post-rotation and reordering in place.
  ; On 32-bit builds there are not enough registers: the adjusted tcos, tsin
  ; and revtab pointers are pushed on the stack and reloaded from
  ; [esp+8], [esp+4] and [esp] (rtsin and rrevtab even share r6).
  870. %macro DECL_IMDCT 1
  871. cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
  872. %if ARCH_X86_64
  873. %define rrevtab r7
  874. %define rtcos r8
  875. %define rtsin r9
  876. %else
  877. %define rrevtab r6
  878. %define rtsin r6
  879. %define rtcos r5
  880. %endif
  881. mov r3d, [r0+FFTContext.mdctsize]
  882. add r2, r3 ; input += mdctsize bytes
  883. shr r3, 1
  884. mov rtcos, [r0+FFTContext.tcos]
  885. mov rtsin, [r0+FFTContext.tsin]
  886. add rtcos, r3 ; both tables are addressed relative to mdctsize/2 bytes in
  887. add rtsin, r3
  888. %if ARCH_X86_64 == 0
  889. push rtcos
  890. push rtsin
  891. %endif
  892. shr r3, 1
  893. mov rrevtab, [r0+FFTContext.revtab]
  894. add rrevtab, r3 ; revtab is addressed relative to mdctsize/4 bytes in
  895. %if ARCH_X86_64 == 0
  896. push rrevtab
  897. %endif
  ; r3 = descending index, stepped by 2 (MMX) or 4 (otherwise) per iteration.
  898. %if mmsize == 8
  899. sub r3, 2
  900. %else
  901. sub r3, 4
  902. %endif
  ; r4 = -r3 = ascending index. On 32-bit non-MMX builds r4 is clobbered in
  ; the loop body, so it is recomputed at the top of every iteration instead.
  903. %if ARCH_X86_64 || mmsize == 8
  904. xor r4, r4
  905. sub r4, r3
  906. %endif
  ; Plain 3DNow!: m7 = sign mask in the low dword only, for PREROTATER.
  907. %if notcpuflag(3dnowext) && mmsize == 8
  908. movd m7, [ps_m1m1m1m1]
  909. %endif
  910. .pre:
  911. %if ARCH_X86_64 == 0
  912. ;unspill
  913. %if mmsize != 8
  914. xor r4, r4
  915. sub r4, r3
  916. %endif
  917. mov rtcos, [esp+8]
  918. mov rtsin, [esp+4]
  919. %endif
  920. PREROTATER r4, r3, r2, rtcos, rtsin
  921. %if mmsize == 8
  922. mov r6, [esp] ; rrevtab = ptr+n8
  923. movzx r5, word [rrevtab+r4-2] ; rrevtab[j]
  924. movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1]
  925. mova [r1+r5*8], m0
  926. mova [r1+r6*8], m2
  927. add r4, 2
  928. sub r3, 2
  929. %else
  930. %if ARCH_X86_64
  ; Four table lookups, four 8-byte scatter stores.
  931. movzx r5, word [rrevtab+r4-4]
  932. movzx r6, word [rrevtab+r4-2]
  933. movzx r10, word [rrevtab+r3]
  934. movzx r11, word [rrevtab+r3+2]
  935. movlps [r1+r5 *8], xmm0
  936. movhps [r1+r6 *8], xmm0
  937. movlps [r1+r10*8], xmm1
  938. movhps [r1+r11*8], xmm1
  939. add r4, 4
  940. %else
  ; Same scatter with only r4-r6 available; r4 is overwritten here.
  941. mov r6, [esp]
  942. movzx r5, word [r6+r4-4]
  943. movzx r4, word [r6+r4-2]
  944. movlps [r1+r5*8], xmm0
  945. movhps [r1+r4*8], xmm0
  946. movzx r5, word [r6+r3]
  947. movzx r4, word [r6+r3+2]
  948. movlps [r1+r5*8], xmm1
  949. movhps [r1+r4*8], xmm1
  950. %endif
  951. sub r3, 4
  952. %endif
  953. jns .pre ; loop until the descending index goes negative
  ; Stage 2: in-place transform of the output buffer (r0 = buffer, r1 = nbits).
  ; The context and output pointers are kept in r5 / r6 across the call.
  954. mov r5, r0
  955. mov r6, r1
  956. mov r0, r1
  957. mov r1d, [r5+FFTContext.nbits]
  958. FFT_DISPATCH SUFFIX, r1
  ; Stage 3 setup: r6 = output + mdctsize; r0 = -(mdctsize/2) is the ascending
  ; index j; r1 = -mmsize - r0 is the descending index k.
  959. mov r0d, [r5+FFTContext.mdctsize]
  960. add r6, r0
  961. shr r0, 1
  962. %if ARCH_X86_64 == 0
  963. %define rtcos r2
  964. %define rtsin r3
  965. mov rtcos, [esp+8]
  966. mov rtsin, [esp+4]
  967. %endif
  968. neg r0
  969. mov r1, -mmsize
  970. sub r1, r0
  971. %1 r0, r1, r6, rtcos, rtsin
  972. %if ARCH_X86_64 == 0
  973. add esp, 12 ; drop the three spilled pointers
  974. %endif
  975. %if mmsize == 8
  976. femms
  977. %endif
  978. RET
  979. %endmacro
  ; Instantiations. The first one runs under the INIT_XMM sse still in force
  ; from above.
  980. DECL_IMDCT POSROTATESHUF
  981. %if ARCH_X86_32
  982. INIT_MMX 3dnow
  983. DECL_IMDCT POSROTATESHUF_3DNOW
  984. INIT_MMX 3dnowext
  985. DECL_IMDCT POSROTATESHUF_3DNOW
  986. %endif
  987. INIT_YMM avx
  988. DECL_IMDCT POSROTATESHUF_AVX
  989. %endif ; CONFIG_MDCT