You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

973 lines
23KB

  1. ;******************************************************************************
  2. ;* FFT transform with SSE/3DNow optimizations
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2011 Vitor Sessak
  5. ;*
  6. ;* This algorithm (though not any of the implementation details) is
  7. ;* based on libdjbfft by D. J. Bernstein.
  8. ;*
  9. ;* This file is part of Libav.
  10. ;*
  11. ;* Libav is free software; you can redistribute it and/or
  12. ;* modify it under the terms of the GNU Lesser General Public
  13. ;* License as published by the Free Software Foundation; either
  14. ;* version 2.1 of the License, or (at your option) any later version.
  15. ;*
  16. ;* Libav is distributed in the hope that it will be useful,
  17. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  19. ;* Lesser General Public License for more details.
  20. ;*
  21. ;* You should have received a copy of the GNU Lesser General Public
  22. ;* License along with Libav; if not, write to the Free Software
  23. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24. ;******************************************************************************
  25. ; These functions are not individually interchangeable with the C versions.
  26. ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
  27. ; in blocks as convenient to the vector size.
  28. ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
  29. %include "x86inc.asm"
  30. %if ARCH_X86_64
  31. %define pointer resq
  32. %else
  33. %define pointer resd
  34. %endif
  35. struc FFTContext
  36. .nbits: resd 1
  37. .reverse: resd 1
  38. .revtab: pointer 1
  39. .tmpbuf: pointer 1
  40. .mdctsize: resd 1
  41. .mdctbits: resd 1
  42. .tcos: pointer 1
  43. .tsin: pointer 1
  44. .fftperm: pointer 1
  45. .fftcalc: pointer 1
  46. .imdctcalc:pointer 1
  47. .imdcthalf:pointer 1
  48. endstruc
  49. SECTION_RODATA
      ; M_SQRT1_2 = sqrt(1/2), M_COS_PI_1_8 = cos(pi/8), M_COS_PI_3_8 = cos(3*pi/8).
  50. %define M_SQRT1_2 0.70710678118654752440
  51. %define M_COS_PI_1_8 0.923879532511287
  52. %define M_COS_PI_3_8 0.38268343236509
  53. align 32
  54. ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 ; multiplier used by fft16_avx
  55. ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 ; multiplier used by fft16_avx
  56. ps_root2: times 8 dd M_SQRT1_2 ; eight copies of sqrt(1/2)
  57. ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 ; sqrt(1/2) with signs -,+,+,- per group of four
  58. ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 ; sign-bit mask: xor negates element 2 of each group of four
  59. perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 ; vpermilps control vector (T8_AVX)
  60. perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 ; vpermilps control vector (T8_AVX)
  61. ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 ; float multiplier (not a sign mask), used by T8_AVX
  62. ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 ; sign-bit mask: xor negates every element except 2 and 4
  63. ps_m1m1m1m1: times 4 dd 1<<31 ; sign-bit mask: xor negates all four floats
  64. ps_m1p1: dd 1<<31, 0 ; sign-bit mask: xor negates the first of two floats
  65. %assign i 16
      ; Declare the external tables cos_16, cos_32, ..., cos_65536
      ; (13 declarations, the size doubling each time).
  66. %rep 13
  67. cextern cos_ %+ i
  68. %assign i i<<1
  69. %endrep
  70. %if ARCH_X86_64
      ; From here on 'pointer' emits an initialised native-width value
      ; (used for the dispatch_tab entries) instead of reserving struc space.
  71. %define pointer dq
  72. %else
  73. %define pointer dd
  74. %endif
  75. %macro IF0 1+
      ; IF0 discards its argument; IF1 (below) emits it unchanged. Writing
      ; "IF%1 insn" inside another macro therefore emits insn only when that
      ; macro's first argument is 1.
  76. %endmacro
  77. %macro IF1 1+
  78. %1
  79. %endmacro
  80. SECTION_TEXT
  81. %macro T2_3DN 4 ; z0, z1, mem0, mem1
      ; Two-point butterfly: z0 = mem0 + mem1, z1 = mem0 - mem1.
  82. mova %1, %3
  83. mova %2, %1 ; copy of mem0 before z0 is overwritten with the sum
  84. pfadd %1, %4
  85. pfsub %2, %4
  86. %endmacro
  87. %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
      ; Combines z0..z3 (two floats per register) in place; tmp0 and tmp1 are clobbered.
      ; Results are annotated on the lines that produce them.
  88. mova %5, %3
  89. pfsub %3, %4
  90. pfadd %5, %4 ; {t6,t5}
  91. pxor %3, [ps_m1p1] ; {t8,t7}
  92. mova %6, %1
  93. pswapd %3, %3
  94. pfadd %1, %5 ; {r0,i0}
  95. pfsub %6, %5 ; {r2,i2}
  96. mova %4, %2
  97. pfadd %2, %3 ; {r1,i1}
  98. pfsub %4, %3 ; {r3,i3}
       ; NOTE(review): SWAP comes from x86inc; presumably it exchanges the register
       ; names so that z2 refers to the {r2,i2} result held in tmp1 - confirm.
  99. SWAP %3, %6
  100. %endmacro
  101. ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
  102. ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
  103. ; %3, %4, %5 tmp
  104. ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
  105. ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
  106. %macro T8_AVX 5
       ; Register-only 8-point transform on one pair of ymm registers; the only
       ; memory operands are the constant tables. Used by fft8_avx and fft16_avx.
  107. vsubps %5, %1, %2 ; v = %1 - %2
  108. vaddps %3, %1, %2 ; w = %1 + %2
  109. vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
  110. vpermilps %2, %2, [perm1]
  111. vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
  112. vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
  113. vsubps %4, %5, %1 ; s = r - q
  114. vaddps %1, %5, %1 ; u = r + q
  115. vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
  116. vshufps %5, %4, %1, 0xbb
  117. vshufps %3, %4, %1, 0xee
  118. vperm2f128 %3, %3, %5, 0x13
  119. vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {-1,-1,1,-1,1,-1,-1,-1} (sign pattern of the mask table)
  120. vshufps %2, %1, %4, 0xdd
  121. vshufps %1, %1, %4, 0x88
  122. vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
  123. vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
  124. vsubps %5, %1, %3
  125. vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
  126. vsubps %2, %4, %1 ; %2 = v - w
  127. vaddps %1, %4, %1 ; %1 = v + w
  128. %endmacro
  129. ; In SSE mode do one fft4 transform
  130. ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
  131. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
  132. ;
  133. ; In AVX mode do two fft4 transforms
  134. ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
  135. ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
  136. %macro T4_SSE 3
       ; %3 is a scratch register and is clobbered.
  137. subps %3, %1, %2 ; {t3,t4,-t8,t7}
  138. addps %1, %1, %2 ; {t1,t2,t6,t5}
  139. xorps %3, %3, [ps_p1p1m1p1] ; flip the sign of element 2: -t8 becomes t8
  140. shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
  141. shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
  142. subps %3, %1, %2 ; {r2,i2,r3,i3}
  143. addps %1, %1, %2 ; {r0,i0,r1,i1}
  144. shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
  145. shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
  146. %endmacro
  147. ; In SSE mode do one FFT8
  148. ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
  149. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
  150. ;
  151. ; In AVX mode do two FFT8
  152. ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
  153. ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
  154. ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
  155. ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
  156. %macro T8_SSE 6
       ; %5 and %6 are scratch registers and are clobbered.
  157. addps %6, %3, %4 ; {t1,t2,t3,t4}
  158. subps %3, %3, %4 ; {r5,i5,r7,i7}
  159. shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
  160. mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
  161. mulps %4, %4, [ps_root2]
  162. addps %3, %3, %4 ; {t8,t7,ta,t9}
  163. shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
  164. shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
  165. subps %3, %6, %4 ; {t6,t5,tc,tb}
  166. addps %6, %6, %4 ; {t1,t2,t9,ta}
  167. shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
  168. shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
  169. subps %3, %1, %6 ; {r4,r5,r6,r7}
  170. addps %1, %1, %6 ; {r0,r1,r2,r3}
  171. subps %4, %2, %5 ; {i4,i5,i6,i7}
  172. addps %2, %2, %5 ; {i0,i1,i2,i3}
  173. %endmacro
  174. ; scheduled for cpu-bound sizes
  175. %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
       ; One pass step over the eight blocks Z(0)..Z(5), Z2(6), Z2(7), using m0-m7.
       ; %1 = 1: load m4-m7 from Z(4), Z(5), Z2(6), Z2(7) here.
       ; %1 = 0: the caller has already placed those four values in m4-m7.
       ; %2/%3 are the operands the wre/wim factors are loaded from.
       ; All eight blocks are written back.
  176. IF%1 mova m4, Z(4)
  177. IF%1 mova m5, Z(5)
  178. mova m0, %2 ; wre
  179. mova m1, %3 ; wim
  180. mulps m2, m4, m0 ; r2*wre
  181. IF%1 mova m6, Z2(6)
  182. mulps m3, m5, m1 ; i2*wim
  183. IF%1 mova m7, Z2(7)
  184. mulps m4, m4, m1 ; r2*wim
  185. mulps m5, m5, m0 ; i2*wre
  186. addps m2, m2, m3 ; r2*wre + i2*wim
  187. mulps m3, m1, m7 ; i3*wim
  188. subps m5, m5, m4 ; i2*wre - r2*wim
  189. mulps m1, m1, m6 ; r3*wim
  190. mulps m4, m0, m6 ; r3*wre
  191. mulps m0, m0, m7 ; i3*wre
  192. subps m4, m4, m3 ; r3*wre - i3*wim
  193. mova m3, Z(0)
  194. addps m0, m0, m1 ; i3*wre + r3*wim
  195. subps m1, m4, m2 ; t3
  196. addps m4, m4, m2 ; t5
  197. subps m3, m3, m4 ; r2
  198. addps m4, m4, Z(0) ; r0
  199. mova m6, Z(2)
  200. mova Z(4), m3
  201. mova Z(0), m4
  202. subps m3, m5, m0 ; t4
  203. subps m4, m6, m3 ; r3
  204. addps m3, m3, m6 ; r1
  205. mova Z2(6), m4
  206. mova Z(2), m3
  207. mova m2, Z(3)
  208. addps m3, m5, m0 ; t6
  209. subps m2, m2, m1 ; i3
  210. mova m7, Z(1)
  211. addps m1, m1, Z(3) ; i1
  212. mova Z2(7), m2
  213. mova Z(3), m1
  214. subps m4, m7, m3 ; i2
  215. addps m3, m3, m7 ; i0
  216. mova Z(5), m4
  217. mova Z(1), m3
  218. %endmacro
  219. ; scheduled to avoid store->load aliasing
  220. %macro PASS_BIG 1 ; (!interleave)
       ; Same arithmetic as PASS_SMALL, with the factors always read from [wq] (wre)
       ; and [wq+o1q] (wim). Uses m0-m7.
       ; %1 = 1: Z(1), Z(2), Z(3), Z(5), Z2(6), Z2(7) are stored directly.
       ; %1 = 0: those direct stores are omitted and all eight results are written
       ;         through the INTERL macro in the trailing %if block instead.
       ; Z(0) and Z(4) are stored unconditionally (and reloaded for INTERL when %1 = 0).
  221. mova m4, Z(4) ; r2
  222. mova m5, Z(5) ; i2
  223. mova m0, [wq] ; wre
  224. mova m1, [wq+o1q] ; wim
  225. mulps m2, m4, m0 ; r2*wre
  226. mova m6, Z2(6) ; r3
  227. mulps m3, m5, m1 ; i2*wim
  228. mova m7, Z2(7) ; i3
  229. mulps m4, m4, m1 ; r2*wim
  230. mulps m5, m5, m0 ; i2*wre
  231. addps m2, m2, m3 ; r2*wre + i2*wim
  232. mulps m3, m1, m7 ; i3*wim
  233. mulps m1, m1, m6 ; r3*wim
  234. subps m5, m5, m4 ; i2*wre - r2*wim
  235. mulps m4, m0, m6 ; r3*wre
  236. mulps m0, m0, m7 ; i3*wre
  237. subps m4, m4, m3 ; r3*wre - i3*wim
  238. mova m3, Z(0)
  239. addps m0, m0, m1 ; i3*wre + r3*wim
  240. subps m1, m4, m2 ; t3
  241. addps m4, m4, m2 ; t5
  242. subps m3, m3, m4 ; r2
  243. addps m4, m4, Z(0) ; r0
  244. mova m6, Z(2)
  245. mova Z(4), m3
  246. mova Z(0), m4
  247. subps m3, m5, m0 ; t4
  248. subps m4, m6, m3 ; r3
  249. addps m3, m3, m6 ; r1
  250. IF%1 mova Z2(6), m4
  251. IF%1 mova Z(2), m3
  252. mova m2, Z(3)
  253. addps m5, m5, m0 ; t6
  254. subps m2, m2, m1 ; i3
  255. mova m7, Z(1)
  256. addps m1, m1, Z(3) ; i1
  257. IF%1 mova Z2(7), m2
  258. IF%1 mova Z(3), m1
  259. subps m6, m7, m5 ; i2
  260. addps m5, m5, m7 ; i0
  261. IF%1 mova Z(5), m6
  262. IF%1 mova Z(1), m5
  263. %if %1==0
  264. INTERL m1, m3, m7, Z, 2
  265. INTERL m2, m4, m0, Z2, 6
  266. mova m1, Z(0)
  267. mova m2, Z(4)
  268. INTERL m5, m1, m3, Z, 0
  269. INTERL m6, m2, m7, Z, 4
  270. %endif
  271. %endmacro
  272. %macro PUNPCK 3
       ; Interleave the dwords of %1 and %2: the low-half result replaces %1,
       ; the high-half result goes to %3. %2 may be a memory operand.
  273. mova %3, %1
  274. punpckldq %1, %2
  275. punpckhdq %3, %2
  276. %endmacro
       ; Block addressing for the small fixed-size transforms below: block x is the
       ; x-th vector-sized slot after r0. Z and Z2 are identical here; ZH(x) is the
       ; upper half of slot x. These are redefined with strides before DECL_PASS.
  277. %define Z(x) [r0+mmsize*x]
  278. %define Z2(x) [r0+mmsize*x]
  279. %define ZH(x) [r0+mmsize*x+mmsize/2]
  280. INIT_YMM avx
  281. %if HAVE_AVX
  282. align 16
       ; fft8_avx: in-place transform of the two ymm blocks at r0 via T8_AVX.
       ; In: r0 = data pointer. Uses m0-m4.
  283. fft8_avx:
  284. mova m0, Z(0)
  285. mova m1, Z(1)
  286. T8_AVX m0, m1, m2, m3, m4
  287. mova Z(0), m0
  288. mova Z(1), m1
  289. ret
  290. align 16
       ; fft16_avx: in-place transform of the four ymm blocks Z(0)..Z(3) at r0.
       ; In: r0 = data pointer. Uses m0-m7.
  291. fft16_avx:
  292. mova m2, Z(2)
  293. mova m3, Z(3)
  294. T4_SSE m2, m3, m7 ; two fft4 on blocks 2/3 (AVX mode of T4_SSE)
  295. mova m0, Z(0)
  296. mova m1, Z(1)
  297. T8_AVX m0, m1, m4, m5, m7 ; 8-point transform on blocks 0/1
       ; Multiply the block 2/3 results by the ps_cos16_1 / ps_cos16_2 factors.
  298. mova m4, [ps_cos16_1]
  299. mova m5, [ps_cos16_2]
  300. vmulps m6, m2, m4
  301. vmulps m7, m3, m5
  302. vaddps m7, m7, m6
  303. vmulps m2, m2, m5
  304. vmulps m3, m3, m4
  305. vsubps m3, m3, m2
  306. vblendps m2, m7, m3, 0xf0
  307. vperm2f128 m3, m7, m3, 0x21
  308. vaddps m4, m2, m3
  309. vsubps m2, m3, m2
  310. vperm2f128 m2, m2, m2, 0x01 ; swap the two 128-bit halves
       ; Final add/sub against the T8_AVX outputs.
  311. vsubps m3, m1, m2
  312. vaddps m1, m1, m2
  313. vsubps m5, m0, m4
  314. vaddps m0, m0, m4
       ; Store 128 bits at a time: each ymm slot receives one half from each of
       ; two result registers.
  315. vextractf128 Z(0), m0, 0
  316. vextractf128 ZH(0), m1, 0
  317. vextractf128 Z(1), m0, 1
  318. vextractf128 ZH(1), m1, 1
  319. vextractf128 Z(2), m5, 0
  320. vextractf128 ZH(2), m3, 0
  321. vextractf128 Z(3), m5, 1
  322. vextractf128 ZH(3), m3, 1
  323. ret
  324. align 16
       ; fft32_avx: in-place transform of the eight ymm blocks Z(0)..Z(7) at r0.
       ; Runs fft16_avx on blocks 0-3, transforms blocks 4-7 with T4_SSE + T8_SSE,
       ; then combines everything with PASS_SMALL using the cos_32 table.
       ; In: r0 = data pointer. Uses m0-m7.
  325. fft32_avx:
  326. call fft16_avx
  327. mova m0, Z(4)
  328. mova m1, Z(5)
  329. T4_SSE m0, m1, m4
  330. mova m2, Z(6)
  331. mova m3, Z(7)
  332. T8_SSE m0, m1, m2, m3, m4, m6
  333. ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
  334. ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
       ; Regroup 128-bit halves into m4-m7, where PASS_SMALL 0 expects its inputs.
  335. vperm2f128 m4, m0, m2, 0x20
  336. vperm2f128 m5, m1, m3, 0x20
  337. vperm2f128 m6, m0, m2, 0x31
  338. vperm2f128 m7, m1, m3, 0x31
  339. PASS_SMALL 0, [cos_32], [cos_32+32]
  340. ret
  341. fft32_interleave_avx:
       ; Runs fft32_avx, then interleaves each pair of ymm blocks with
       ; vunpcklps/vunpckhps and stores the result back 128 bits at a time.
       ; In: r0 = data pointer. r0 is advanced past the data and not restored;
       ; r2d is used as the loop counter.
  342. call fft32_avx
  343. mov r2d, 32
  344. .deint_loop:
  345. mova m2, Z(0)
  346. mova m3, Z(1)
  347. vunpcklps m0, m2, m3
  348. vunpckhps m1, m2, m3
  349. vextractf128 Z(0), m0, 0
  350. vextractf128 ZH(0), m1, 0
  351. vextractf128 Z(1), m0, 1
  352. vextractf128 ZH(1), m1, 1
  353. add r0, mmsize*2
  354. sub r2d, mmsize/4 ; counter drops by 8 per pair of blocks: 4 iterations
  355. jg .deint_loop
  356. ret
  357. %endif
  358. INIT_XMM sse
  359. %define movdqa movaps
  360. align 16
       ; fft4_sse: in-place T4_SSE on the two xmm blocks at r0. The fft4_avx label
       ; aliases the same code, so the AVX dispatch table shares this routine.
       ; In: r0 = data pointer. Uses m0-m2.
  361. fft4_avx:
  362. fft4_sse:
  363. mova m0, Z(0)
  364. mova m1, Z(1)
  365. T4_SSE m0, m1, m2
  366. mova Z(0), m0
  367. mova Z(1), m1
  368. ret
  369. align 16
       ; fft8_sse: in-place transform of the four xmm blocks at r0
       ; (T4_SSE on blocks 0/1, then T8_SSE folding in blocks 2/3).
       ; In: r0 = data pointer. Uses m0-m5.
  370. fft8_sse:
  371. mova m0, Z(0)
  372. mova m1, Z(1)
  373. T4_SSE m0, m1, m2
  374. mova m2, Z(2)
  375. mova m3, Z(3)
  376. T8_SSE m0, m1, m2, m3, m4, m5
  377. mova Z(0), m0
  378. mova Z(1), m1
  379. mova Z(2), m2
  380. mova Z(3), m3
  381. ret
  382. align 16
       ; fft16_sse: in-place transform of the eight xmm blocks at r0.
       ; Blocks 0-3 get the same T4_SSE + T8_SSE sequence as fft8_sse; blocks 4/5
       ; and 6/7 each get a T4_SSE and stay in m4-m7 for PASS_SMALL 0, which
       ; combines everything using the cos_16 table.
       ; In: r0 = data pointer. Uses m0-m7.
  383. fft16_sse:
  384. mova m0, Z(0)
  385. mova m1, Z(1)
  386. T4_SSE m0, m1, m2
  387. mova m2, Z(2)
  388. mova m3, Z(3)
  389. T8_SSE m0, m1, m2, m3, m4, m5
  390. mova m4, Z(4)
  391. mova m5, Z(5)
  392. mova Z(0), m0
  393. mova Z(1), m1
  394. mova Z(2), m2
  395. mova Z(3), m3
  396. T4_SSE m4, m5, m6
  397. mova m6, Z2(6)
  398. mova m7, Z2(7)
  399. T4_SSE m6, m7, m0
  400. PASS_SMALL 0, [cos_16], [cos_16+16]
  401. ret
  402. %macro FFT48_3DN 0
       ; Emits fft4<SUFFIX> and fft8<SUFFIX> for the current MMX-width instruction
       ; set (instantiated once for 3dnow2 and once for 3dnow).
       ; Both routines take the data pointer in r0 and work in place.
  403. align 16
  404. fft4 %+ SUFFIX:
  405. T2_3DN m0, m1, Z(0), Z(1)
  406. mova m2, Z(2)
  407. mova m3, Z(3)
  408. T4_3DN m0, m1, m2, m3, m4, m5
  409. PUNPCK m0, m1, m4
  410. PUNPCK m2, m3, m5
  411. mova Z(0), m0
  412. mova Z(1), m4
  413. mova Z(2), m2
  414. mova Z(3), m5
  415. ret
  416. align 16
  417. fft8 %+ SUFFIX:
  418. T2_3DN m0, m1, Z(0), Z(1)
  419. mova m2, Z(2)
  420. mova m3, Z(3)
  421. T4_3DN m0, m1, m2, m3, m4, m5
       ; Park two of the T4_3DN results in memory so m0/m2 can be reused as scratch.
  422. mova Z(0), m0
  423. mova Z(2), m2
  424. T2_3DN m4, m5, Z(4), Z(5)
  425. T2_3DN m6, m7, Z2(6), Z2(7)
  426. pswapd m0, m5
  427. pswapd m2, m7
  428. pxor m0, [ps_m1p1]
  429. pxor m2, [ps_m1p1]
  430. pfsub m5, m0
  431. pfadd m7, m2
  432. pfmul m5, [ps_root2] ; scale by sqrt(1/2)
  433. pfmul m7, [ps_root2]
  434. T4_3DN m1, m3, m5, m7, m0, m2
  435. mova Z(5), m5
  436. mova Z2(7), m7
  437. mova m0, Z(0) ; reload the values parked above
  438. mova m2, Z(2)
  439. T4_3DN m0, m2, m4, m6, m5, m7
  440. PUNPCK m0, m1, m5
  441. PUNPCK m2, m3, m7
  442. mova Z(0), m0
  443. mova Z(1), m5
  444. mova Z(2), m2
  445. mova Z(3), m7
  446. PUNPCK m4, Z(5), m5
  447. PUNPCK m6, Z2(7), m7
  448. mova Z(4), m4
  449. mova Z(5), m5
  450. mova Z2(6), m6
  451. mova Z2(7), m7
  452. ret
  453. %endmacro
  454. INIT_MMX 3dnow2
  455. FFT48_3DN
       ; The 3dnow2 instance above is assembled before this macro exists, so its
       ; pswapd is the real instruction. The plain 3dnow instance below picks up
       ; this emulation, which swaps the two dwords of an MMX register.
  456. %macro pswapd 2
  457. %ifidn %1, %2
       ; In-place form: spills the low dword to [r0+12] and re-reads it as the
       ; high dword of the qword at [r0+8]. This overwrites the dword at [r0+12].
  458. movd [r0+12], %1
  459. punpckhdq %1, [r0+8]
  460. %else
       ; Two-register form: %1 = {high dword of %2, low dword of %2}.
  461. movq %1, %2
  462. psrlq %1, 32
  463. punpckldq %1, %2
  464. %endif
  465. %endmacro
  466. INIT_MMX 3dnow
  467. FFT48_3DN
       ; Strided block addressing for the pass functions: o1q and o3q are the byte
       ; strides set up by DECL_PASS. Blocks 0-5 are addressed through Z (even part
       ; of x selects a multiple of o1q, bit 0 selects the second vector); blocks
       ; 6 and 7 are addressed through Z2 at offset o3q. ZH/Z2H are the upper halves.
  468. %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
  469. %define Z2(x) [zq + o3q + mmsize*(x&1)]
  470. %define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
  471. %define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
  472. %macro DECL_PASS 2+ ; name, payload
       ; Emits a pass function called <name> whose loop body is <payload>.
       ; In: z = data pointer, w = factor table pointer, n = loop count.
       ; Sets the strides o1 = n*8 and o3 = n*48 used by the Z()/Z2() macros.
       ; Each iteration advances z by two vectors and w by one vector, and
       ; decrements n by mmsize/8 until it is no longer positive.
  473. align 16
  474. %1:
  475. DEFINE_ARGS z, w, n, o1, o3
  476. lea o3q, [nq*3]
  477. lea o1q, [nq*8]
  478. shl o3q, 4 ; o3 = (n*3) << 4 = n*48
  479. .loop:
  480. %2
  481. add zq, mmsize*2
  482. add wq, mmsize
  483. sub nd, mmsize/8
  484. jg .loop
  485. rep ret
  486. %endmacro
  487. %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
       ; %1 = suffix selecting the dispatch_tab<suffix> table,
       ; %2 = register holding nbits; the table entry used is index nbits-2.
       ; Under PIC the table entries are stored relative to the section start
       ; (see SECTION_REL), so the section base is added back before the call.
       ; r2 and (under PIC) r3 are overwritten here.
  488. lea r2, [dispatch_tab%1]
  489. mov r2, [r2 + (%2q-2)*gprsize]
  490. %ifdef PIC
  491. lea r3, [$$]
  492. add r2, r3
  493. %endif
  494. call r2
  495. %endmacro ; FFT_DISPATCH
  496. INIT_YMM avx
  497. %if HAVE_AVX
  498. %macro INTERL_AVX 5
       ; %1, %2 = source registers, %3 = scratch, %4 = addressing macro name (Z or Z2),
       ; %5 = block index. Interleaves %2 with %1 and stores the result, 128 bits
       ; at a time, over blocks %5 and %5+1. %2 and %3 are clobbered.
  499. vunpckhps %3, %2, %1
  500. vunpcklps %2, %2, %1
  501. vextractf128 %4(%5), %2, 0
  502. vextractf128 %4 %+ H(%5), %3, 0
  503. vextractf128 %4(%5 + 1), %2, 1
  504. vextractf128 %4 %+ H(%5 + 1), %3, 1
  505. %endmacro
  506. %define INTERL INTERL_AVX
       ; pass_avx stores split results (PASS_BIG 1); pass_interleave_avx stores
       ; through INTERL (PASS_BIG 0).
  507. DECL_PASS pass_avx, PASS_BIG 1
  508. DECL_PASS pass_interleave_avx, PASS_BIG 0
  509. cglobal fft_calc, 2,5,8
       ; AVX entry point: run the interleaving FFT selected by the context's nbits.
       ; In:  r0 = FFTContext pointer, r1 = data pointer.
       ; The dispatched routine expects the data pointer in r0 and nbits in r1.
       ; Unlike the SSE fft_calc there is no fix-up for small nbits here.
       ; NOTE(review): presumably the caller only selects this version for sizes
       ; whose table entry already interleaves - confirm on the C side.
  510. mov r3d, [r0 + FFTContext.nbits]
  511. mov r0, r1
  512. mov r1, r3
  513. FFT_DISPATCH _interleave %+ SUFFIX, r1
       vzeroupper ; clear the upper ymm halves before returning, as fft_dispatch and imdct_half do for mmsize == 32
  514. REP_RET
  515. %endif
  516. INIT_XMM sse
  517. %macro INTERL_SSE 5
       ; %1, %2 = source registers, %3 = scratch, %4 = addressing macro name (Z or Z2),
       ; %5 = block index. Interleaves %2 with %1; the low-half result goes to
       ; block %5 and the high-half result to block %5+1. %2 and %3 are clobbered.
  518. mova %3, %2
  519. unpcklps %2, %1
  520. unpckhps %3, %1
  521. mova %4(%5), %2
  522. mova %4(%5+1), %3
  523. %endmacro
  524. %define INTERL INTERL_SSE
       ; pass_sse stores split results (PASS_BIG 1); pass_interleave_sse stores
       ; through INTERL (PASS_BIG 0).
  525. DECL_PASS pass_sse, PASS_BIG 1
  526. DECL_PASS pass_interleave_sse, PASS_BIG 0
  527. cglobal fft_calc, 2,5,8
       ; SSE entry point: run the interleaving FFT selected by the context's nbits.
       ; In:  r0 = FFTContext pointer, r1 = data pointer.
       ; For nbits <= 4 the dispatched routine leaves its output in split
       ; blocks, so the tail loop below interleaves it in place.
  528. mov r3d, [r0 + FFTContext.nbits]
       ; Keep the data pointer and nbits across the dispatched call.
  529. PUSH r1
  530. PUSH r3
  531. mov r0, r1
  532. mov r1, r3
  533. FFT_DISPATCH _interleave %+ SUFFIX, r1
  534. POP rcx ; rcx = nbits
  535. POP r4 ; r4 = data pointer
  536. cmp rcx, 4
  537. jg .end ; larger sizes need no fix-up here
       ; r2 = -(8 << nbits): negative byte size of the data, used as an index
       ; that counts up to zero; r4 is moved to the end of the data to match.
  538. mov r2, -1
  539. add rcx, 3
  540. shl r2, cl
  541. sub r4, r2
  542. .loop:
       ; Interleave two adjacent 16-byte blocks and store them back in place.
  543. movaps xmm0, [r4 + r2]
  544. movaps xmm1, xmm0
  545. unpcklps xmm0, [r4 + r2 + 16]
  546. unpckhps xmm1, [r4 + r2 + 16]
  547. movaps [r4 + r2], xmm0
  548. movaps [r4 + r2 + 16], xmm1
  549. add r2, 32
  550. jl .loop
  551. .end:
  552. REP_RET
  553. cglobal fft_permute, 2,7,1
       ; Reorders the 8-byte elements at r1 according to the context's revtab:
       ; element i is written to tmpbuf[revtab[i]], then tmpbuf is copied back.
       ; In: r0 = FFTContext pointer, r1 = data pointer.
  554. mov r4, [r0 + FFTContext.revtab]
  555. mov r5, [r0 + FFTContext.tmpbuf]
  556. mov ecx, [r0 + FFTContext.nbits]
  557. mov r2, 1
  558. shl r2, cl ; r2 = 1 << nbits = element count
  559. xor r0, r0 ; r0 = element index
  560. %if ARCH_X86_32
       ; NOTE(review): reloads the data pointer from its stack slot, presumably
       ; because r1 aliases ecx on x86_32 and was overwritten above - confirm.
  561. mov r1, r1m
  562. %endif
  563. .loop:
       ; Two elements per iteration: one 16-byte load, two 8-byte scattered stores.
  564. movaps xmm0, [r1 + 8*r0]
  565. movzx r6, word [r4 + 2*r0]
  566. movzx r3, word [r4 + 2*r0 + 2]
  567. movlps [r5 + 8*r6], xmm0
  568. movhps [r5 + 8*r3], xmm0
  569. add r0, 2
  570. cmp r0, r2
  571. jl .loop
       ; Copy tmpbuf back over the data: r2 becomes the byte size, both pointers
       ; move to the end, and -r2 counts up to zero.
  572. shl r2, 3
  573. add r1, r2
  574. add r5, r2
  575. neg r2
  576. ; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
  577. .loopcopy:
  578. movaps xmm0, [r5 + r2]
  579. movaps xmm1, [r5 + r2 + 16]
  580. movaps [r1 + r2], xmm0
  581. movaps [r1 + r2 + 16], xmm1
  582. add r2, 32
  583. jl .loopcopy
  584. REP_RET
  585. cglobal imdct_calc, 3,5,3
       ; Calls the context's imdcthalf function with the output pointer advanced
       ; by mdctsize bytes, then fills the rest of the output buffer from that
       ; result by reversing 16-byte groups (one side is also negated).
       ; In: r0 = FFTContext pointer, r1 = output pointer, r2 = input pointer.
  586. mov r3d, [r0 + FFTContext.mdctsize]
  587. mov r4, [r0 + FFTContext.imdcthalf]
  588. add r1, r3
       ; Keep mdctsize and the advanced output pointer across the call.
  589. PUSH r3
  590. PUSH r1
  591. %if ARCH_X86_32
       ; Pass the three arguments on the stack for the callee.
  592. push r2
  593. push r1
  594. push r0
  595. %else
       ; NOTE(review): presumably keeps the stack 16-byte aligned at the call - confirm.
  596. sub rsp, 8
  597. %endif
  598. call r4
  599. %if ARCH_X86_32
  600. add esp, 12
  601. %else
  602. add rsp, 8
  603. %endif
  604. POP r1
  605. POP r3
       ; r1 = output + mdctsize, r0 = r1 + 2*mdctsize.
       ; r3 walks down from mdctsize-16, r2 walks up from -mdctsize to 0.
  606. lea r0, [r1 + 2*r3]
  607. mov r2, r3
  608. sub r3, 16
  609. neg r2
  610. movaps xmm2, [ps_m1m1m1m1]
  611. .loop:
  612. movaps xmm0, [r1 + r3]
  613. movaps xmm1, [r0 + r2]
  614. shufps xmm0, xmm0, 0x1b ; reverse the four floats
  615. shufps xmm1, xmm1, 0x1b
  616. xorps xmm0, xmm2 ; negate all four floats
  617. movaps [r0 + r3], xmm1
  618. movaps [r1 + r2], xmm0
  619. sub r3, 16
  620. add r2, 16
  621. jl .loop
  622. REP_RET
  623. INIT_MMX 3dnow
       ; Map the SSE mnemonics used by the PASS_* and INTERL macros onto their
       ; 3DNow!/MMX counterparts (undone again before the IMDCT code).
  624. %define mulps pfmul
  625. %define addps pfadd
  626. %define subps pfsub
  627. %define unpcklps punpckldq
  628. %define unpckhps punpckhdq
  629. DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
  630. DECL_PASS pass_interleave_3dnow, PASS_BIG 0
       ; The 3dnow2 variants reuse the 3dnow pass functions.
  631. %define pass_3dnow2 pass_3dnow
  632. %define pass_interleave_3dnow2 pass_interleave_3dnow
       ; Under PIC, dispatch table entries are stored relative to the section
       ; start; FFT_DISPATCH adds the base back at run time.
  633. %ifdef PIC
  634. %define SECTION_REL - $$
  635. %else
  636. %define SECTION_REL
  637. %endif
  638. %macro DECL_FFT 1-2 ; nbits, suffix
       ; For the current instruction set (SUFFIX) this emits:
       ;  - one fft<n><suffix><SUFFIX> function per size n = 1<<nbits .. 1<<16,
       ;  - the table dispatch_tab<suffix><SUFFIX> listing every size from fft4 up,
       ;  - the public entry point fft_dispatch<suffix>.
       ; %1 is the log2 of the first size generated here; smaller sizes must already
       ; exist as hand-written routines. %2 is an optional name suffix
       ; (e.g. _interleave).
  639. %ifidn %0, 1
  640. %xdefine fullsuffix SUFFIX
  641. %else
  642. %xdefine fullsuffix %2 %+ SUFFIX
  643. %endif
       ; Table entries for the hand-written sizes.
  644. %xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
  645. %if %1>=5
  646. %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
  647. %endif
  648. %if %1>=6
  649. %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
  650. %endif
  651. %assign n 1<<%1
  652. %rep 17-%1
  653. %assign n2 n/2
  654. %assign n4 n/4
  655. %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
  656. align 16
       ; Size-n transform: one n/2 transform and two n/4 transforms (always the
       ; plain SUFFIX versions) on successive parts of the data, then a tail-jump
       ; into the pass function with r1 = cos_n table and r2d = n4/2 as the count.
       ; NOTE(review): the (n&(-2<<%1)) terms adjust the pointer steps for the
       ; smallest generated sizes - confirm the intended layout before changing them.
  657. fft %+ n %+ fullsuffix:
  658. call fft %+ n2 %+ SUFFIX
  659. add r0, n*4 - (n&(-2<<%1))
  660. call fft %+ n4 %+ SUFFIX
  661. add r0, n*2 - (n2&(-2<<%1))
  662. call fft %+ n4 %+ SUFFIX
  663. sub r0, n*6 + (n2&(-2<<%1))
  664. lea r1, [cos_ %+ n]
  665. mov r2d, n4/2
  666. jmp pass %+ fullsuffix
  667. %assign n n*2
  668. %endrep
  669. %undef n
  670. align 8
  671. dispatch_tab %+ fullsuffix: pointer list_of_fft
  672. section .text
  673. ; On x86_32, this function does the register saving and restoring for all of fft.
  674. ; The others pass args in registers and don't spill anything.
  675. cglobal fft_dispatch%2, 2,5,8, z, nbits
  676. FFT_DISPATCH fullsuffix, nbits
  677. %if mmsize == 32
  678. vzeroupper
  679. %endif
  680. RET
  681. %endmacro ; DECL_FFT
  682. %if HAVE_AVX
       ; Generated sizes start at 1<<6 for AVX, 1<<5 for SSE and 1<<4 for 3DNow!;
       ; each instruction set gets a plain and an _interleave variant.
  683. INIT_YMM avx
  684. DECL_FFT 6
  685. DECL_FFT 6, _interleave
  686. %endif
  687. INIT_XMM sse
  688. DECL_FFT 5
  689. DECL_FFT 5, _interleave
  690. INIT_MMX 3dnow
  691. DECL_FFT 4
  692. DECL_FFT 4, _interleave
  693. INIT_MMX 3dnow2
  694. DECL_FFT 4
  695. DECL_FFT 4, _interleave
  696. INIT_XMM sse
       ; Drop the 3DNow! mnemonic aliases so the code below uses the real SSE instructions.
  697. %undef mulps
  698. %undef addps
  699. %undef subps
  700. %undef unpcklps
  701. %undef unpckhps
  702. %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
       ; Loads 16 bytes of input at offset %2*4 and 16 bytes ending at offset %1*4,
       ; plus the matching 8-byte pieces of the tcos/tsin tables, and multiplies
       ; them as complex pairs. Results are left interleaved in xmm1 (low pairs)
       ; and xmm0 (high pairs). Uses xmm0-xmm5.
  703. movaps xmm0, [%3+%2*4]
  704. movaps xmm1, [%3+%1*4-0x10]
  705. movaps xmm2, xmm0
  706. shufps xmm0, xmm1, 0x88
  707. shufps xmm1, xmm2, 0x77
  708. movlps xmm4, [%4+%2*2]
  709. movlps xmm5, [%5+%2*2+0x0]
  710. movhps xmm4, [%4+%1*2-0x8]
  711. movhps xmm5, [%5+%1*2-0x8]
  712. movaps xmm2, xmm0
  713. movaps xmm3, xmm1
  714. mulps xmm0, xmm5
  715. mulps xmm1, xmm4
  716. mulps xmm2, xmm4
  717. mulps xmm3, xmm5
  718. subps xmm1, xmm0 ; xmm1*tcos - xmm0*tsin
  719. addps xmm2, xmm3 ; xmm0*tcos + xmm1*tsin
  720. movaps xmm0, xmm1
  721. unpcklps xmm1, xmm2
  722. unpckhps xmm0, xmm2
  723. %endmacro
  724. %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
       ; With C = [%5+%1] and S = [%6+%1] (the callers pass tcos and tsin):
       ;   %2 = %2*S - %3*C
       ;   %3 = %3*S + %2*C   (using the original %2)
       ; %4 is accepted but not used. m6 and m7 are clobbered.
  725. mulps m6, %3, [%5+%1]
  726. mulps m7, %2, [%5+%1]
  727. mulps %2, %2, [%6+%1]
  728. mulps %3, %3, [%6+%1]
  729. subps %2, %2, m6
  730. addps %3, %3, m7
  731. %endmacro
  732. %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
       ; Post-rotation loop, AVX version. Each iteration processes 64 bytes at
       ; offset %1*2 and 64 bytes at offset %2*2 from %3: both are rotated with
       ; CMUL, one register of each pair is fully reversed, and the results are
       ; interleaved and stored back. %1 steps up and %2 steps down by 0x20; the
       ; loop ends once %1 is no longer negative. Uses ymm0-ymm7.
  733. .post:
  734. vmovaps ymm1, [%3+%1*2]
  735. vmovaps ymm0, [%3+%1*2+0x20]
  736. vmovaps ymm3, [%3+%2*2]
  737. vmovaps ymm2, [%3+%2*2+0x20]
  738. CMUL %1, ymm0, ymm1, %3, %4, %5
  739. CMUL %2, ymm2, ymm3, %3, %4, %5
       ; Reverse all eight floats: within each 128-bit half, then swap the halves.
  740. vshufps ymm1, ymm1, ymm1, 0x1b
  741. vshufps ymm3, ymm3, ymm3, 0x1b
  742. vperm2f128 ymm1, ymm1, ymm1, 0x01
  743. vperm2f128 ymm3, ymm3, ymm3, 0x01
  744. vunpcklps ymm6, ymm2, ymm1
  745. vunpckhps ymm4, ymm2, ymm1
  746. vunpcklps ymm7, ymm0, ymm3
  747. vunpckhps ymm5, ymm0, ymm3
  748. vextractf128 [%3+%1*2], ymm7, 0
  749. vextractf128 [%3+%1*2+0x10], ymm5, 0
  750. vextractf128 [%3+%1*2+0x20], ymm7, 1
  751. vextractf128 [%3+%1*2+0x30], ymm5, 1
  752. vextractf128 [%3+%2*2], ymm6, 0
  753. vextractf128 [%3+%2*2+0x10], ymm4, 0
  754. vextractf128 [%3+%2*2+0x20], ymm6, 1
  755. vextractf128 [%3+%2*2+0x30], ymm4, 1
  756. sub %2, 0x20
  757. add %1, 0x20
  758. jl .post
  759. %endmacro
  760. %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
       ; Post-rotation loop, SSE version. Each iteration processes 32 bytes at
       ; offset %1*2 and 32 bytes at offset %2*2 from %3: both are rotated with
       ; CMUL, one register of each pair is reversed, and the results are
       ; interleaved and stored back. %1 steps up and %2 steps down by 0x10; the
       ; loop ends once %1 is no longer negative. Uses xmm0-xmm7.
  761. .post:
  762. movaps xmm1, [%3+%1*2]
  763. movaps xmm0, [%3+%1*2+0x10]
  764. CMUL %1, xmm0, xmm1, %3, %4, %5
  765. movaps xmm5, [%3+%2*2]
  766. movaps xmm4, [%3+%2*2+0x10]
  767. CMUL %2, xmm4, xmm5, %3, %4, %5
  768. shufps xmm1, xmm1, 0x1b ; reverse the four floats
  769. shufps xmm5, xmm5, 0x1b
  770. movaps xmm6, xmm4
  771. unpckhps xmm4, xmm1
  772. unpcklps xmm6, xmm1
  773. movaps xmm2, xmm0
  774. unpcklps xmm0, xmm5
  775. unpckhps xmm2, xmm5
  776. movaps [%3+%2*2], xmm6
  777. movaps [%3+%2*2+0x10], xmm4
  778. movaps [%3+%1*2], xmm0
  779. movaps [%3+%1*2+0x10], xmm2
  780. sub %2, 0x10
  781. add %1, 0x10
  782. jl .post
  783. %endmacro
  784. %macro DECL_IMDCT 1
       ; Emits imdct_half for the current instruction set; %1 names the
       ; post-rotation macro to use (POSROTATESHUF or POSROTATESHUF_AVX).
       ; Three stages: pre-rotation with scattered stores through revtab, an
       ; in-place FFT via FFT_DISPATCH, then the post-rotation loop.
  785. cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
  786. %if ARCH_X86_64
  787. %define rrevtab r7
  788. %define rtcos r8
  789. %define rtsin r9
  790. %else
       ; x86_32: rrevtab and rtsin share r6, so all three table pointers are
       ; kept on the stack and reloaded inside the loop.
  791. %define rrevtab r6
  792. %define rtsin r6
  793. %define rtcos r5
  794. %endif
  795. mov r3d, [r0+FFTContext.mdctsize]
  796. add r2, r3 ; input + mdctsize bytes
  797. shr r3, 1
  798. mov rtcos, [r0+FFTContext.tcos]
  799. mov rtsin, [r0+FFTContext.tsin]
  800. add rtcos, r3 ; both table pointers advanced by mdctsize/2 bytes
  801. add rtsin, r3
  802. %if ARCH_X86_64 == 0
  803. push rtcos
  804. push rtsin
  805. %endif
  806. shr r3, 1
  807. mov rrevtab, [r0+FFTContext.revtab]
  808. add rrevtab, r3 ; revtab advanced by mdctsize/4 bytes
  809. %if ARCH_X86_64 == 0
       ; Stack is now [esp] = rrevtab, [esp+4] = rtsin, [esp+8] = rtcos.
  810. push rrevtab
  811. %endif
       ; r3 counts down from mdctsize/4 - 4 in steps of 4; r4 is its mirror (-r3
       ; at loop entry, stepping up by 4).
  812. sub r3, 4
  813. %if ARCH_X86_64
  814. xor r4, r4
  815. sub r4, r3
  816. %endif
  817. .pre:
  818. %if ARCH_X86_64 == 0
  819. ;unspill
  820. xor r4, r4
  821. sub r4, r3
  822. mov rtsin, [esp+4]
  823. mov rtcos, [esp+8]
  824. %endif
  825. PREROTATER r4, r3, r2, rtcos, rtsin
       ; Scatter the four 8-byte results to output[revtab[...]].
  826. %if ARCH_X86_64
  827. movzx r5, word [rrevtab+r4-4]
  828. movzx r6, word [rrevtab+r4-2]
  829. movzx r10, word [rrevtab+r3]
  830. movzx r11, word [rrevtab+r3+2]
  831. movlps [r1+r5 *8], xmm0
  832. movhps [r1+r6 *8], xmm0
  833. movlps [r1+r10*8], xmm1
  834. movhps [r1+r11*8], xmm1
  835. add r4, 4
  836. %else
  837. mov r6, [esp]
  838. movzx r5, word [r6+r4-4]
  839. movzx r4, word [r6+r4-2]
  840. movlps [r1+r5*8], xmm0
  841. movhps [r1+r4*8], xmm0
  842. movzx r5, word [r6+r3]
  843. movzx r4, word [r6+r3+2]
  844. movlps [r1+r5*8], xmm1
  845. movhps [r1+r4*8], xmm1
  846. %endif
  847. sub r3, 4
  848. jns .pre
       ; Run the FFT in place on the output buffer; keep the context in r5 and
       ; the output pointer in r6 across the call.
  849. mov r5, r0
  850. mov r6, r1
  851. mov r0, r1
  852. mov r1d, [r5+FFTContext.nbits]
  853. FFT_DISPATCH SUFFIX, r1
       ; Post-rotation setup: r6 = output + mdctsize, r0 = -(mdctsize/2),
       ; r1 = mdctsize/2 - mmsize.
  854. mov r0d, [r5+FFTContext.mdctsize]
  855. add r6, r0
  856. shr r0, 1
  857. %if ARCH_X86_64 == 0
  858. %define rtcos r2
  859. %define rtsin r3
  860. mov rtcos, [esp+8]
  861. mov rtsin, [esp+4]
  862. %endif
  863. neg r0
  864. mov r1, -mmsize
  865. sub r1, r0
  866. %1 r0, r1, r6, rtcos, rtsin
  867. %if ARCH_X86_64 == 0
  868. add esp, 12 ; drop the three spilled table pointers
  869. %endif
  870. %if mmsize == 32
  871. vzeroupper
  872. %endif
  873. RET
  874. %endmacro
  875. DECL_IMDCT POSROTATESHUF
       ; The instance above is assembled under the INIT_XMM sse set earlier in the
       ; file; the one below is the AVX build and uses the ymm post-rotation macro.
  876. INIT_YMM avx
  877. %if HAVE_AVX
  878. DECL_IMDCT POSROTATESHUF_AVX
  879. %endif