You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

862 lines
21KB

  1. ;******************************************************************************
  2. ;* FFT transform with SSE/3DNow optimizations
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2011 Vitor Sessak
  5. ;*
  6. ;* This algorithm (though not any of the implementation details) is
  7. ;* based on libdjbfft by D. J. Bernstein.
  8. ;*
  9. ;* This file is part of FFmpeg.
  10. ;*
  11. ;* FFmpeg is free software; you can redistribute it and/or
  12. ;* modify it under the terms of the GNU Lesser General Public
  13. ;* License as published by the Free Software Foundation; either
  14. ;* version 2.1 of the License, or (at your option) any later version.
  15. ;*
  16. ;* FFmpeg is distributed in the hope that it will be useful,
  17. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  18. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  19. ;* Lesser General Public License for more details.
  20. ;*
  21. ;* You should have received a copy of the GNU Lesser General Public
  22. ;* License along with FFmpeg; if not, write to the Free Software
  23. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24. ;******************************************************************************
  25. ; These functions are not individually interchangeable with the C versions.
  26. ; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
  27. ; in blocks as convenient to the vector size.
  28. ; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
  29. %include "libavutil/x86/x86inc.asm"
  ; While the struc below is being declared, `pointer` reserves one
  ; pointer-sized field: resq when ARCH_X86_64 is set, resd otherwise.
  30. %if ARCH_X86_64
  31. %define pointer resq
  32. %else
  33. %define pointer resd
  34. %endif
  35. SECTION_RODATA
  ; Field offsets referenced in this file as FFTContext.<field>.
  ; NOTE(review): presumably has to mirror the layout of the C-side
  ; FFTContext structure -- confirm against the C header.
  36. struc FFTContext
  37. .nbits: resd 1 ; read by imdct_half and passed to FFT_DISPATCH
  38. .reverse: resd 1 ; not referenced in this file
  39. .revtab: pointer 1 ; read by imdct_half; indexed there as 16-bit entries
  40. .tmpbuf: pointer 1 ; not referenced in this file
  41. .mdctsize: resd 1 ; read by imdct_half
  42. .mdctbits: resd 1 ; not referenced in this file
  43. .tcos: pointer 1 ; read by imdct_half
  44. .tsin: pointer 1 ; read by imdct_half
  45. endstruc
  ; Numeric constants: 1/sqrt(2), cos(pi/8) and cos(3*pi/8).
  46. %define M_SQRT1_2 0.70710678118654752440
  47. %define M_COS_PI_1_8 0.923879532511287
  48. %define M_COS_PI_3_8 0.38268343236509
  ; Constant tables, 32-byte aligned so they can be used as full-width
  ; memory operands. Entries written as 1<<31 are sign-bit masks: XORing
  ; with them negates the corresponding float lane.
  49. align 32
  50. ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 ; used by fft16_avx
  51. ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 ; used by fft16_avx
  52. ps_root2: times 8 dd M_SQRT1_2 ; used by T8_SSE and the 3DNow fft8
  53. ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 ; used by T8_SSE
  54. ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 ; sign mask used by T4_SSE: negates lane 2 of each group of four
  55. perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 ; vpermilps control used by T8_AVX
  56. perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 ; vpermilps control used by T8_AVX
  57. ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 ; multiplier used by T8_AVX
  58. ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 ; sign mask used by T8_AVX
  59. ps_m1p1: dd 1<<31, 0 ; sign mask used by the 3DNow code: negates the low lane
  ; Declare the external twiddle tables cos_16, cos_32, ..., cos_65536:
  ; i starts at 16 and doubles on each of the 13 repetitions.
  60. %assign i 16
  61. %rep 13
  62. cextern cos_ %+ i
  63. %assign i i<<1
  64. %endrep
  ; From here on `pointer` emits one pointer-sized data item (dq/dd) instead
  ; of reserving struc space; DECL_FFT uses it to build its dispatch table.
  65. %if ARCH_X86_64
  66. %define pointer dq
  67. %else
  68. %define pointer dd
  69. %endif
  ; Conditional-emit helpers, invoked as IF%1 <instruction> from macros whose
  ; flag argument is 0 or 1: IF0 discards its argument, IF1 emits it verbatim.
  70. %macro IF0 1+
  71. %endmacro
  72. %macro IF1 1+
  73. %1
  74. %endmacro
  75. SECTION_TEXT
  ; 3DNow sum/difference pair: z0 = mem0 + mem1, z1 = mem0 - mem1.
  76. %macro T2_3DN 4 ; z0, z1, mem0, mem1
  77. mova %1, %3 ; z0 = mem0
  78. mova %2, %1 ; z1 = mem0
  79. pfadd %1, %4 ; z0 = mem0 + mem1
  80. pfsub %2, %4 ; z1 = mem0 - mem1
  81. %endmacro
  ; 3DNow 4-point combine step on two-float registers.
  ; z0..z3 are updated in place to {r0,i0}, {r1,i1}, {r2,i2}, {r3,i3};
  ; tmp0 and tmp1 are clobbered. Uses the ps_m1p1 sign mask and pswapd.
  82. %macro T4_3DN 6 ; z0, z1, z2, z3, tmp0, tmp1
  83. mova %5, %3 ; tmp0 = z2
  84. pfsub %3, %4 ; z2 = z2 - z3
  85. pfadd %5, %4 ; {t6,t5}
  86. pxor %3, [ps_m1p1] ; {t8,t7}
  87. mova %6, %1 ; tmp1 = z0
  88. pswapd %3, %3 ; exchange the two floats of z2
  89. pfadd %1, %5 ; {r0,i0}
  90. pfsub %6, %5 ; {r2,i2}
  91. mova %4, %2 ; z3 = z1
  92. pfadd %2, %3 ; {r1,i1}
  93. pfsub %4, %3 ; {r3,i3}
  94. SWAP %3, %6 ; presumably an x86inc register rename so that z2 names {r2,i2} -- confirm against x86inc
  95. %endmacro
  ; 8-point transform on two ymm registers (used by fft8_avx and fft16_avx).
  ; Reads the tables ps_p1p1m1p1root2, perm1, perm2 and ps_m1m1p1m1p1m1m1m1.
  96. ; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
  97. ; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
  98. ; %3, %4, %5 tmp
  99. ; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
  100. ; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
  101. %macro T8_AVX 5
  102. vsubps %5, %1, %2 ; v = %1 - %2
  103. vaddps %3, %1, %2 ; w = %1 + %2
  104. vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
  105. vpermilps %2, %2, [perm1] ; per-lane permute of v, control taken from perm1
  106. vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
  107. vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
  108. vsubps %4, %5, %1 ; s = r - q
  109. vaddps %1, %5, %1 ; u = r + q
  110. vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
  111. vshufps %5, %4, %1, 0xbb
  112. vshufps %3, %4, %1, 0xee
  113. vperm2f128 %3, %3, %5, 0x13
  114. vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
  115. vshufps %2, %1, %4, 0xdd
  116. vshufps %1, %1, %4, 0x88
  117. vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
  118. vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
  119. vsubps %5, %1, %3
  120. vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
  121. vsubps %2, %4, %1 ; %2 = v - w
  122. vaddps %1, %4, %1 ; %1 = v + w
  123. %endmacro
  124. ; In SSE mode do one fft4 transform
  125. ; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
  126. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
  127. ;
  128. ; In AVX mode do two fft4 transforms
  129. ; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
  130. ; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
  ; %3 is a temporary and is clobbered.
  ; The body uses three-operand syntax throughout; presumably x86inc expands
  ; this for non-AVX targets -- confirm against x86inc.
  131. %macro T4_SSE 3
  132. subps %3, %1, %2 ; {t3,t4,-t8,t7}
  133. addps %1, %1, %2 ; {t1,t2,t6,t5}
  134. xorps %3, %3, [ps_p1p1m1p1] ; flip the sign of lane 2 (the -t8 term)
  135. shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
  136. shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
  137. subps %3, %1, %2 ; {r2,i2,r3,i3}
  138. addps %1, %1, %2 ; {r0,i0,r1,i1}
  139. shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
  140. shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
  141. %endmacro
  142. ; In SSE mode do one FFT8
  143. ; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
  144. ; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
  145. ;
  146. ; In AVX mode do two FFT8
  147. ; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
  148. ; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
  149. ; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
  150. ; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
  ; %5 and %6 are temporaries and are clobbered.
  ; Reads the constant tables ps_root2mppm and ps_root2.
  151. %macro T8_SSE 6
  152. addps %6, %3, %4 ; {t1,t2,t3,t4}
  153. subps %3, %3, %4 ; {r5,i5,r7,i7}
  154. shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
  155. mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
  156. mulps %4, %4, [ps_root2] ; scale the swapped copy by 1/sqrt(2)
  157. addps %3, %3, %4 ; {t8,t7,ta,t9}
  158. shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
  159. shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
  160. subps %3, %6, %4 ; {t6,t5,tc,tb}
  161. addps %6, %6, %4 ; {t1,t2,t9,ta}
  162. shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
  163. shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
  164. subps %3, %1, %6 ; {r4,r5,r6,r7}
  165. addps %1, %1, %6 ; {r0,r1,r2,r3}
  166. subps %4, %2, %5 ; {i4,i5,i6,i7}
  167. addps %2, %2, %5 ; {i0,i1,i2,i3}
  168. %endmacro
  ; One combining pass over the blocks addressed by Z(0..5), Z2(6), Z2(7).
  ;   %1 : 1 = load m4..m7 from Z(4), Z(5), Z2(6), Z2(7) first;
  ;        0 = the caller has already placed those values in m4..m7
  ;   %2 : operand supplying wre
  ;   %3 : operand supplying wim
  ; All eight result blocks are stored back unconditionally; m0..m7 are
  ; clobbered. The instruction order is hand-scheduled -- do not reorder.
  169. ; scheduled for cpu-bound sizes
  170. %macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
  171. IF%1 mova m4, Z(4)
  172. IF%1 mova m5, Z(5)
  173. mova m0, %2 ; wre
  174. mova m1, %3 ; wim
  175. mulps m2, m4, m0 ; r2*wre
  176. IF%1 mova m6, Z2(6)
  177. mulps m3, m5, m1 ; i2*wim
  178. IF%1 mova m7, Z2(7)
  179. mulps m4, m4, m1 ; r2*wim
  180. mulps m5, m5, m0 ; i2*wre
  181. addps m2, m2, m3 ; r2*wre + i2*wim
  182. mulps m3, m1, m7 ; i3*wim
  183. subps m5, m5, m4 ; i2*wre - r2*wim
  184. mulps m1, m1, m6 ; r3*wim
  185. mulps m4, m0, m6 ; r3*wre
  186. mulps m0, m0, m7 ; i3*wre
  187. subps m4, m4, m3 ; r3*wre - i3*wim
  188. mova m3, Z(0)
  189. addps m0, m0, m1 ; i3*wre + r3*wim
  190. subps m1, m4, m2 ; t3
  191. addps m4, m4, m2 ; t5
  192. subps m3, m3, m4 ; r2
  193. addps m4, m4, Z(0) ; r0
  194. mova m6, Z(2)
  195. mova Z(4), m3
  196. mova Z(0), m4
  197. subps m3, m5, m0 ; t4
  198. subps m4, m6, m3 ; r3
  199. addps m3, m3, m6 ; r1
  200. mova Z2(6), m4
  201. mova Z(2), m3
  202. mova m2, Z(3)
  203. addps m3, m5, m0 ; t6
  204. subps m2, m2, m1 ; i3
  205. mova m7, Z(1)
  206. addps m1, m1, Z(3) ; i1
  207. mova Z2(7), m2
  208. mova Z(3), m1
  209. subps m4, m7, m3 ; i2
  210. addps m3, m3, m7 ; i0
  211. mova Z(5), m4
  212. mova Z(1), m3
  213. %endmacro
  ; Same arithmetic as PASS_SMALL, with the twiddles always read from [wq]
  ; (wre) and [wq+o1q] (wim), and with a different store schedule.
  ;   %1 : 1 = store every result block in the split re/im layout;
  ;        0 = interleave results with INTERL before storing. In that mode
  ;            only Z(0) and Z(4) get the plain stores, and they are reloaded
  ;            below and overwritten by the interleaved data.
  ; Relies on the INTERL definition active at expansion time.
  ; m0..m7 are clobbered.
  214. ; scheduled to avoid store->load aliasing
  215. %macro PASS_BIG 1 ; (!interleave)
  216. mova m4, Z(4) ; r2
  217. mova m5, Z(5) ; i2
  218. mova m0, [wq] ; wre
  219. mova m1, [wq+o1q] ; wim
  220. mulps m2, m4, m0 ; r2*wre
  221. mova m6, Z2(6) ; r3
  222. mulps m3, m5, m1 ; i2*wim
  223. mova m7, Z2(7) ; i3
  224. mulps m4, m4, m1 ; r2*wim
  225. mulps m5, m5, m0 ; i2*wre
  226. addps m2, m2, m3 ; r2*wre + i2*wim
  227. mulps m3, m1, m7 ; i3*wim
  228. mulps m1, m1, m6 ; r3*wim
  229. subps m5, m5, m4 ; i2*wre - r2*wim
  230. mulps m4, m0, m6 ; r3*wre
  231. mulps m0, m0, m7 ; i3*wre
  232. subps m4, m4, m3 ; r3*wre - i3*wim
  233. mova m3, Z(0)
  234. addps m0, m0, m1 ; i3*wre + r3*wim
  235. subps m1, m4, m2 ; t3
  236. addps m4, m4, m2 ; t5
  237. subps m3, m3, m4 ; r2
  238. addps m4, m4, Z(0) ; r0
  239. mova m6, Z(2)
  240. mova Z(4), m3
  241. mova Z(0), m4
  242. subps m3, m5, m0 ; t4
  243. subps m4, m6, m3 ; r3
  244. addps m3, m3, m6 ; r1
  245. IF%1 mova Z2(6), m4
  246. IF%1 mova Z(2), m3
  247. mova m2, Z(3)
  248. addps m5, m5, m0 ; t6
  249. subps m2, m2, m1 ; i3
  250. mova m7, Z(1)
  251. addps m1, m1, Z(3) ; i1
  252. IF%1 mova Z2(7), m2
  253. IF%1 mova Z(3), m1
  254. subps m6, m7, m5 ; i2
  255. addps m5, m5, m7 ; i0
  256. IF%1 mova Z(5), m6
  257. IF%1 mova Z(1), m5
  ; Interleaving variant: here m1=i1, m3=r1, m2=i3, m4=r3, m5=i0, m6=i2.
  258. %if %1==0
  259. INTERL m1, m3, m7, Z, 2
  260. INTERL m2, m4, m0, Z2, 6
  261. mova m1, Z(0) ; reload r0 (stored above)
  262. mova m2, Z(4) ; reload r2 (stored above)
  263. INTERL m5, m1, m3, Z, 0
  264. INTERL m6, m2, m7, Z, 4
  265. %endif
  266. %endmacro
  ; Dword interleave of %1 with %2:
  ;   %1 <- punpckldq(%1, %2)   (low dwords)
  ;   %3 <- punpckhdq(old %1, %2)   (high dwords)
  ; %2 may be a memory operand (see the 3DNow fft8).
  267. %macro PUNPCK 3
  268. mova %3, %1 ; keep a copy of %1 for the high half
  269. punpckldq %1, %2
  270. punpckhdq %3, %2
  271. %endmacro
  ; Block addressing for the fixed-size kernels below: block x sits at byte
  ; offset mmsize*x from r0. Z and Z2 are identical here; ZH addresses the
  ; upper half (mmsize/2 bytes in) of block x. All three are redefined
  ; further down for the strided passes.
  272. %define Z(x) [r0+mmsize*x]
  273. %define Z2(x) [r0+mmsize*x]
  274. %define ZH(x) [r0+mmsize*x+mmsize/2]
  275. INIT_YMM avx
  276. %if HAVE_AVX
  ; fft8_avx: in-place 8-point transform of the two ymm blocks at r0.
  ; In:    r0 = data pointer
  ; Clobb: m0-m4
  277. align 16
  278. fft8_avx:
  279. mova m0, Z(0)
  280. mova m1, Z(1)
  281. T8_AVX m0, m1, m2, m3, m4
  282. mova Z(0), m0
  283. mova Z(1), m1
  284. ret
  ; fft16_avx: in-place 16-point transform of the four ymm blocks at r0.
  ; In:    r0 = data pointer
  ; Clobb: m0-m7
  285. align 16
  286. fft16_avx:
  287. mova m2, Z(2)
  288. mova m3, Z(3)
  289. T4_SSE m2, m3, m7 ; two fft4 on blocks 2 and 3 (AVX mode of T4_SSE)
  290. mova m0, Z(0)
  291. mova m1, Z(1)
  292. T8_AVX m0, m1, m4, m5, m7 ; fft8 on blocks 0 and 1
  293. mova m4, [ps_cos16_1]
  294. mova m5, [ps_cos16_2]
  295. vmulps m6, m2, m4
  296. vmulps m7, m3, m5
  297. vaddps m7, m7, m6 ; m7 = m2*cos16_1 + m3*cos16_2
  298. vmulps m2, m2, m5
  299. vmulps m3, m3, m4
  300. vsubps m3, m3, m2 ; m3 = m3*cos16_1 - m2*cos16_2
  301. vblendps m2, m7, m3, 0xf0 ; m2 = {low half of m7, high half of m3}
  302. vperm2f128 m3, m7, m3, 0x21 ; m3 = {high half of m7, low half of m3}
  303. vaddps m4, m2, m3
  304. vsubps m2, m3, m2
  305. vperm2f128 m2, m2, m2, 0x01 ; swap the two 128-bit halves of m2
  306. vsubps m3, m1, m2
  307. vaddps m1, m1, m2
  308. vsubps m5, m0, m4
  309. vaddps m0, m0, m4
  ; Store 128 bits at a time: each memory block receives the same half of
  ; two different result registers (m0/m1, then m5/m3).
  310. vextractf128 Z(0), m0, 0
  311. vextractf128 ZH(0), m1, 0
  312. vextractf128 Z(1), m0, 1
  313. vextractf128 ZH(1), m1, 1
  314. vextractf128 Z(2), m5, 0
  315. vextractf128 ZH(2), m3, 0
  316. vextractf128 Z(3), m5, 1
  317. vextractf128 ZH(3), m3, 1
  318. ret
  ; fft32_avx: in-place 32-point transform of the eight ymm blocks at r0.
  ; Runs fft16_avx on blocks 0-3, transforms blocks 4-7 in registers, then
  ; combines everything with PASS_SMALL using the cos_32 table.
  ; In:    r0 = data pointer
  ; Clobb: m0-m7
  319. align 16
  320. fft32_avx:
  321. call fft16_avx
  322. mova m0, Z(4)
  323. mova m1, Z(5)
  324. T4_SSE m0, m1, m4
  325. mova m2, Z(6)
  326. mova m3, Z(7)
  327. T8_SSE m0, m1, m2, m3, m4, m6
  328. ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
  329. ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
  ; Regroup 128-bit halves: m4/m5 take the low halves, m6/m7 the high halves.
  330. vperm2f128 m4, m0, m2, 0x20
  331. vperm2f128 m5, m1, m3, 0x20
  332. vperm2f128 m6, m0, m2, 0x31
  333. vperm2f128 m7, m1, m3, 0x31
  ; Flag 0: m4-m7 are already in registers, so PASS_SMALL skips its loads.
  334. PASS_SMALL 0, [cos_32], [cos_32+32]
  335. ret
  ; fft32_interleave_avx: fft32_avx followed by an in-place pass that
  ; interleaves each pair of adjacent ymm blocks element by element.
  ; In:    r0 = data pointer (advanced by the loop; not restored)
  ; Clobb: r2, m0-m7
  336. fft32_interleave_avx:
  337. call fft32_avx
  338. mov r2d, 32 ; loop counter, decremented by mmsize/4 per iteration
  339. .deint_loop:
  340. mova m2, Z(0)
  341. mova m3, Z(1)
  342. vunpcklps m0, m2, m3
  343. vunpckhps m1, m2, m3
  344. vextractf128 Z(0), m0, 0
  345. vextractf128 ZH(0), m1, 0
  346. vextractf128 Z(1), m0, 1
  347. vextractf128 ZH(1), m1, 1
  348. add r0, mmsize*2 ; advance to the next pair of blocks
  349. sub r2d, mmsize/4
  350. jg .deint_loop
  351. ret
  352. %endif
  353. INIT_XMM sse
  ; Map movdqa to movaps for the SSE build.
  354. %define movdqa movaps
  ; fft4_sse: in-place 4-point transform of the two xmm blocks at r0.
  ; fft4_avx is a second label for the same code (the AVX dispatch table
  ; references fft4 with the _avx suffix).
  ; In:    r0 = data pointer
  ; Clobb: m0-m2
  355. align 16
  356. fft4_avx:
  357. fft4_sse:
  358. mova m0, Z(0)
  359. mova m1, Z(1)
  360. T4_SSE m0, m1, m2
  361. mova Z(0), m0
  362. mova Z(1), m1
  363. ret
  ; fft8_sse: in-place 8-point transform of the four xmm blocks at r0.
  ; In:    r0 = data pointer
  ; Clobb: m0-m5
  364. align 16
  365. fft8_sse:
  366. mova m0, Z(0)
  367. mova m1, Z(1)
  368. T4_SSE m0, m1, m2 ; fft4 on blocks 0 and 1
  369. mova m2, Z(2)
  370. mova m3, Z(3)
  371. T8_SSE m0, m1, m2, m3, m4, m5 ; combine with blocks 2 and 3
  372. mova Z(0), m0
  373. mova Z(1), m1
  374. mova Z(2), m2
  375. mova Z(3), m3
  376. ret
  ; fft16_sse: in-place 16-point transform of the eight xmm blocks at r0.
  ; An fft8 on blocks 0-3 and two fft4 on blocks 4-7 are combined by
  ; PASS_SMALL with the cos_16 table.
  ; In:    r0 = data pointer
  ; Clobb: m0-m7
  377. align 16
  378. fft16_sse:
  379. mova m0, Z(0)
  380. mova m1, Z(1)
  381. T4_SSE m0, m1, m2
  382. mova m2, Z(2)
  383. mova m3, Z(3)
  384. T8_SSE m0, m1, m2, m3, m4, m5
  385. mova m4, Z(4)
  386. mova m5, Z(5)
  387. mova Z(0), m0
  388. mova Z(1), m1
  389. mova Z(2), m2
  390. mova Z(3), m3
  391. T4_SSE m4, m5, m6
  392. mova m6, Z2(6)
  393. mova m7, Z2(7)
  394. T4_SSE m6, m7, m0
  ; Flag 0: m4-m7 already hold blocks 4-7, so PASS_SMALL skips its loads.
  395. PASS_SMALL 0, [cos_16], [cos_16+16]
  396. ret
  ; Emits fft4<SUFFIX> and fft8<SUFFIX> for the current 3DNow flavour.
  ; Both work in place on the blocks at r0 and use whatever `pswapd` means
  ; at expansion time (instruction or emulation macro).
  397. %macro FFT48_3DN 0
  ; fft4: four blocks at r0; clobbers m0-m5.
  398. align 16
  399. fft4 %+ SUFFIX:
  400. T2_3DN m0, m1, Z(0), Z(1)
  401. mova m2, Z(2)
  402. mova m3, Z(3)
  403. T4_3DN m0, m1, m2, m3, m4, m5
  404. PUNPCK m0, m1, m4
  405. PUNPCK m2, m3, m5
  406. mova Z(0), m0
  407. mova Z(1), m4
  408. mova Z(2), m2
  409. mova Z(3), m5
  410. ret
  ; fft8: eight blocks at r0; clobbers m0-m7. Only eight registers are used,
  ; so Z(0), Z(2), Z(5) and Z2(7) double as spill slots for intermediates.
  411. align 16
  412. fft8 %+ SUFFIX:
  413. T2_3DN m0, m1, Z(0), Z(1)
  414. mova m2, Z(2)
  415. mova m3, Z(3)
  416. T4_3DN m0, m1, m2, m3, m4, m5
  417. mova Z(0), m0 ; spill
  418. mova Z(2), m2 ; spill
  419. T2_3DN m4, m5, Z(4), Z(5)
  420. T2_3DN m6, m7, Z2(6), Z2(7)
  421. pswapd m0, m5
  422. pswapd m2, m7
  423. pxor m0, [ps_m1p1]
  424. pxor m2, [ps_m1p1]
  425. pfsub m5, m0
  426. pfadd m7, m2
  427. pfmul m5, [ps_root2]
  428. pfmul m7, [ps_root2]
  429. T4_3DN m1, m3, m5, m7, m0, m2
  430. mova Z(5), m5 ; spill
  431. mova Z2(7), m7 ; spill
  432. mova m0, Z(0) ; reload
  433. mova m2, Z(2) ; reload
  434. T4_3DN m0, m2, m4, m6, m5, m7
  435. PUNPCK m0, m1, m5
  436. PUNPCK m2, m3, m7
  437. mova Z(0), m0
  438. mova Z(1), m5
  439. mova Z(2), m2
  440. mova Z(3), m7
  441. PUNPCK m4, Z(5), m5 ; second operand comes straight from the spill slot
  442. PUNPCK m6, Z2(7), m7
  443. mova Z(4), m4
  444. mova Z(5), m5
  445. mova Z2(6), m6
  446. mova Z2(7), m7
  447. ret
  448. %endmacro
  ; First instantiation: expanded before the pswapd macro below exists, so
  ; `pswapd` in this expansion is assembled as the plain mnemonic.
  449. INIT_MMX 3dnow2
  450. FFT48_3DN
  ; pswapd emulation for the plain-3dnow instantiation that follows:
  ; swap the two dwords of an MMX register.
  ;   same register form : spills the low dword to [r0+12] and re-reads it
  ;                        through [r0+8]; this overwrites memory at r0+12.
  ;   two register form  : %1 = {high(%2), low(%2)}; %2 is left unchanged.
  451. %macro pswapd 2
  452. %ifidn %1, %2
  453. movd [r0+12], %1 ; store the low dword as the upper half of the qword at r0+8
  454. punpckhdq %1, [r0+8] ; %1 = {old high, old low}
  455. %else
  456. movq %1, %2
  457. psrlq %1, 32 ; low dword = high(%2)
  458. punpckldq %1, %2 ; high dword = low(%2)
  459. %endif
  460. %endmacro
  ; Second instantiation: uses the emulation macro defined above.
  461. INIT_MMX 3dnow
  462. FFT48_3DN
  ; Block addressing for the pass functions built by DECL_PASS: zq is the
  ; base pointer and o1q/o3q are byte strides computed there.
  ;   Z(x)  : zq + o1q*(x&6) + mmsize*(x&1)   -- blocks 0..5
  ;   Z2(x) : zq + o3q       + mmsize*(x&1)   -- blocks 6 and 7
  ;   ZH/Z2H: the same addresses plus mmsize/2 (upper half of the block)
  463. %define Z(x) [zq + o1q*(x&6) + mmsize*(x&1)]
  464. %define Z2(x) [zq + o3q + mmsize*(x&1)]
  465. %define ZH(x) [zq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
  466. %define Z2H(x) [zq + o3q + mmsize*(x&1) + mmsize/2]
  ; Emits a pass function named %1 whose loop body is the payload %2
  ; (a PASS_SMALL or PASS_BIG invocation).
  ; Register arguments, named through DEFINE_ARGS: z = data pointer,
  ; w = twiddle pointer, n = loop count; o1 and o3 are computed here.
  ; Each iteration advances z by mmsize*2, w by mmsize and subtracts
  ; mmsize/8 from n, looping while n stays positive.
  467. %macro DECL_PASS 2+ ; name, payload
  468. align 16
  469. %1:
  470. DEFINE_ARGS z, w, n, o1, o3
  471. lea o3q, [nq*3]
  472. lea o1q, [nq*8] ; o1 = n*8
  473. shl o3q, 4 ; o3 = n*48
  474. .loop:
  475. %2
  476. add zq, mmsize*2
  477. add wq, mmsize
  478. sub nd, mmsize/8
  479. jg .loop
  480. rep ret
  481. %endmacro
  482. INIT_YMM avx
  483. %if HAVE_AVX
  ; INTERL for AVX: interleave the elements of %2 with those of %1 and
  ; store the result, 128 bits at a time, to blocks %5 and %5+1 of the
  ; addressing macro named by %4 (Z or Z2, with its ...H counterpart).
  ;   %1, %2 : source registers (%2 is overwritten)
  ;   %3     : temporary
  484. %macro INTERL_AVX 5
  485. vunpckhps %3, %2, %1
  486. vunpcklps %2, %2, %1
  487. vextractf128 %4(%5), %2, 0
  488. vextractf128 %4 %+ H(%5), %3, 0
  489. vextractf128 %4(%5 + 1), %2, 1
  490. vextractf128 %4 %+ H(%5 + 1), %3, 1
  491. %endmacro
  492. %define INTERL INTERL_AVX
  ; pass_avx stores split re/im blocks; pass_interleave_avx interleaves.
  493. DECL_PASS pass_avx, PASS_BIG 1
  494. DECL_PASS pass_interleave_avx, PASS_BIG 0
  495. %endif
  496. INIT_XMM sse
  ; INTERL for SSE (and, through the mnemonic remapping further down,
  ; 3DNow): interleave the elements of %2 with those of %1; the low half
  ; goes to block %5 and the high half to block %5+1 of the addressing
  ; macro named by %4.
  ;   %1, %2 : source registers (%2 is overwritten)
  ;   %3     : temporary
  497. %macro INTERL_SSE 5
  498. mova %3, %2
  499. unpcklps %2, %1
  500. unpckhps %3, %1
  501. mova %4(%5), %2
  502. mova %4(%5+1), %3
  503. %endmacro
  504. %define INTERL INTERL_SSE
  ; pass_sse stores split re/im blocks; pass_interleave_sse interleaves.
  505. DECL_PASS pass_sse, PASS_BIG 1
  506. DECL_PASS pass_interleave_sse, PASS_BIG 0
  507. INIT_MMX 3dnow
  ; Reuse the SSE-spelled PASS_* and INTERL_SSE bodies for 3DNow by mapping
  ; the SSE mnemonics onto their 3DNow/MMX counterparts. These defines stay
  ; active until the matching %undef lines further down.
  508. %define mulps pfmul
  509. %define addps pfadd
  510. %define subps pfsub
  511. %define unpcklps punpckldq
  512. %define unpckhps punpckhdq
  ; The non-interleaving 3DNow pass uses PASS_SMALL with loads enabled and
  ; twiddles taken from [wq] / [wq+o1q]; the interleaving one uses PASS_BIG.
  513. DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
  514. DECL_PASS pass_interleave_3dnow, PASS_BIG 0
  ; The 3dnow2 flavour shares the pass code of plain 3dnow.
  515. %define pass_3dnow2 pass_3dnow
  516. %define pass_interleave_3dnow2 pass_interleave_3dnow
  ; Under PIC the dispatch tables hold offsets relative to the section start
  ; ($$) rather than absolute addresses; SECTION_REL is the suffix that
  ; DECL_FFT appends to every table entry.
  517. %ifdef PIC
  518. %define SECTION_REL - $$
  519. %else
  520. %define SECTION_REL
  521. %endif
  ; Call the FFT routine selected from dispatch_tab<%1>.
  ;   %1 : table suffix (e.g. SUFFIX or a fullsuffix)
  ;   %2 : name of the register argument holding nbits; the table index is
  ;        nbits-2, so entry 0 is fft4
  ; Uses r2 (and r3 under PIC) on top of whatever the callee clobbers.
  522. %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
  523. lea r2, [dispatch_tab%1]
  524. mov r2, [r2 + (%2q-2)*gprsize]
  525. %ifdef PIC
  526. lea r3, [$$]
  527. add r2, r3 ; turn the section-relative entry into an address
  528. %endif
  529. call r2
  530. %endmacro ; FFT_DISPATCH
  ; Generates, for the current instruction set (SUFFIX):
  ;   * the recursive functions fft<n><fullsuffix> for n = 1<<%1, doubling
  ;     17-%1 times (so the largest n is 65536),
  ;   * dispatch_tab<fullsuffix>, a pointer table starting at fft4,
  ;   * the cglobal entry point fft_dispatch%2.
  ;   %1 : log2 of the first size to generate; smaller sizes must already
  ;        exist as hand-written kernels
  ;   %2 : optional extra suffix (e.g. _interleave); only the outermost
  ;        level uses it -- the recursive calls go to the plain SUFFIX
  ;        variants, and the final jmp targets pass<fullsuffix>
  531. %macro DECL_FFT 1-2 ; nbits, suffix
  532. %ifidn %0, 1
  533. %xdefine fullsuffix SUFFIX
  534. %else
  535. %xdefine fullsuffix %2 %+ SUFFIX
  536. %endif
  ; Table entries for the hand-written small kernels.
  537. %xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
  538. %if %1>=5
  539. %xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
  540. %endif
  541. %if %1>=6
  542. %xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
  543. %endif
  544. %assign n 1<<%1
  545. %rep 17-%1
  546. %assign n2 n/2
  547. %assign n4 n/4
  548. %xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
  ; fft<n>: one half-size and two quarter-size sub-transforms on consecutive
  ; parts of the buffer, then a tail jump into the combining pass with
  ; r0 restored to the buffer start, r1 = cos_<n> and r2d = n4/2.
  549. align 16
  550. fft %+ n %+ fullsuffix:
  551. call fft %+ n2 %+ SUFFIX
  552. add r0, n*4 - (n&(-2<<%1))
  553. call fft %+ n4 %+ SUFFIX
  554. add r0, n*2 - (n2&(-2<<%1))
  555. call fft %+ n4 %+ SUFFIX
  556. sub r0, n*6 + (n2&(-2<<%1))
  557. lea r1, [cos_ %+ n]
  558. mov r2d, n4/2
  559. jmp pass %+ fullsuffix
  560. %assign n n*2
  561. %endrep
  562. %undef n
  563. align 8
  564. dispatch_tab %+ fullsuffix: pointer list_of_fft
  565. section .text
  566. ; On x86_32, this function does the register saving and restoring for all of fft.
  567. ; The others pass args in registers and don't spill anything.
  568. cglobal fft_dispatch%2, 2,5,8, z, nbits
  569. FFT_DISPATCH fullsuffix, nbits
  ; Clear the upper ymm halves before returning from a 256-bit build.
  570. %if mmsize == 32
  571. vzeroupper
  572. %endif
  573. RET
  574. %endmacro ; DECL_FFT
  ; Instantiate the recursive FFTs, dispatch tables and entry points for
  ; each instruction set, in a plain and an _interleave variant. The first
  ; argument is log2 of the first generated size: 64 for AVX, 32 for SSE,
  ; 16 for 3DNow.
  575. %if HAVE_AVX
  576. INIT_YMM avx
  577. DECL_FFT 6
  578. DECL_FFT 6, _interleave
  579. %endif
  580. INIT_XMM sse
  581. DECL_FFT 5
  582. DECL_FFT 5, _interleave
  583. INIT_MMX 3dnow
  584. DECL_FFT 4
  585. DECL_FFT 4, _interleave
  586. INIT_MMX 3dnow2
  587. DECL_FFT 4
  588. DECL_FFT 4, _interleave
  589. INIT_XMM sse
  ; Drop the SSE->3DNow mnemonic remapping so the IMDCT code below gets the
  ; real SSE instructions again.
  590. %undef mulps
  591. %undef addps
  592. %undef subps
  593. %undef unpcklps
  594. %undef unpckhps
  ; IMDCT pre-rotation step used by the .pre loop of imdct_half.
  ; Loads 16 bytes from [%3+%2*4] and 16 bytes from [%3+%1*4-0x10], pairs
  ; them with 8 bytes each from the tables at %4 and %5 (offsets %2*2 and
  ; %1*2-8), forms two product-difference / product-sum vectors and
  ; interleaves them.
  ; Out:   xmm1 = low interleave, xmm0 = high interleave
  ; Clobb: xmm0-xmm5
  595. %macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
  596. movaps xmm0, [%3+%2*4]
  597. movaps xmm1, [%3+%1*4-0x10]
  598. movaps xmm2, xmm0
  599. shufps xmm0, xmm1, 0x88 ; even elements of both loads
  600. shufps xmm1, xmm2, 0x77 ; odd elements of both loads, in reversed order
  601. movlps xmm4, [%4+%2*2]
  602. movlps xmm5, [%5+%2*2+0x0]
  603. movhps xmm4, [%4+%1*2-0x8]
  604. movhps xmm5, [%5+%1*2-0x8]
  605. movaps xmm2, xmm0
  606. movaps xmm3, xmm1
  607. mulps xmm0, xmm5
  608. mulps xmm1, xmm4
  609. mulps xmm2, xmm4
  610. mulps xmm3, xmm5
  611. subps xmm1, xmm0 ; odd*[%4] - even*[%5]
  612. addps xmm2, xmm3 ; even*[%4] + odd*[%5]
  613. movaps xmm0, xmm1
  614. unpcklps xmm1, xmm2
  615. unpckhps xmm0, xmm2
  616. %endmacro
  ; Rotate the register pair (%2, %3) by the table values at offset %1:
  ;   %2 = %2*[%6+%1] - %3*[%5+%1]
  ;   %3 = %3*[%6+%1] + %2(old)*[%5+%1]
  ; %1 is a byte offset register, %5 and %6 are table base registers. %4 is
  ; accepted but not referenced in the body. Clobbers m6 and m7.
  617. %macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
  618. mulps m6, %3, [%5+%1]
  619. mulps m7, %2, [%5+%1]
  620. mulps %2, %2, [%6+%1]
  621. mulps %3, %3, [%6+%1]
  622. subps %2, %2, m6
  623. addps %3, %3, m7
  624. %endmacro
  ; AVX post-rotation loop of imdct_half. Each iteration handles 64 bytes at
  ; z+j*2 and 64 bytes at z+k*2: both pairs are rotated with CMUL, one
  ; register of each pair is fully reversed, and the results are
  ; interleaved and written back in place. j (%1) increases by 0x20 and
  ; k (%2) decreases by 0x20 per iteration; the loop runs while j is
  ; negative.
  ; Clobb: ymm0-ymm7, %1, %2
  625. %macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
  626. .post:
  627. vmovaps ymm1, [%3+%1*2]
  628. vmovaps ymm0, [%3+%1*2+0x20]
  629. vmovaps ymm3, [%3+%2*2]
  630. vmovaps ymm2, [%3+%2*2+0x20]
  631. CMUL %1, ymm0, ymm1, %3, %4, %5
  632. CMUL %2, ymm2, ymm3, %3, %4, %5
  ; Reverse all eight floats of ymm1 and ymm3: reverse within each 128-bit
  ; half, then swap the halves.
  633. vshufps ymm1, ymm1, ymm1, 0x1b
  634. vshufps ymm3, ymm3, ymm3, 0x1b
  635. vperm2f128 ymm1, ymm1, ymm1, 0x01
  636. vperm2f128 ymm3, ymm3, ymm3, 0x01
  637. vunpcklps ymm6, ymm2, ymm1
  638. vunpckhps ymm4, ymm2, ymm1
  639. vunpcklps ymm7, ymm0, ymm3
  640. vunpckhps ymm5, ymm0, ymm3
  641. vextractf128 [%3+%1*2], ymm7, 0
  642. vextractf128 [%3+%1*2+0x10], ymm5, 0
  643. vextractf128 [%3+%1*2+0x20], ymm7, 1
  644. vextractf128 [%3+%1*2+0x30], ymm5, 1
  645. vextractf128 [%3+%2*2], ymm6, 0
  646. vextractf128 [%3+%2*2+0x10], ymm4, 0
  647. vextractf128 [%3+%2*2+0x20], ymm6, 1
  648. vextractf128 [%3+%2*2+0x30], ymm4, 1
  649. sub %2, 0x20
  650. add %1, 0x20 ; sets the flags tested by jl
  651. jl .post
  652. %endmacro
  ; SSE post-rotation loop of imdct_half. Each iteration handles 32 bytes at
  ; z+j*2 and 32 bytes at z+k*2: both pairs are rotated with CMUL, one
  ; register of each pair is reversed, and the results are interleaved and
  ; written back in place. j (%1) increases by 0x10 and k (%2) decreases by
  ; 0x10 per iteration; the loop runs while j is negative.
  ; Clobb: xmm0-xmm2, xmm4-xmm7, %1, %2
  653. %macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
  654. .post:
  655. movaps xmm1, [%3+%1*2]
  656. movaps xmm0, [%3+%1*2+0x10]
  657. CMUL %1, xmm0, xmm1, %3, %4, %5
  658. movaps xmm5, [%3+%2*2]
  659. movaps xmm4, [%3+%2*2+0x10]
  660. CMUL %2, xmm4, xmm5, %3, %4, %5
  661. shufps xmm1, xmm1, 0x1b ; reverse the four floats
  662. shufps xmm5, xmm5, 0x1b ; reverse the four floats
  663. movaps xmm6, xmm4
  664. unpckhps xmm4, xmm1
  665. unpcklps xmm6, xmm1
  666. movaps xmm2, xmm0
  667. unpcklps xmm0, xmm5
  668. unpckhps xmm2, xmm5
  669. movaps [%3+%2*2], xmm6
  670. movaps [%3+%2*2+0x10], xmm4
  671. movaps [%3+%1*2], xmm0
  672. movaps [%3+%1*2+0x10], xmm2
  673. sub %2, 0x10
  674. add %1, 0x10 ; sets the flags tested by jl
  675. jl .post
  676. %endmacro
  ; Emits imdct_half for the current instruction set.
  ;   %1 : post-rotation macro to expand (POSROTATESHUF or POSROTATESHUF_AVX)
  ; Stages:
  ;   1. .pre loop: PREROTATER on the input, results scattered into output
  ;      at the positions given by the 16-bit revtab entries
  ;   2. in-place FFT on output through FFT_DISPATCH
  ;   3. post-rotation of output through %1
  ; On x86_32 there are not enough registers, so tcos, tsin and revtab are
  ; kept on the stack ([esp+8], [esp+4], [esp]) and reloaded as needed.
  ; NOTE(review): the cglobal numbers are presumably x86inc's arg/GPR/vector
  ; register counts -- confirm against x86inc.
  677. %macro DECL_IMDCT 1
  678. cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
  679. %if ARCH_X86_64
  680. %define rrevtab r7
  681. %define rtcos r8
  682. %define rtsin r9
  683. %else
  ; rrevtab and rtsin share r6 here; both are spilled below.
  684. %define rrevtab r6
  685. %define rtsin r6
  686. %define rtcos r5
  687. %endif
  688. mov r3d, [r0+FFTContext.mdctsize]
  689. add r2, r3 ; input += mdctsize
  690. shr r3, 1
  691. mov rtcos, [r0+FFTContext.tcos]
  692. mov rtsin, [r0+FFTContext.tsin]
  693. add rtcos, r3 ; tcos += mdctsize/2
  694. add rtsin, r3 ; tsin += mdctsize/2
  695. %if ARCH_X86_64 == 0
  696. push rtcos
  697. push rtsin
  698. %endif
  699. shr r3, 1
  700. mov rrevtab, [r0+FFTContext.revtab]
  701. add rrevtab, r3 ; revtab += mdctsize/4
  702. %if ARCH_X86_64 == 0
  703. push rrevtab
  704. %endif
  705. sub r3, 4
  ; r3 counts down from mdctsize/4 - 4 in steps of 4; r4 starts at -r3.
  706. %if ARCH_X86_64
  707. xor r4, r4
  708. sub r4, r3
  709. %endif
  710. .pre:
  711. %if ARCH_X86_64 == 0
  712. ;unspill
  713. xor r4, r4
  714. sub r4, r3
  715. mov rtsin, [esp+4]
  716. mov rtcos, [esp+8]
  717. %endif
  718. PREROTATER r4, r3, r2, rtcos, rtsin
  ; Scatter the four 8-byte results to output[revtab[...]*8].
  719. %if ARCH_X86_64
  720. movzx r5, word [rrevtab+r4-4]
  721. movzx r6, word [rrevtab+r4-2]
  722. movzx r10, word [rrevtab+r3]
  723. movzx r11, word [rrevtab+r3+2]
  724. movlps [r1+r5 *8], xmm0
  725. movhps [r1+r6 *8], xmm0
  726. movlps [r1+r10*8], xmm1
  727. movhps [r1+r11*8], xmm1
  728. add r4, 4
  729. %else
  730. mov r6, [esp] ; reload the revtab pointer
  731. movzx r5, word [r6+r4-4]
  732. movzx r4, word [r6+r4-2]
  733. movlps [r1+r5*8], xmm0
  734. movhps [r1+r4*8], xmm0
  735. movzx r5, word [r6+r3]
  736. movzx r4, word [r6+r3+2]
  737. movlps [r1+r5*8], xmm1
  738. movhps [r1+r4*8], xmm1
  739. %endif
  740. sub r3, 4
  741. jns .pre
  ; FFT stage: r5 keeps s, r6 keeps output, r0 = output, r1 = nbits.
  742. mov r5, r0
  743. mov r6, r1
  744. mov r0, r1
  745. mov r1d, [r5+FFTContext.nbits]
  746. FFT_DISPATCH SUFFIX, r1
  ; Post-rotation setup: r6 = output + mdctsize, r0 = -(mdctsize/2),
  ; r1 = -mmsize - r0.
  747. mov r0d, [r5+FFTContext.mdctsize]
  748. add r6, r0
  749. shr r0, 1
  750. %if ARCH_X86_64 == 0
  751. %define rtcos r2
  752. %define rtsin r3
  753. mov rtcos, [esp+8]
  754. mov rtsin, [esp+4]
  755. %endif
  756. neg r0
  757. mov r1, -mmsize
  758. sub r1, r0
  759. %1 r0, r1, r6, rtcos, rtsin
  760. %if ARCH_X86_64 == 0
  761. add esp, 12 ; drop the three values pushed above
  762. %endif
  763. %if mmsize == 32
  764. vzeroupper
  765. %endif
  766. RET
  767. %endmacro
  ; Instantiate imdct_half: first under the instruction set selected by the
  ; preceding INIT_XMM sse, then for AVX when HAVE_AVX is set.
  768. DECL_IMDCT POSROTATESHUF
  769. INIT_YMM avx
  770. %if HAVE_AVX
  771. DECL_IMDCT POSROTATESHUF_AVX
  772. %endif