;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
; in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
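; e.g. with SSE (4 floats per register), eight consecutive complex values
; z[0..7] are held in memory as
;   {z0.re,z1.re,z2.re,z3.re}, {z0.im,z1.im,z2.im,z3.im},
;   {z4.re,z5.re,z6.re,z7.re}, {z4.im,z5.im,z6.im,z7.im}
; and the _interleave variants convert back to FFTComplex order at the end.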
%include "libavutil/x86/x86inc.asm"
%include "libavutil/x86/x86util.asm"

%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

SECTION_RODATA

struc FFTContext
    .nbits:    resd 1
    .reverse:  resd 1
    .revtab:   pointer 1
    .tmpbuf:   pointer 1
    .mdctsize: resd 1
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
    .fftperm:  pointer 1
    .fftcalc:  pointer 1
    .imdctcalc:pointer 1
    .imdcthalf:pointer 1
endstruc
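; Note: this struc is assumed to mirror the layout of FFTContext in
; libavcodec/fft.h; if the C struct changes, these offsets must be
; updated by hand.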

%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509

align 32
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8

ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0

perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0

%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep

%if ARCH_X86_64
%define pointer dq
%else
%define pointer dd
%endif

%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro

SECTION_TEXT

%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
    mova     %1, %3
    mova     %2, %1
    pfadd    %1, %4
    pfsub    %2, %4
%endmacro

%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
    mova     %5, %3
    pfsub    %3, %4
    pfadd    %5, %4 ; {t6,t5}
    pxor     %3, [ps_m1p1] ; {t8,t7}
    mova     %6, %1
    PSWAPD   %3, %3
    pfadd    %1, %5 ; {r0,i0}
    pfsub    %6, %5 ; {r2,i2}
    mova     %4, %2
    pfadd    %2, %3 ; {r1,i1}
    pfsub    %4, %3 ; {r3,i3}
    SWAP     %3, %6
%endmacro
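
; For reference, T2_3DNOW/T4_3DNOW together form a 4-point DIT transform on
; inputs x0..x3 that have already been bit-reversal permuted:
;   X0 = (x0+x1) + (x2+x3)
;   X2 = (x0+x1) - (x2+x3)
;   X1 = (x0-x1) - i*(x2-x3)
;   X3 = (x0-x1) + i*(x2-x3)
; The pxor/PSWAPD pair above is the multiply by -i: flip the sign of the
; real part, then swap real and imaginary.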

; in:  %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
;      %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
;      %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
;      %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
    vsubps     %5, %1, %2       ; v  = %1 - %2
    vaddps     %3, %1, %2       ; w  = %1 + %2
    vmulps     %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
    vpermilps  %2, %2, [perm1]
    vblendps   %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
    vshufps    %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
    vsubps     %4, %5, %1       ; s = r - q
    vaddps     %1, %5, %1       ; u = r + q
    vpermilps  %1, %1, [perm2]  ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
    vshufps    %5, %4, %1, 0xbb
    vshufps    %3, %4, %1, 0xee
    vperm2f128 %3, %3, %5, 0x13
    vxorps     %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
    vshufps    %2, %1, %4, 0xdd
    vshufps    %1, %1, %4, 0x88
    vperm2f128 %4, %2, %1, 0x02 ; v  = {k1,k3,s1,s3,k2,k4,s2,s4}
    vperm2f128 %1, %1, %2, 0x13 ; w  = {k6,k8,s6,s8,k5,k7,s5,s7}
    vsubps     %5, %1, %3
    vblendps   %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
    vsubps     %2, %4, %1       ; %2 = v - w
    vaddps     %1, %4, %1       ; %1 = v + w
%endmacro

; In SSE mode do one fft4 transform
; in:  %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in:  %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
    subps    %3, %1, %2       ; {t3,t4,-t8,t7}
    addps    %1, %1, %2       ; {t1,t2,t6,t5}
    xorps    %3, %3, [ps_p1p1m1p1]
    shufps   %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
    shufps   %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
    subps    %3, %1, %2       ; {r2,i2,r3,i3}
    addps    %1, %1, %2       ; {r0,i0,r1,i1}
    shufps   %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
    shufps   %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro
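
; shufps immediate decoding, for the masks used above: each 2-bit field of
; the immediate selects one lane, the two low result lanes coming from the
; first source operand and the two high lanes from the second, e.g.
;   0x88 = {0,2,0,2} gathers the even lanes of both inputs (the real parts)
;   0xdd = {1,3,1,3} gathers the odd lanes of both inputs (the imaginaries)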

; In SSE mode do one FFT8
; in:  %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,r5,r6,r7} %4={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in:  %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
;      %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
;      %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
    addps    %6, %3, %4       ; {t1,t2,t3,t4}
    subps    %3, %3, %4       ; {r5,i5,r7,i7}
    shufps   %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
    mulps    %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
    mulps    %4, %4, [ps_root2]
    addps    %3, %3, %4       ; {t8,t7,ta,t9}
    shufps   %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
    shufps   %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
    subps    %3, %6, %4       ; {t6,t5,tc,tb}
    addps    %6, %6, %4       ; {t1,t2,t9,ta}
    shufps   %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
    shufps   %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
    subps    %3, %1, %6       ; {r4,r5,r6,r7}
    addps    %1, %1, %6       ; {r0,r1,r2,r3}
    subps    %4, %2, %5       ; {i4,i5,i6,i7}
    addps    %2, %2, %5       ; {i0,i1,i2,i3}
%endmacro

; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova    m4, Z(4)
IF%1 mova    m5, Z(5)
    mova     m0, %2 ; wre
    mova     m1, %3 ; wim
    mulps    m2, m4, m0 ; r2*wre
IF%1 mova    m6, Z2(6)
    mulps    m3, m5, m1 ; i2*wim
IF%1 mova    m7, Z2(7)
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m1, m1, m6 ; r3*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova     Z(4), m3
    mova     Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
    mova     Z2(6), m4
    mova     Z(2), m3
    mova     m2, Z(3)
    addps    m3, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
    mova     Z2(7), m2
    mova     Z(3), m1
    subps    m4, m7, m3 ; i2
    addps    m3, m3, m7 ; i0
    mova     Z(5), m4
    mova     Z(1), m3
%endmacro
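
; The wre/wim products above are a complex multiply by (wre - i*wim), the
; conjugate of the tabulated twiddle factor:
;   (r2 + i*i2) * (wre - i*wim) = (r2*wre + i2*wim) + i*(i2*wre - r2*wim)
; which is exactly the pair of sums formed in m2 and m5.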

; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
    mova     m4, Z(4) ; r2
    mova     m5, Z(5) ; i2
    mova     m0, [wq] ; wre
    mova     m1, [wq+o1q] ; wim
    mulps    m2, m4, m0 ; r2*wre
    mova     m6, Z2(6) ; r3
    mulps    m3, m5, m1 ; i2*wim
    mova     m7, Z2(7) ; i3
    mulps    m4, m4, m1 ; r2*wim
    mulps    m5, m5, m0 ; i2*wre
    addps    m2, m2, m3 ; r2*wre + i2*wim
    mulps    m3, m1, m7 ; i3*wim
    mulps    m1, m1, m6 ; r3*wim
    subps    m5, m5, m4 ; i2*wre - r2*wim
    mulps    m4, m0, m6 ; r3*wre
    mulps    m0, m0, m7 ; i3*wre
    subps    m4, m4, m3 ; r3*wre - i3*wim
    mova     m3, Z(0)
    addps    m0, m0, m1 ; i3*wre + r3*wim
    subps    m1, m4, m2 ; t3
    addps    m4, m4, m2 ; t5
    subps    m3, m3, m4 ; r2
    addps    m4, m4, Z(0) ; r0
    mova     m6, Z(2)
    mova     Z(4), m3
    mova     Z(0), m4
    subps    m3, m5, m0 ; t4
    subps    m4, m6, m3 ; r3
    addps    m3, m3, m6 ; r1
IF%1 mova    Z2(6), m4
IF%1 mova    Z(2), m3
    mova     m2, Z(3)
    addps    m5, m5, m0 ; t6
    subps    m2, m2, m1 ; i3
    mova     m7, Z(1)
    addps    m1, m1, Z(3) ; i1
IF%1 mova    Z2(7), m2
IF%1 mova    Z(3), m1
    subps    m6, m7, m5 ; i2
    addps    m5, m5, m7 ; i0
IF%1 mova    Z(5), m6
IF%1 mova    Z(1), m5
%if %1==0
    INTERL   m1, m3, m7, Z, 2
    INTERL   m2, m4, m0, Z2, 6
    mova     m1, Z(0)
    mova     m2, Z(4)
    INTERL   m5, m1, m3, Z, 0
    INTERL   m6, m2, m7, Z, 4
%endif
%endmacro

%macro PUNPCK 3
    mova      %3, %1
    punpckldq %1, %2
    punpckhdq %3, %2
%endmacro

%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]
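
; In this section Z(x)/Z2(x) address consecutive vectors of one contiguous
; block at r0, which is all the small fully-unrolled transforms need.  They
; are redefined further down with o1q/o3q strides for the big passes, where
; widely separated parts of the buffer are combined in each loop iteration.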

INIT_YMM avx

%if HAVE_AVX
align 16
fft8_avx:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T8_AVX   m0, m1, m2, m3, m4
    mova     Z(0), m0
    mova     Z(1), m1
    ret

align 16
fft16_avx:
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_SSE   m2, m3, m7

    mova     m0, Z(0)
    mova     m1, Z(1)
    T8_AVX   m0, m1, m4, m5, m7

    mova     m4, [ps_cos16_1]
    mova     m5, [ps_cos16_2]
    vmulps   m6, m2, m4
    vmulps   m7, m3, m5
    vaddps   m7, m7, m6
    vmulps   m2, m2, m5
    vmulps   m3, m3, m4
    vsubps   m3, m3, m2
    vblendps m2, m7, m3, 0xf0
    vperm2f128 m3, m7, m3, 0x21

    vaddps   m4, m2, m3
    vsubps   m2, m3, m2
    vperm2f128 m2, m2, m2, 0x01
    vsubps   m3, m1, m2
    vaddps   m1, m1, m2
    vsubps   m5, m0, m4
    vaddps   m0, m0, m4
    vextractf128  Z(0), m0, 0
    vextractf128 ZH(0), m1, 0
    vextractf128  Z(1), m0, 1
    vextractf128 ZH(1), m1, 1
    vextractf128  Z(2), m5, 0
    vextractf128 ZH(2), m3, 0
    vextractf128  Z(3), m5, 1
    vextractf128 ZH(3), m3, 1
    ret

align 16
fft32_avx:
    call fft16_avx

    mova     m0, Z(4)
    mova     m1, Z(5)
    T4_SSE   m0, m1, m4

    mova     m2, Z(6)
    mova     m3, Z(7)
    T8_SSE   m0, m1, m2, m3, m4, m6
    ; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
    ; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}

    vperm2f128 m4, m0, m2, 0x20
    vperm2f128 m5, m1, m3, 0x20
    vperm2f128 m6, m0, m2, 0x31
    vperm2f128 m7, m1, m3, 0x31

    PASS_SMALL 0, [cos_32], [cos_32+32]
    ret

fft32_interleave_avx:
    call fft32_avx
    mov  r2d, 32
.deint_loop:
    mova      m2, Z(0)
    mova      m3, Z(1)
    vunpcklps m0, m2, m3
    vunpckhps m1, m2, m3
    vextractf128  Z(0), m0, 0
    vextractf128 ZH(0), m1, 0
    vextractf128  Z(1), m0, 1
    vextractf128 ZH(1), m1, 1
    add  r0, mmsize*2
    sub  r2d, mmsize/4
    jg   .deint_loop
    ret
%endif

INIT_XMM sse

align 16
fft4_avx:
fft4_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     Z(0), m0
    mova     Z(1), m1
    ret

align 16
fft8_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     Z(0), m0
    mova     Z(1), m1
    mova     Z(2), m2
    mova     Z(3), m3
    ret

align 16
fft16_sse:
    mova     m0, Z(0)
    mova     m1, Z(1)
    T4_SSE   m0, m1, m2
    mova     m2, Z(2)
    mova     m3, Z(3)
    T8_SSE   m0, m1, m2, m3, m4, m5
    mova     m4, Z(4)
    mova     m5, Z(5)
    mova     Z(0), m0
    mova     Z(1), m1
    mova     Z(2), m2
    mova     Z(3), m3
    T4_SSE   m4, m5, m6
    mova     m6, Z2(6)
    mova     m7, Z2(7)
    T4_SSE   m6, m7, m0
    PASS_SMALL 0, [cos_16], [cos_16+16]
    ret

%macro FFT48_3DNOW 0
align 16
fft4 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    PUNPCK   m0, m1, m4
    PUNPCK   m2, m3, m5
    mova     Z(0), m0
    mova     Z(1), m4
    mova     Z(2), m2
    mova     Z(3), m5
    ret

align 16
fft8 %+ SUFFIX:
    T2_3DNOW m0, m1, Z(0), Z(1)
    mova     m2, Z(2)
    mova     m3, Z(3)
    T4_3DNOW m0, m1, m2, m3, m4, m5
    mova     Z(0), m0
    mova     Z(2), m2
    T2_3DNOW m4, m5, Z(4), Z(5)
    T2_3DNOW m6, m7, Z2(6), Z2(7)
    PSWAPD   m0, m5
    PSWAPD   m2, m7
    pxor     m0, [ps_m1p1]
    pxor     m2, [ps_m1p1]
    pfsub    m5, m0
    pfadd    m7, m2
    pfmul    m5, [ps_root2]
    pfmul    m7, [ps_root2]
    T4_3DNOW m1, m3, m5, m7, m0, m2
    mova     Z(5), m5
    mova     Z2(7), m7
    mova     m0, Z(0)
    mova     m2, Z(2)
    T4_3DNOW m0, m2, m4, m6, m5, m7
    PUNPCK   m0, m1, m5
    PUNPCK   m2, m3, m7
    mova     Z(0), m0
    mova     Z(1), m5
    mova     Z(2), m2
    mova     Z(3), m7
    PUNPCK   m4, Z(5), m5
    PUNPCK   m6, Z2(7), m7
    mova     Z(4), m4
    mova     Z(5), m5
    mova     Z2(6), m6
    mova     Z2(7), m7
    ret
%endmacro

%if ARCH_X86_32
%macro PSWAPD 2
%if cpuflag(3dnowext)
    pswapd    %1, %2
%elifidn %1, %2
    movd      [r0+12], %1
    punpckhdq %1, [r0+8]
%else
    movq      %1, %2
    psrlq     %1, 32
    punpckldq %1, %2
%endif
%endmacro

INIT_MMX 3dnowext
FFT48_3DNOW
INIT_MMX 3dnow
FFT48_3DNOW
%endif

%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]

%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
    lea   o3q, [nq*3]
    lea   o1q, [nq*8]
    shl   o3q, 4
.loop:
    %2
    add   zcq, mmsize*2
    add   wq,  mmsize
    sub   nd,  mmsize/8
    jg    .loop
    rep ret
%endmacro

%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea   r2, [dispatch_tab%1]
    mov   r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
    lea   r3, [$$]
    add   r2, r3
%endif
    call  r2
%endmacro ; FFT_DISPATCH
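
; dispatch_tab holds one entry per transform size starting at nbits == 2
; (the 4-point FFT), hence the (%2q-2) index.  In PIC builds the table
; stores section-relative addresses (SECTION_REL below is defined as "- $$"),
; so the loaded entry is rebased against $$ before the call.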

INIT_YMM avx

%if HAVE_AVX
%macro INTERL_AVX 5
    vunpckhps    %3, %2, %1
    vunpcklps    %2, %2, %1
    vextractf128 %4(%5), %2, 0
    vextractf128 %4 %+ H(%5), %3, 0
    vextractf128 %4(%5 + 1), %2, 1
    vextractf128 %4 %+ H(%5 + 1), %3, 1
%endmacro

%define INTERL INTERL_AVX

DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0

cglobal fft_calc, 2,5,8
    mov   r3d, [r0 + FFTContext.nbits]
    mov   r0, r1
    mov   r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET
%endif

INIT_XMM sse

%macro INTERL_SSE 5
    mova     %3, %2
    unpcklps %2, %1
    unpckhps %3, %1
    mova     %4(%5), %2
    mova     %4(%5+1), %3
%endmacro

%define INTERL INTERL_SSE

DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0

%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
    mov   r3d, [r0 + FFTContext.nbits]
    PUSH  r1
    PUSH  r3
    mov   r0, r1
    mov   r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP   rcx
    POP   r4
    cmp   rcx, 3+(mmsize/16)
    jg    .end
    mov   r2, -1
    add   rcx, 3
    shl   r2, cl
    sub   r4, r2
.loop:
%if mmsize == 8
    PSWAPD   m0, [r4 + r2 + 4]
    mova     [r4 + r2 + 4], m0
%else
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2], xmm0
    movaps   [r4 + r2 + 16], xmm1
%endif
    add      r2, mmsize*2
    jl       .loop
.end:
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
FFT_CALC_FUNC
INIT_MMX 3dnowext
FFT_CALC_FUNC
%endif

INIT_XMM sse
FFT_CALC_FUNC
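
; fft_permute applies the bit-reversal permutation from s->revtab.  It
; scatters into s->tmpbuf rather than permuting in place (an in-place
; scatter could clobber elements not yet read), then copies the result
; back over the input in one linear pass.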
cglobal fft_permute, 2,7,1
    mov    r4,  [r0 + FFTContext.revtab]
    mov    r5,  [r0 + FFTContext.tmpbuf]
    mov    ecx, [r0 + FFTContext.nbits]
    mov    r2, 1
    shl    r2, cl
    xor    r0, r0
%if ARCH_X86_32
    mov    r1, r1m
%endif
.loop:
    movaps xmm0, [r1 + 8*r0]
    movzx  r6, word [r4 + 2*r0]
    movzx  r3, word [r4 + 2*r0 + 2]
    movlps [r5 + 8*r6], xmm0
    movhps [r5 + 8*r3], xmm0
    add    r0, 2
    cmp    r0, r2
    jl     .loop
    shl    r2, 3
    add    r1, r2
    add    r5, r2
    neg    r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
    movaps xmm0, [r5 + r2]
    movaps xmm1, [r5 + r2 + 16]
    movaps [r1 + r2], xmm0
    movaps [r1 + r2 + 16], xmm1
    add    r2, 32
    jl     .loopcopy
    REP_RET
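
; imdct_calc derives the full-length inverse MDCT from imdct_half: the half
; transform fills the middle of the output buffer, and the loop below
; reconstructs the outer quarters from it by reversing the element order
; (PSWAPD / shufps 0x1b) and flipping signs with ps_m1m1m1m1, per the
; IMDCT's symmetries.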
%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
    mov    r3d, [r0 + FFTContext.mdctsize]
    mov    r4,  [r0 + FFTContext.imdcthalf]
    add    r1, r3
    PUSH   r3
    PUSH   r1
%if ARCH_X86_32
    push   r2
    push   r1
    push   r0
%else
    sub    rsp, 8
%endif
    call   r4
%if ARCH_X86_32
    add    esp, 12
%else
    add    rsp, 8
%endif
    POP    r1
    POP    r3
    lea    r0, [r1 + 2*r3]
    mov    r2, r3
    sub    r3, mmsize
    neg    r2
    mova   m2, [ps_m1m1m1m1]
.loop:
%if mmsize == 8
    PSWAPD m0, [r1 + r3]
    PSWAPD m1, [r0 + r2]
    pxor   m0, m2
%else
    mova   m0, [r1 + r3]
    mova   m1, [r0 + r2]
    shufps m0, m0, 0x1b
    shufps m1, m1, 0x1b
    xorps  m0, m2
%endif
    mova   [r0 + r3], m1
    mova   [r1 + r2], m0
    sub    r3, mmsize
    add    r2, mmsize
    jl     .loop
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX 3dnow
IMDCT_CALC_FUNC
INIT_MMX 3dnowext
IMDCT_CALC_FUNC
%endif

INIT_XMM sse
IMDCT_CALC_FUNC

%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnowext pass_3dnow
%define pass_interleave_3dnowext pass_interleave_3dnow
%endif

%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif
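
; DECL_FFT generates one fft%n entry point per size plus the dispatch table
; consumed by FFT_DISPATCH.  Each generated fft%n is one split-radix step:
; a half-size transform on the first half of the coefficients and two
; quarter-size transforms on the second half, followed by a recombination
; pass driven by the cos_%n twiddle table (r1) over n4/2 iterations (r2d).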
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif

%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL

align 16
fft %+ n %+ fullsuffix:
    call fft %+ n2 %+ SUFFIX
    add  r0, n*4 - (n&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    add  r0, n*2 - (n2&(-2<<%1))
    call fft %+ n4 %+ SUFFIX
    sub  r0, n*6 + (n2&(-2<<%1))
    lea  r1, [cos_ %+ n]
    mov  r2d, n4/2
    jmp  pass %+ fullsuffix

%assign n n*2
%endrep
%undef n

align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft

section .text

; On x86_32, this function does the register saving and restoring for all of fft.
; The others pass args in registers and don't spill anything.
cglobal fft_dispatch%2, 2,5,8, zc, nbits
    FFT_DISPATCH fullsuffix, nbits
    RET
%endmacro ; DECL_FFT

%if HAVE_AVX
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_FFT 4
DECL_FFT 4, _interleave
INIT_MMX 3dnowext
DECL_FFT 4
DECL_FFT 4, _interleave
%endif

INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps

%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
    PSWAPD     m0, [%3+%2*4]
    movq       m2, [%3+%1*4-8]
    movq       m3, m0
    punpckldq  m0, m2
    punpckhdq  m2, m3
    movd       m1, [%4+%1*2-4] ; tcos[j]
    movd       m3, [%4+%2*2]   ; tcos[n4-j-1]
    punpckldq  m1, [%5+%1*2-4] ; tsin[j]
    punpckldq  m3, [%5+%2*2]   ; tsin[n4-j-1]

    mova       m4, m0
    PSWAPD     m5, m1
    pfmul      m0, m1
    pfmul      m4, m5
    mova       m6, m2
    PSWAPD     m5, m3
    pfmul      m2, m3
    pfmul      m6, m5
%if cpuflag(3dnowext)
    pfpnacc    m0, m4
    pfpnacc    m2, m6
%else
    SBUTTERFLY dq, 0, 4, 1
    SBUTTERFLY dq, 2, 6, 3
    pxor       m4, m7
    pxor       m6, m7
    pfadd      m0, m4
    pfadd      m2, m6
%endif
%else
    movaps     xmm0, [%3+%2*4]
    movaps     xmm1, [%3+%1*4-0x10]
    movaps     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm1, xmm2, 0x77
    movlps     xmm4, [%4+%2*2]
    movlps     xmm5, [%5+%2*2+0x0]
    movhps     xmm4, [%4+%1*2-0x8]
    movhps     xmm5, [%5+%1*2-0x8]
    movaps     xmm2, xmm0
    movaps     xmm3, xmm1
    mulps      xmm0, xmm5
    mulps      xmm1, xmm4
    mulps      xmm2, xmm4
    mulps      xmm3, xmm5
    subps      xmm1, xmm0
    addps      xmm2, xmm3
    movaps     xmm0, xmm1
    unpcklps   xmm1, xmm2
    unpckhps   xmm0, xmm2
%endif
%endmacro

%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
    mulps      m6, %3, [%5+%1]
    mulps      m7, %2, [%5+%1]
    mulps      %2, %2, [%6+%1]
    mulps      %3, %3, [%6+%1]
    subps      %2, %2, m6
    addps      %3, %3, m7
%endmacro
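
; With %2 as the real and %3 as the imaginary part, CMUL computes the
; complex product (%2 + i*%3) * (s + i*c), where c = [%5+%1] (tcos) and
; s = [%6+%1] (tsin):
;   re' = %2*s - %3*c   (left in %2)
;   im' = %2*c + %3*s   (left in %3)
; m6 and m7 are clobbered as scratch; %4 is unused here.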

%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    vmovaps      ymm1, [%3+%1*2]
    vmovaps      ymm0, [%3+%1*2+0x20]
    vmovaps      ymm3, [%3+%2*2]
    vmovaps      ymm2, [%3+%2*2+0x20]
    CMUL         %1, ymm0, ymm1, %3, %4, %5
    CMUL         %2, ymm2, ymm3, %3, %4, %5
    vshufps      ymm1, ymm1, ymm1, 0x1b
    vshufps      ymm3, ymm3, ymm3, 0x1b
    vperm2f128   ymm1, ymm1, ymm1, 0x01
    vperm2f128   ymm3, ymm3, ymm3, 0x01
    vunpcklps    ymm6, ymm2, ymm1
    vunpckhps    ymm4, ymm2, ymm1
    vunpcklps    ymm7, ymm0, ymm3
    vunpckhps    ymm5, ymm0, ymm3
    vextractf128 [%3+%1*2],      ymm7, 0
    vextractf128 [%3+%1*2+0x10], ymm5, 0
    vextractf128 [%3+%1*2+0x20], ymm7, 1
    vextractf128 [%3+%1*2+0x30], ymm5, 1
    vextractf128 [%3+%2*2],      ymm6, 0
    vextractf128 [%3+%2*2+0x10], ymm4, 0
    vextractf128 [%3+%2*2+0x20], ymm6, 1
    vextractf128 [%3+%2*2+0x30], ymm4, 1
    sub          %2, 0x20
    add          %1, 0x20
    jl           .post
%endmacro

%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    movaps   xmm1, [%3+%1*2]
    movaps   xmm0, [%3+%1*2+0x10]
    CMUL     %1, xmm0, xmm1, %3, %4, %5
    movaps   xmm5, [%3+%2*2]
    movaps   xmm4, [%3+%2*2+0x10]
    CMUL     %2, xmm4, xmm5, %3, %4, %5
    shufps   xmm1, xmm1, 0x1b
    shufps   xmm5, xmm5, 0x1b
    movaps   xmm6, xmm4
    unpckhps xmm4, xmm1
    unpcklps xmm6, xmm1
    movaps   xmm2, xmm0
    unpcklps xmm0, xmm5
    unpckhps xmm2, xmm5
    movaps   [%3+%2*2],      xmm6
    movaps   [%3+%2*2+0x10], xmm4
    movaps   [%3+%1*2],      xmm0
    movaps   [%3+%1*2+0x10], xmm2
    sub      %2, 0x10
    add      %1, 0x10
    jl       .post
%endmacro

%macro CMUL_3DNOW 6
    mova     m6, [%1+%2*2]
    mova     %3, [%1+%2*2+8]
    mova     %4, m6
    mova     m7, %3
    pfmul    m6, [%5+%2]
    pfmul    %3, [%6+%2]
    pfmul    %4, [%6+%2]
    pfmul    m7, [%5+%2]
    pfsub    %3, m6
    pfadd    %4, m7
%endmacro

%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL_3DNOW %3, %1, m0, m1, %4, %5
    CMUL_3DNOW %3, %2, m2, m3, %4, %5
    movd  [%3+%1*2+ 0], m0
    movd  [%3+%2*2+12], m1
    movd  [%3+%2*2+ 0], m2
    movd  [%3+%1*2+12], m3
    psrlq m0, 32
    psrlq m1, 32
    psrlq m2, 32
    psrlq m3, 32
    movd  [%3+%1*2+ 8], m0
    movd  [%3+%2*2+ 4], m1
    movd  [%3+%2*2+ 8], m2
    movd  [%3+%1*2+ 4], m3
    sub   %2, 8
    add   %1, 8
    jl    .post
%endmacro
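
; imdct_half proceeds in three stages: the .pre loop pre-twiddles pairs of
; input samples with tcos/tsin (PREROTATER) and scatters them into
; bit-reversed positions via s->revtab, FFT_DISPATCH then runs the in-place
; FFT, and finally the POSROTATESHUF variant passed to DECL_IMDCT
; post-twiddles the result and mirrors it into the output order.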
%macro DECL_IMDCT 1
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos   r8
%define rtsin   r9
%else
%define rrevtab r6
%define rtsin   r6
%define rtcos   r5
%endif
    mov    r3d, [r0+FFTContext.mdctsize]
    add    r2, r3
    shr    r3, 1
    mov    rtcos, [r0+FFTContext.tcos]
    mov    rtsin, [r0+FFTContext.tsin]
    add    rtcos, r3
    add    rtsin, r3
%if ARCH_X86_64 == 0
    push   rtcos
    push   rtsin
%endif
    shr    r3, 1
    mov    rrevtab, [r0+FFTContext.revtab]
    add    rrevtab, r3
%if ARCH_X86_64 == 0
    push   rrevtab
%endif

%if mmsize == 8
    sub    r3, 2
%else
    sub    r3, 4
%endif
%if ARCH_X86_64 || mmsize == 8
    xor    r4, r4
    sub    r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
    movd   m7, [ps_m1m1m1m1]
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
%if mmsize != 8
    xor    r4, r4
    sub    r4, r3
%endif
    mov    rtcos, [esp+8]
    mov    rtsin, [esp+4]
%endif

    PREROTATER r4, r3, r2, rtcos, rtsin
%if mmsize == 8
    mov    r6, [esp]                ; rrevtab = ptr+n8
    movzx  r5, word [rrevtab+r4-2]  ; rrevtab[j]
    movzx  r6, word [rrevtab+r3]    ; rrevtab[n4-j-1]
    mova   [r1+r5*8], m0
    mova   [r1+r6*8], m2
    add    r4, 2
    sub    r3, 2
%else
%if ARCH_X86_64
    movzx  r5,  word [rrevtab+r4-4]
    movzx  r6,  word [rrevtab+r4-2]
    movzx  r10, word [rrevtab+r3]
    movzx  r11, word [rrevtab+r3+2]
    movlps [r1+r5 *8], xmm0
    movhps [r1+r6 *8], xmm0
    movlps [r1+r10*8], xmm1
    movhps [r1+r11*8], xmm1
    add    r4, 4
%else
    mov    r6, [esp]
    movzx  r5, word [r6+r4-4]
    movzx  r4, word [r6+r4-2]
    movlps [r1+r5*8], xmm0
    movhps [r1+r4*8], xmm0
    movzx  r5, word [r6+r3]
    movzx  r4, word [r6+r3+2]
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub    r3, 4
%endif
    jns    .pre

    mov    r5, r0
    mov    r6, r1
    mov    r0, r1
    mov    r1d, [r5+FFTContext.nbits]

    FFT_DISPATCH SUFFIX, r1

    mov    r0d, [r5+FFTContext.mdctsize]
    add    r6, r0
    shr    r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
    mov    rtcos, [esp+8]
    mov    rtsin, [esp+4]
%endif
    neg    r0
    mov    r1, -mmsize
    sub    r1, r0
    %1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add    esp, 12
%endif
%if mmsize == 8
    femms
%endif
    RET
%endmacro

DECL_IMDCT POSROTATESHUF

%if ARCH_X86_32
INIT_MMX 3dnow
DECL_IMDCT POSROTATESHUF_3DNOW

INIT_MMX 3dnowext
DECL_IMDCT POSROTATESHUF_3DNOW
%endif

INIT_YMM avx

%if HAVE_AVX
DECL_IMDCT POSROTATESHUF_AVX
%endif