You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

495 lines
12KB

  1. ;******************************************************************************
  2. ;* 32 point SSE-optimized DCT transform
  3. ;* Copyright (c) 2010 Vitor Sessak
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "x86inc.asm"
  22. %include "x86util.asm"
  23. SECTION_RODATA 32 ; read-only constants used by both DCT functions in this file
  24. align 32 ; 32-byte alignment: the AVX function reads this table 32 bytes at a time with aligned accesses
  25. ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 ; +0   multipliers for pass 1
  26. dd 0.553104, 0.582935, 0.622504, 0.674808 ; +16  multipliers for pass 1
  27. dd -10.190008, -3.407609, -2.057781, -1.484165 ; +32  multipliers for pass 1
  28. dd -1.169440, -0.972568, -0.839350, -0.744536 ; +48  multipliers for pass 1
  29. dd 0.502419, 0.522499, 0.566944, 0.646822 ; +64  multipliers for pass 2
  30. dd 0.788155, 1.060678, 1.722447, 5.101149 ; +80  multipliers for pass 2
  31. dd 0.509796, 0.601345, 0.899976, 2.562916 ; +96  multipliers for pass 3
  32. dd 0.509796, 0.601345, 0.899976, 2.562916 ; +112 same four values again, so a 32-byte read at +96 sees them in both halves
  33. dd 1.000000, 1.000000, 1.306563, 0.541196 ; +128 multipliers for pass 4
  34. dd 1.000000, 1.000000, 1.306563, 0.541196 ; +144 duplicate of the row at +128
  35. dd 1.000000, 0.707107, 1.000000, -0.707107 ; +160 multipliers for pass 5 (BUTTERFLY3 variant)
  36. dd 1.000000, 0.707107, 1.000000, -0.707107 ; +176 duplicate of the row at +160
  37. dd 0.707107, 0.707107, 0.707107, 0.707107 ; +192 multiplier used by BUTTERFLY3V
  38. align 32
  39. ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 ; XOR mask: flips the float sign bit of lanes 2 and 3 in each group of four
  40. %macro BUTTERFLY_SSE 4 ; args: a, b, coef, tmp -> b = a + b, a = (a - b) * coef; tmp is clobbered
  41. movaps %4, %1 ; tmp = a (saved because the next instruction overwrites a)
  42. subps %1, %2 ; a = a - b
  43. addps %2, %4 ; b = b + original a
  44. mulps %1, %3 ; scale the difference by coef (register or memory operand)
  45. %endmacro
  46. %macro BUTTERFLY_AVX 4 ; args: a, b, coef, tmp -> b = a + b, a = (a - b) * coef; tmp is clobbered
  47. vsubps %4, %1, %2 ; tmp = a - b (3-operand form, so a is still intact)
  48. vaddps %2, %2, %1 ; b = b + a
  49. vmulps %1, %4, %3 ; a = tmp * coef
  50. %endmacro
  51. %macro BUTTERFLY0_SSE 5 ; args: a, mask, coef, tmp, shuffle imm -> a = (shuffle(a) + (a XOR mask)) * coef; tmp is clobbered
  52. movaps %4, %1 ; tmp = a
  53. shufps %1, %1, %5 ; a = lanes of a reordered according to the immediate
  54. xorps %4, %2 ; tmp = original a with the bits selected by mask flipped
  55. addps %1, %4 ; a = reordered a + masked a
  56. mulps %1, %3 ; scale by coef
  57. %endmacro
  58. %macro BUTTERFLY0_AVX 5 ; args: a, mask, coef, tmp, shuffle imm -> a = (shuffle(a) + (a XOR mask)) * coef; tmp is clobbered
  59. vshufps %4, %1, %1, %5 ; tmp = lanes of a reordered according to the immediate
  60. vxorps %1, %1, %2 ; a = a with the bits selected by mask flipped
  61. vaddps %4, %4, %1 ; tmp = reordered a + masked a
  62. vmulps %1, %4, %3 ; a = tmp * coef
  63. %endmacro
  64. %macro BUTTERFLY2 4 ; args: a, mask, coef, tmp -> BUTTERFLY0 with the lane-reversing shuffle
  65. BUTTERFLY0 %1, %2, %3, %4, 0x1b ; 0x1b reverses the four floats of each 128-bit group (3,2,1,0)
  66. %endmacro
  67. %macro BUTTERFLY3 4 ; args: a, mask, coef, tmp -> BUTTERFLY0 with the pair-swapping shuffle
  68. BUTTERFLY0 %1, %2, %3, %4, 0xb1 ; 0xb1 swaps neighbouring floats within each 128-bit group (1,0,3,2)
  69. %endmacro
  70. %macro BUTTERFLY3V 5 ; args: register NUMBERS r1, r2, r3, r4, tmp -> two sum/scaled-difference pairs: (r1, r2) and (r3, r4)
  71. movaps m%5, m%1 ; tmp = r1
  72. addps m%1, m%2 ; r1 = r1 + r2
  73. subps m%5, m%2 ; tmp = original r1 - r2
  74. SWAP %2, %5 ; NOTE(review): SWAP is defined in the included macro files; presumably exchanges the two register numbers at assembly time so that m%2 now names the difference - confirm
  75. mulps m%2, [ps_cos_vec+192] ; scale the first difference by the 0.707107 row
  76. movaps m%5, m%3 ; tmp = r3
  77. addps m%3, m%4 ; r3 = r3 + r4
  78. subps m%4, m%5 ; r4 = r4 - original r3 (operand order is the opposite of the first pair)
  79. mulps m%4, [ps_cos_vec+192] ; scale the second difference by the 0.707107 row
  80. %endmacro
  81. %macro PASS6_AND_PERMUTE 0 ; scalar final pass: addss sums and 32-bit copies rewrite the floats at outq+0..124 in place; uses m0-m7 and tmpd
  82. mov tmpd, [outq+4] ; keep a raw 32-bit copy of the float at out+4; it is written to out+64 further down
  83. movss m7, [outq+72]
  84. addss m7, [outq+76]
  85. movss m3, [outq+56]
  86. addss m3, [outq+60]
  87. addss m4, m3 ; m4 is read here before any load in this macro, so it must be live on entry (the same holds for m0, m1, m5, m6)
  88. movss m2, [outq+52]
  89. addss m2, m3
  90. movss m3, [outq+104]
  91. addss m3, [outq+108]
  92. addss m1, m3 ; m1 still holds its value from before the macro
  93. addss m5, m4 ; m5 still holds its value from before the macro
  94. movss [outq+ 16], m1
  95. movss m1, [outq+100]
  96. addss m1, m3
  97. movss m3, [outq+40]
  98. movss [outq+ 48], m1
  99. addss m3, [outq+44]
  100. movss m1, [outq+100]
  101. addss m4, m3
  102. addss m3, m2
  103. addss m1, [outq+108]
  104. movss [outq+ 40], m3
  105. addss m2, [outq+36]
  106. movss m3, [outq+8]
  107. movss [outq+ 56], m2
  108. addss m3, [outq+12]
  109. movss [outq+ 32], m3
  110. movss m3, [outq+80]
  111. movss [outq+ 8], m5
  112. movss [outq+ 80], m1
  113. movss m2, [outq+52]
  114. movss m5, [outq+120]
  115. addss m5, [outq+124]
  116. movss m1, [outq+64] ; read out+64 before the saved out+4 value replaces it four instructions later
  117. addss m2, [outq+60]
  118. addss m0, m5 ; m0 still holds its value from before the macro
  119. addss m5, [outq+116]
  120. mov [outq+64], tmpd ; out+64 = the value saved from out+4 at the top
  121. addss m6, m0 ; m6 still holds its value from before the macro
  122. addss m1, m6
  123. mov tmpd, [outq+12] ; plain copy, no arithmetic: out+12 -> out+96
  124. mov [outq+ 96], tmpd
  125. movss [outq+ 4], m1
  126. movss m1, [outq+24]
  127. movss [outq+ 24], m4
  128. movss m4, [outq+88]
  129. addss m4, [outq+92]
  130. addss m3, m4
  131. addss m4, [outq+84]
  132. mov tmpd, [outq+108] ; plain copy: out+108 -> out+112 (stored a few instructions below)
  133. addss m1, [outq+28]
  134. addss m0, m1
  135. addss m1, m5
  136. addss m6, m3
  137. addss m3, m0
  138. addss m0, m7
  139. addss m5, [outq+20]
  140. addss m7, m1
  141. movss [outq+ 12], m6
  142. mov [outq+112], tmpd
  143. movss m6, [outq+28]
  144. movss [outq+ 28], m0
  145. movss m0, [outq+36]
  146. movss [outq+ 36], m7
  147. addss m1, m4
  148. movss m7, [outq+116]
  149. addss m0, m2
  150. addss m7, [outq+124]
  151. movss [outq+ 72], m0
  152. movss m0, [outq+44]
  153. addss m2, m0
  154. movss [outq+ 44], m1
  155. movss [outq+ 88], m2
  156. addss m0, [outq+60]
  157. mov tmpd, [outq+60] ; plain copy: out+60 -> out+120, taken before out+60 is overwritten below
  158. mov [outq+120], tmpd
  159. movss [outq+104], m0
  160. addss m4, m5
  161. addss m5, [outq+68]
  162. movss [outq+52], m4
  163. movss [outq+60], m5
  164. movss m4, [outq+68]
  165. movss m5, [outq+20]
  166. movss [outq+ 20], m3
  167. addss m5, m7
  168. addss m7, m6
  169. addss m4, m5
  170. movss m2, [outq+84]
  171. addss m2, [outq+92]
  172. addss m5, m2
  173. movss [outq+ 68], m4
  174. addss m2, m7
  175. movss m4, [outq+76]
  176. movss [outq+ 84], m2
  177. movss [outq+ 76], m5
  178. addss m7, m4
  179. addss m6, [outq+124]
  180. addss m4, m6
  181. addss m6, [outq+92]
  182. movss [outq+100], m4
  183. movss [outq+108], m6
  184. movss m6, [outq+92] ; last read of the old out+92 value; the next instruction overwrites it
  185. movss [outq+92], m7
  186. addss m6, [outq+124] ; out+124 is only ever read in this macro, never written
  187. movss [outq+116], m6
  188. %endmacro
  189. %define BUTTERFLY BUTTERFLY_AVX ; the function below uses the 3-operand AVX butterfly macros
  190. %define BUTTERFLY0 BUTTERFLY0_AVX
  191. INIT_YMM ; NOTE(review): from the included x86inc macros; presumably makes m0..m7 name 256-bit ymm registers - confirm
  192. SECTION_TEXT
  193. %ifdef HAVE_AVX ; the AVX function is only assembled when HAVE_AVX is defined
  194. ; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
  195. cglobal dct32_float_avx, 2,3,8, out, in, tmp ; declares outq/inq/tmpd used below; NOTE(review): the numeric fields follow x86inc's cglobal convention - confirm their meaning there
  196. ; pass 1
  197. vmovaps m4, [inq+0] ; m4 = in[0..7] (aligned 32-byte load)
  198. vinsertf128 m5, m5, [inq+96], 1 ; upper 128 bits of m5 = in[24..27]
  199. vinsertf128 m5, m5, [inq+112], 0 ; lower 128 bits of m5 = in[28..31]
  200. vshufps m5, m5, m5, 0x1b ; reverse the four floats in each half -> m5 = in[31], in[30], ..., in[24]
  201. BUTTERFLY m4, m5, [ps_cos_vec], m6 ; m5 = m4 + m5, m4 = (m4 - m5) * table rows at +0/+16; m6 is scratch
  202. vmovaps m2, [inq+64] ; m2 = in[16..23]
  203. vinsertf128 m6, m6, [inq+32], 1 ; upper 128 bits of m6 = in[8..11]
  204. vinsertf128 m6, m6, [inq+48], 0 ; lower 128 bits of m6 = in[12..15]
  205. vshufps m6, m6, m6, 0x1b ; m6 = in[15], in[14], ..., in[8]
  206. BUTTERFLY m2, m6, [ps_cos_vec+32], m0 ; m6 = m2 + m6, m2 = (m2 - m6) * table rows at +32/+48
  207. ; pass 2
  208. BUTTERFLY m5, m6, [ps_cos_vec+64], m0 ; combines the two pass-1 sum vectors
  209. BUTTERFLY m4, m2, [ps_cos_vec+64], m7 ; combines the two pass-1 scaled-difference vectors
  210. ; pass 3
  211. vperm2f128 m3, m6, m4, 0x31 ; m3 = upper half of m6 (low) : upper half of m4 (high)
  212. vperm2f128 m1, m6, m4, 0x20 ; m1 = lower half of m6 (low) : lower half of m4 (high)
  213. vshufps m3, m3, m3, 0x1b ; reverse the floats inside each half of m3
  214. BUTTERFLY m1, m3, [ps_cos_vec+96], m6
  215. vperm2f128 m4, m5, m2, 0x20 ; m4 = lower half of m5 : lower half of m2
  216. vperm2f128 m5, m5, m2, 0x31 ; m5 = upper half of m5 : upper half of m2
  217. vshufps m5, m5, m5, 0x1b ; reverse the floats inside each half of m5
  218. BUTTERFLY m4, m5, [ps_cos_vec+96], m6
  219. ; pass 4
  220. vmovaps m6, [ps_p1p1m1m1+0] ; m6 = sign mask for lanes 2 and 3 of each half
  221. vmovaps m2, [ps_cos_vec+128] ; m2 = pass-4 multipliers
  222. BUTTERFLY2 m5, m6, m2, m7 ; in-register butterfly on each vector; m7 is scratch
  223. BUTTERFLY2 m4, m6, m2, m7
  224. BUTTERFLY2 m1, m6, m2, m7
  225. BUTTERFLY2 m3, m6, m2, m7
  226. ; pass 5
  227. vshufps m6, m6, m6, 0xcc ; selects lanes 0,3,0,3 of the mask -> sign bit now set on lanes 1 and 3
  228. vmovaps m2, [ps_cos_vec+160] ; m2 = pass-5 multipliers
  229. BUTTERFLY3 m5, m6, m2, m7
  230. BUTTERFLY3 m4, m6, m2, m7
  231. BUTTERFLY3 m1, m6, m2, m7
  232. BUTTERFLY3 m3, m6, m2, m7
  233. vperm2f128 m6, m3, m3, 0x31 ; m6 = upper half of m3 in both halves; PASS6_AND_PERMUTE reads m6 as a live input
  234. vmovaps [outq], m3 ; 32-byte store to out+0..31
  235. vextractf128 [outq+64], m5, 1 ; upper half of m5 -> out+64
  236. vextractf128 [outq+32], m5, 0 ; lower half of m5 -> out+32
  237. vextractf128 [outq+80], m4, 1 ; upper half of m4 -> out+80
  238. vextractf128 [outq+48], m4, 0 ; lower half of m4 -> out+48
  239. vperm2f128 m0, m1, m1, 0x31 ; m0 = upper half of m1 in both halves; PASS6_AND_PERMUTE reads m0 as a live input
  240. vmovaps [outq+96], m1 ; 32-byte store to out+96..127
  241. vzeroupper ; clear the upper ymm halves before the legacy-SSE scalar code that follows
  242. ; pass 6, no SIMD...
  243. INIT_XMM ; NOTE(review): presumably switches m# back to xmm names so the scalar macro assembles as SSE - confirm in x86inc
  244. PASS6_AND_PERMUTE ; consumes m0, m1, m4, m5, m6 from above plus the floats just stored to out
  245. RET
  246. %endif
  247. %define BUTTERFLY BUTTERFLY_SSE ; everything from here on uses the 2-operand SSE butterfly macros
  248. %define BUTTERFLY0 BUTTERFLY0_SSE
  249. %ifdef ARCH_X86_64 ; x86_64 build: slot numbers 8..15 passed to SPILL/UNSPILL are handled by SWAP instead of memory
  250. %define SPILL SWAP ; NOTE(review): SWAP comes from the included macro files; presumably an assembly-time register renaming - confirm
  251. %define UNSPILL SWAP
  252. %macro PASS5 0 ; x86_64 pass 5: works on m8..m15 via TRANSPOSE4x4PS + BUTTERFLY3V instead of the BUTTERFLY3 shuffle form
  253. nop ; FIXME code alignment
  254. SWAP 5, 8 ; NOTE(review): these SWAPs presumably rename the pass-4 results left in m0, m4-m7 into the m8..m15 numbering - confirm against x86inc
  255. SWAP 4, 12
  256. SWAP 6, 14
  257. SWAP 7, 13
  258. SWAP 0, 15
  259. PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13 ; PERMUTE is defined outside this file; exact pair semantics not visible here - TODO confirm
  260. TRANSPOSE4x4PS 8, 9, 10, 11, 0 ; defined outside this file; by its name a 4x4 float transpose of m8..m11 with register 0 as scratch - confirm
  261. BUTTERFLY3V 8, 9, 10, 11, 0 ; sum / 0.707107-scaled difference on pairs (m8, m9) and (m10, m11)
  262. addps m10, m11
  263. TRANSPOSE4x4PS 12, 13, 14, 15, 0
  264. BUTTERFLY3V 12, 13, 14, 15, 0 ; same on pairs (m12, m13) and (m14, m15)
  265. addps m14, m15
  266. addps m12, m14 ; the three adds below are order-dependent: m12 uses the new m14, then m14 uses the old m13, then m13 is updated
  267. addps m14, m13
  268. addps m13, m15
  269. %endmacro
  270. %macro PASS6 0 ; x86_64 final pass: writes all 32 output floats from m8..m15, using m0..m7 as scratch
  271. SWAP 9, 12
  272. SWAP 11, 14
  273. movss [outq+0x00], m8 ; lane 0 of m8..m14 goes to out+0x00, 0x10, ..., 0x60
  274. pshuflw m0, m8, 0xe ; lane 0 of m0 = lane 1 of m8 (0xe moves words 2,3 into words 0,1)
  275. movss [outq+0x10], m9
  276. pshuflw m1, m9, 0xe
  277. movss [outq+0x20], m10
  278. pshuflw m2, m10, 0xe
  279. movss [outq+0x30], m11
  280. pshuflw m3, m11, 0xe
  281. movss [outq+0x40], m12
  282. pshuflw m4, m12, 0xe
  283. movss [outq+0x50], m13
  284. pshuflw m5, m13, 0xe
  285. movss [outq+0x60], m14
  286. pshuflw m6, m14, 0xe
  287. movaps [outq+0x70], m15 ; full 16-byte store: out+0x74 and out+0x78 are overwritten later, out+0x7c keeps lane 3 of m15
  288. pshuflw m7, m15, 0xe
  289. addss m0, m1 ; sums of the lane-1 values of neighbouring registers go to out+0x08, 0x18, ..., 0x68
  290. addss m1, m2
  291. movss [outq+0x08], m0
  292. addss m2, m3
  293. movss [outq+0x18], m1
  294. addss m3, m4
  295. movss [outq+0x28], m2
  296. addss m4, m5
  297. movss [outq+0x38], m3
  298. addss m5, m6
  299. movss [outq+0x48], m4
  300. addss m6, m7
  301. movss [outq+0x58], m5
  302. movss [outq+0x68], m6
  303. movss [outq+0x78], m7 ; the last lane-1 value is stored without a sum
  304. PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7 ; PERMUTE is defined outside this file; NOTE(review): looks like it interleaves the m8..m14 numbering with m1..m7 - confirm
  305. movhlps m0, m1 ; lane 0 of m0 = lane 2 of m1
  306. pshufd m1, m1, 3 ; lane 0 of m1 = lane 3 of m1
  307. SWAP 0, 2, 4, 6, 8, 10, 12, 14 ; NOTE(review): presumably rotates the even register numbering so the next step addresses the next pair - confirm
  308. SWAP 1, 3, 5, 7, 9, 11, 13, 15
  309. %rep 7 ; same lane extraction for the remaining seven pairs, plus an accumulation into m15
  310. movhlps m0, m1
  311. pshufd m1, m1, 3
  312. addss m15, m1
  313. SWAP 0, 2, 4, 6, 8, 10, 12, 14
  314. SWAP 1, 3, 5, 7, 9, 11, 13, 15
  315. %endrep
  316. %assign i 4 ; byte offset of the first odd-indexed output float
  317. %rep 15 ; stores to out+4, out+12, ..., out+116 (i advances by 8 per iteration)
  318. addss m0, m1
  319. movss [outq+i], m0
  320. SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
  321. %assign i i+8
  322. %endrep
  323. %endmacro
  324. %else ; ARCH_X86_32
  325. %macro SPILL 2 ; xmm#, mempos
  326. movaps [outq+(%2-8)*16], m%1 ; slots 8..15 map to out+0, +16, ..., +112: the output buffer doubles as scratch storage
  327. %endmacro
  328. %macro UNSPILL 2 ; xmm#, mempos: reload a register from the slot written by SPILL
  329. movaps m%1, [outq+(%2-8)*16]
  330. %endmacro
  331. %define PASS6 PASS6_AND_PERMUTE ; the 32-bit build finishes with the same scalar macro as the AVX function
  332. %macro PASS5 0 ; x86_32 pass 5: BUTTERFLY3 on all eight vectors, each result written to its slot in the out buffer
  333. movaps m2, [ps_cos_vec+160] ; m2 = pass-5 multipliers
  334. shufps m3, m3, 0xcc ; m3 holds the pass-4 sign mask on entry; selecting lanes 0,3,0,3 moves the sign bits to lanes 1 and 3
  335. BUTTERFLY3 m5, m3, m2, m1 ; m1 is free to use as scratch here; it is reloaded from slot 9 just below
  336. SPILL 5, 8 ; result -> out+0
  337. UNSPILL 1, 9 ; reload the vector parked in slot 9 (out+16)
  338. BUTTERFLY3 m1, m3, m2, m5
  339. SPILL 1, 14 ; result -> out+96; m1 is not touched again in this macro
  340. BUTTERFLY3 m4, m3, m2, m5
  341. SPILL 4, 12 ; result -> out+64
  342. BUTTERFLY3 m7, m3, m2, m5
  343. SPILL 7, 13 ; result -> out+80
  344. UNSPILL 5, 10 ; reload slot 10 (out+32)
  345. BUTTERFLY3 m5, m3, m2, m7
  346. SPILL 5, 10 ; result -> out+32; m5 keeps this value afterwards
  347. UNSPILL 4, 11 ; reload slot 11 (out+48)
  348. BUTTERFLY3 m4, m3, m2, m7
  349. SPILL 4, 11 ; result -> out+48; m4 keeps this value afterwards
  350. BUTTERFLY3 m6, m3, m2, m7
  351. SPILL 6, 9 ; result -> out+16; m6 keeps this value afterwards
  352. BUTTERFLY3 m0, m3, m2, m7
  353. SPILL 0, 15 ; result -> out+112; m0, m1, m4, m5, m6 stay live for PASS6_AND_PERMUTE
  354. %endmacro
  355. %endif
  356. INIT_XMM ; NOTE(review): from the included x86inc macros; presumably makes m# name 128-bit xmm registers - confirm
  357. ; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
  358. cglobal dct32_float_sse, 2,3,16, out, in, tmp ; declares outq/inq/tmpd; NOTE(review): the numeric fields follow x86inc's cglobal convention - confirm their meaning there
  359. ; pass 1
  360. movaps m0, [inq+0] ; m0 = in[0..3]
  361. movaps m1, [inq+112] ; m1 = in[28..31]
  362. shufps m1, m1, 0x1b ; reverse lane order -> in[31], in[30], in[29], in[28]
  363. BUTTERFLY m0, m1, [ps_cos_vec], m3 ; m1 = m0 + m1, m0 = (m0 - m1) * table row at +0; m3 is scratch
  364. movaps m7, [inq+64] ; m7 = in[16..19]
  365. movaps m4, [inq+48] ; m4 = in[12..15]
  366. shufps m4, m4, 0x1b ; reversed -> in[15]..in[12]
  367. BUTTERFLY m7, m4, [ps_cos_vec+32], m3
  368. ; pass 2
  369. movaps m2, [ps_cos_vec+64] ; m2 = first row of pass-2 multipliers
  370. BUTTERFLY m1, m4, m2, m3 ; combines the two pass-1 sums computed so far
  371. SPILL 1, 11 ; park results in slots 11 and 8 (x86_64: SWAP; x86_32: 16-byte stores to out+48 and out+0)
  372. SPILL 4, 8
  373. ; pass 1
  374. movaps m1, [inq+16] ; m1 = in[4..7]
  375. movaps m6, [inq+96] ; m6 = in[24..27]
  376. shufps m6, m6, 0x1b ; reversed -> in[27]..in[24]
  377. BUTTERFLY m1, m6, [ps_cos_vec+16], m3
  378. movaps m4, [inq+80] ; m4 = in[20..23]
  379. movaps m5, [inq+32] ; m5 = in[8..11]
  380. shufps m5, m5, 0x1b ; reversed -> in[11]..in[8]
  381. BUTTERFLY m4, m5, [ps_cos_vec+48], m3
  382. ; pass 2
  383. BUTTERFLY m0, m7, m2, m3 ; still uses the +64 row loaded into m2 above
  384. movaps m2, [ps_cos_vec+80] ; m2 = second row of pass-2 multipliers
  385. BUTTERFLY m6, m5, m2, m3
  386. BUTTERFLY m1, m4, m2, m3
  387. ; pass 3
  388. movaps m2, [ps_cos_vec+96] ; m2 = pass-3 multipliers
  389. shufps m1, m1, 0x1b ; each pass-3 butterfly pairs one vector with the lane-reversed form of another
  390. BUTTERFLY m0, m1, m2, m3
  391. SPILL 0, 15
  392. SPILL 1, 14
  393. UNSPILL 0, 8 ; bring back the vector parked in slot 8 during pass 2
  394. shufps m5, m5, 0x1b
  395. BUTTERFLY m0, m5, m2, m3
  396. UNSPILL 1, 11 ; bring back the vector parked in slot 11 during pass 2
  397. shufps m6, m6, 0x1b
  398. BUTTERFLY m1, m6, m2, m3
  399. SPILL 1, 11
  400. shufps m4, m4, 0x1b
  401. BUTTERFLY m7, m4, m2, m3
  402. ; pass 4
  403. movaps m3, [ps_p1p1m1m1+0] ; m3 = sign mask for lanes 2 and 3; m3 is no longer scratch from here on
  404. movaps m2, [ps_cos_vec+128] ; m2 = pass-4 multipliers
  405. BUTTERFLY2 m5, m3, m2, m1 ; m1 is the scratch register for all pass-4 butterflies
  406. BUTTERFLY2 m0, m3, m2, m1
  407. SPILL 0, 9
  408. BUTTERFLY2 m6, m3, m2, m1
  409. SPILL 6, 10
  410. UNSPILL 0, 11
  411. BUTTERFLY2 m0, m3, m2, m1
  412. SPILL 0, 11
  413. BUTTERFLY2 m4, m3, m2, m1
  414. BUTTERFLY2 m7, m3, m2, m1
  415. UNSPILL 6, 14
  416. BUTTERFLY2 m6, m3, m2, m1
  417. UNSPILL 0, 15
  418. BUTTERFLY2 m0, m3, m2, m1
  419. PASS5 ; architecture-specific macro (x86_64: transpose form on m8..m15; x86_32: BUTTERFLY3 with spills to out)
  420. PASS6 ; x86_64: the PASS6 macro above; x86_32: alias of PASS6_AND_PERMUTE
  421. RET