You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

362 lines
8.4KB

  ;******************************************************************************
  ;* 36 point SSE-optimized IMDCT transform
  ;* Copyright (c) 2011 Vitor Sessak
  ;*
  ;* This file is part of Libav.
  ;*
  ;* Libav is free software; you can redistribute it and/or
  ;* modify it under the terms of the GNU Lesser General Public
  ;* License as published by the Free Software Foundation; either
  ;* version 2.1 of the License, or (at your option) any later version.
  ;*
  ;* Libav is distributed in the hope that it will be useful,
  ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  ;* Lesser General Public License for more details.
  ;*
  ;* You should have received a copy of the GNU Lesser General Public
  ;* License along with Libav; if not, write to the Free Software
  ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  ;******************************************************************************
  %include "libavutil/x86/x86inc.asm"
  %include "libavutil/x86/x86util.asm"

  SECTION_RODATA

  align 16
  ; Lane masks for andps. The first dword is lane 0; ~0 keeps a lane, 0 clears it.
  ps_mask:        dd 0, ~0, ~0, ~0            ; keep lanes 1-3
  ps_mask2:       dd 0, ~0,  0, ~0            ; keep lanes 1 and 3
  ps_mask3:       dd 0,  0,  0, ~0            ; keep lane 3
  ps_mask4:       dd 0, ~0,  0,  0            ; keep lane 1

  ; Per-lane multipliers for the "Populate tmp[]" stage of DEFINE_IMDCT.
  ; The magnitudes are numerically 0.5, 1.0 and cosines of multiples of 10 degrees:
  ;   0.8660254038 = cos(30), 0.1736481777 = cos(80), 0.3420201433 = cos(70),
  ;   0.7660444431 = cos(40), 0.9396926208 = cos(20), 0.9848077530 = cos(10),
  ;   0.6427876097 = cos(50).
  ps_val1:        dd -0.5, -0.5, -0.8660254038, -0.8660254038
  ps_val2:        dd 1.0, 1.0, 0.8660254038, 0.8660254038
  ps_val3:        dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
  ps_val4:        dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
  ps_val5:        dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
  ps_val6:        dd 0.5, 0.5, -0.6427876097, -0.6427876097
  ps_val7:        dd 1.0, 1.0, -0.6427876097, -0.6427876097

  ; Sign masks for xorps: 0x80000000 is the single-precision sign bit, so a
  ; set dword negates that lane and a zero dword leaves it unchanged.
  ps_p1p1m1m1:    dd 0, 0, 0x80000000, 0x80000000     ; negate lanes 2 and 3
  ps_p1m1p1m1:    dd 0, 0x80000000, 0, 0x80000000     ; negate lanes 1 and 3

  ; Butterfly scale table, one 16-byte row per butterfly (row r is at byte
  ; offset 16*r). The lane 1 / lane 3 magnitudes are numerically
  ; 0.5 / cos((2k+1) * pi / 36):
  ;   row 0: k = 0, 8    row 1: k = 1, 7    row 2: k = 2, 6
  ;   row 3: k = 3, 5    row 4: k = 4 (lanes 2 and 3 are zero)
  ps_cosh:        dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
                  dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
                  dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
                  dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
                  dd 1.0, 0.70710678118654752439, 0.0, 0.0

  ; Same table with lanes 1 and 3 of rows 0-3 negated (row 4 is identical).
  ; Read by BUTTERF_SSE3, which finishes with addsubps instead of xorps + addps.
  ps_cosh_sse3:   dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
                  dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
                  dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
                  dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
                  dd 1.0, 0.70710678118654752439, 0.0, 0.0

  ; Multiplier applied to the byte offsets of the stores into the output buffer.
  %define SBLIMIT 32

  SECTION_TEXT
  ; PSHUFD_SSE_AVX dst, src, imm8
  ; Float-domain substitute for pshufd: shufps with src as both sources, so
  ; dst lane i = src lane ((imm8 >> 2*i) & 3).
  ; The three-operand form is presumably lowered by x86inc for non-AVX builds
  ; (not visible in this file).
  %macro PSHUFD_SSE_AVX 3
      shufps      %1, %2, %2, %3
  %endmacro
  ; PSHUFD_SSE2 dst, src, imm8
  ; Thin wrapper around the SSE2 pshufd instruction; same operand order as
  ; PSHUFD_SSE_AVX so the two are interchangeable behind the PSHUFD alias.
  %macro PSHUFD_SSE2 3
      pshufd      %1, %2, %3
  %endmacro
  ; BUILDINVHIGHLOW_SSE x, y, dst
  ;   in : %1 = {x1,x2,x3,x4}, %2 = {y1,y2,y3,y4}
  ;   out: %3 = {x3,x4,y1,y2}
  ; %3 must not be the same register as %1: the upper half of %3 is written
  ; before the upper half of %1 is read.
  %macro BUILDINVHIGHLOW_SSE 3
      movlhps     %3, %2              ; upper half of %3 <- lower half of %2
      movhlps     %3, %1              ; lower half of %3 <- upper half of %1
  %endmacro
  ; BUILDINVHIGHLOW_AVX x, y, dst
  ; Single-shuffle version producing the same result as BUILDINVHIGHLOW_SSE:
  ; %3 = {x3,x4,y1,y2}. Selector 0x4e takes lanes 2,3 of %1 followed by
  ; lanes 0,1 of %2.
  %macro BUILDINVHIGHLOW_AVX 3
      shufps      %3, %1, %2, 0x4e
  %endmacro
  ; ROTLEFT_SSE x, y, dst
  ;   in : %1 = {x1,x2,x3,x4}, %2 = {y1,y2,y3,y4}
  ;   out: %3 = {x4,y1,y2,y3}
  ; Uses whichever BUILDINVHIGHLOW variant is %define'd when the macro expands.
  ; %3 should be distinct from both inputs, since %2 is read again after %3
  ; has been written.
  %macro ROTLEFT_SSE 3
      BUILDINVHIGHLOW %1, %2, %3      ; %3 = {x3,x4,y1,y2}
      shufps      %3, %3, %2, 0x99    ; lanes 1,2 of %3 then lanes 1,2 of %2
  %endmacro
  ; ROTLEFT_SSSE3 x, y, dst
  ; SSSE3 version producing the same result as ROTLEFT_SSE, %3 = {x4,y1,y2,y3}:
  ; palignr shifts the %2:%1 pair right by 12 bytes and keeps the low 16 bytes.
  %macro ROTLEFT_SSSE3 3
      palignr     %3, %2, %1, 12
  %endmacro
  ; INVERTHL_SSE1 dst, src
  ; dst = src with its 64-bit halves exchanged, {s2,s3,s0,s1}, using only
  ; SSE1 moves. dst must be a different register from src.
  %macro INVERTHL_SSE1 2
      movhlps     %1, %2              ; lower half of dst <- upper half of src
      movlhps     %1, %2              ; upper half of dst <- lower half of src
  %endmacro
  ; INVERTHL_SSE2 dst, src
  ; dst = src with its 64-bit halves exchanged, as one shuffle through the
  ; current PSHUFD alias (selector 0x4e = lanes 2,3,0,1).
  %macro INVERTHL_SSE2 2
      PSHUFD      %1, %2, 0x4e
  %endmacro
  ; BUTTERF_SSE12 v, tmp, row_offset
  ;   %1 = {a,b,c,d}  in/out vector
  ;   %2 = scratch register, clobbered
  ;   %3 = byte offset of the coefficient row inside ps_cosh
  ;
  ; With {k0,k1,k2,k3} = the selected ps_cosh row and
  ;   t = {k0*(a+c), k1*(b+d), k2*(a-c), k3*(b-d)}
  ; the result left in %1 is {t0+t1, t0-t1, t2+t3, t2-t3}.
  %macro BUTTERF_SSE12 3
      INVERTHL    %2, %1                  ; %2 = {c,d,a,b}
      xorps       %1, [ps_p1p1m1m1]       ; %1 = {a,b,-c,-d}
      addps       %1, %2                  ; %1 = {a+c, b+d, a-c, b-d}
      mulps       %1, [ps_cosh + %3]      ; %1 = t
      PSHUFD      %2, %1, 0xb1            ; %2 = {t1,t0,t3,t2}
      xorps       %1, [ps_p1m1p1m1]       ; %1 = {t0,-t1,t2,-t3}
      addps       %1, %2
  %endmacro
  ; BUTTERF_SSE3 v, tmp, row_offset
  ;   %1 = {a,b,c,d}  in/out vector
  ;   %2 = scratch register, clobbered
  ;   %3 = byte offset of the coefficient row inside ps_cosh_sse3
  ;
  ; Same arithmetic result as BUTTERF_SSE12. The table used here already has
  ; lanes 1 and 3 negated, so the final sign flip and add collapse into one
  ; addsubps (which subtracts in even lanes and adds in odd lanes).
  %macro BUTTERF_SSE3 3
      INVERTHL    %2, %1                          ; %2 = {c,d,a,b}
      xorps       %1, %1, [ps_p1p1m1m1]           ; %1 = {a,b,-c,-d}
      addps       %1, %1, %2                      ; %1 = {a+c, b+d, a-c, b-d}
      mulps       %1, %1, [ps_cosh_sse3 + %3]     ; scale, odd lanes negated
      PSHUFD      %2, %1, 0xb1                    ; swap lanes within each pair
      addsubps    %1, %1, %2
  %endmacro
  ; STORE v, tmp, addr
  ; Scatters the four floats of %1 to memory: lane k is written to
  ; [%3 + k*4*SBLIMIT] for k = 0..3.
  ;   %1 = source vector; left with the lanes of each pair swapped
  ;   %2 = scratch register, clobbered
  ;   %3 = address expression for lane 0
  %macro STORE 3
      movhlps     %2, %1                      ; %2 lane 0 <- lane 2 of %1
      movss       [%3], %1                    ; lane 0
      movss       [%3 + 8*SBLIMIT], %2        ; lane 2
      shufps      %1, %1, 0xb1                ; %1 = {l1,l0,l3,l2}
      movss       [%3 + 4*SBLIMIT], %1        ; lane 1
      movhlps     %2, %1                      ; %2 lane 0 <- original lane 3
      movss       [%3 + 12*SBLIMIT], %2       ; lane 3
  %endmacro
  ; LOADA64 dst, addr
  ; Loads 16 bytes from [%2] into %1 as two independent 8-byte halves.
  %macro LOADA64 2
      movlps      %1, [%2]            ; lanes 0,1
      movhps      %1, [%2 + 8]        ; lanes 2,3
  %endmacro
  ; STOREA64 addr, src
  ; Stores the 16 bytes of %2 to [%1] as two independent 8-byte halves.
  ; Note the operand order is the reverse of LOADA64 (address first).
  %macro STOREA64 2
      movlps      [%1], %2            ; lanes 0,1
      movhps      [%1 + 8], %2        ; lanes 2,3
  %endmacro
  ; DEFINE_IMDCT suffix
  ; Emits the function imdct36_float_<suffix>(out, buf, in, win), built from
  ; whichever PSHUFD / INVERTHL / BUTTERF / BUILDINVHIGHLOW / ROTLEFT variants
  ; are %define'd at the point of expansion.
  ;
  ; Memory touched by the generated code:
  ;   in  : 18 floats read   ([inq] .. [inq + 71]); never written
  ;   win : 40 floats read   ([winq] .. [winq + 159]), as mulps memory operands
  ;   buf : 18 floats read and added into the output, then all 18 overwritten
  ;   out : 18 floats written, one every 4*SBLIMIT bytes
  ;
  ; The "4,4,9" on cglobal is presumably x86inc's args / GPRs / XMM-register
  ; counts (x86inc is not visible here); m8 is only referenced on the
  ; ARCH_X86_64 path.
  %macro DEFINE_IMDCT 1
  cglobal imdct36_float_%1, 4,4,9, out, buf, in, win

      ; ---- step 1: for (i = 17; i >= 1; i--) in[i] += in[i-1]; ----
      LOADA64     m0, inq                     ; m0 = in[0..3]
      LOADA64     m1, inq + 16                ; m1 = in[4..7]

      ROTLEFT     m0, m1, m5                  ; m5 = in[3..6]

      PSHUFD      m6, m0, 0x93                ; m6 = {in3,in0,in1,in2}
      andps       m6, m6, [ps_mask]           ; lane 0 has no predecessor
      addps       m0, m0, m6

      LOADA64     m2, inq + 32                ; m2 = in[8..11]

      ROTLEFT     m1, m2, m7                  ; m7 = in[7..10]

      addps       m1, m1, m5
      LOADA64     m3, inq + 48                ; m3 = in[12..15]

      ROTLEFT     m2, m3, m5                  ; m5 = in[11..14]

      xorps       m4, m4, m4
      movlps      m4, [inq+64]                ; m4 = {in16, in17, 0, 0}
      BUILDINVHIGHLOW m3, m4, m6
      shufps      m6, m6, m4, 0xa9
      addps       m4, m4, m6
      addps       m2, m2, m7
      addps       m3, m3, m5

      ; ---- step 2: for (i = 17; i >= 3; i -= 2) in[i] += in[i-2]; ----
      movlhps     m5, m5, m0
      andps       m5, m5, [ps_mask3]
      BUILDINVHIGHLOW m0, m1, m7
      andps       m7, m7, [ps_mask2]          ; only odd lanes are updated
      addps       m0, m0, m5
      BUILDINVHIGHLOW m1, m2, m6
      andps       m6, m6, [ps_mask2]
      addps       m1, m1, m7
      BUILDINVHIGHLOW m2, m3, m7
      andps       m7, m7, [ps_mask2]
      addps       m2, m2, m6
      movhlps     m6, m6, m3
      andps       m6, m6, [ps_mask4]
      addps       m3, m3, m7
      addps       m4, m4, m6

      ; ---- step 3: populate tmp[] ----
      movlhps     m6, m1, m5                  ; upper half <- zeroed lower half of m5
      subps       m6, m6, m4

      subps       m5, m0, m3

      ; On x86-64 the m0 - m3 difference is parked in m8 and reused below;
      ; otherwise it is recomputed after m5 has been used as a temporary.
      ; NOTE(review): %ifdef only tests whether ARCH_X86_64 is defined. If the
      ; build defines it as 0 for 32-bit targets, these tests should be
      ; "%if ARCH_X86_64" -- confirm against the build configuration.
  %ifdef ARCH_X86_64
      SWAP        m5, m8
  %endif
      mulps       m7, m2, [ps_val1]
  %ifdef ARCH_X86_64
      mulps       m5, m8, [ps_val2]
  %else
      mulps       m5, m5, [ps_val2]
  %endif
      addps       m7, m7, m5

      mulps       m5, m6, [ps_val1]
      subps       m7, m7, m5

  %ifndef ARCH_X86_64
      subps       m5, m0, m3
  %else
      SWAP        m5, m8
  %endif
      subps       m5, m5, m6
      addps       m5, m5, m2

      shufps      m6, m4, m3, 0xe4
      subps       m6, m6, m2
      mulps       m6, m6, [ps_val3]

      addps       m4, m4, m1
      mulps       m4, m4, [ps_val4]

      shufps      m1, m1, m0, 0xe4
      addps       m1, m1, m2
      mulps       m1, m1, [ps_val5]

      mulps       m3, m3, [ps_val6]
      mulps       m0, m0, [ps_val7]
      addps       m0, m0, m3

      xorps       m2, m1, [ps_p1p1m1m1]
      subps       m2, m2, m4
      addps       m2, m2, m0

      addps       m3, m4, m0
      subps       m3, m3, m6
      xorps       m3, m3, [ps_p1p1m1m1]

      shufps      m0, m0, m4, 0xe4
      subps       m0, m0, m1
      addps       m0, m0, m6

      BUILDINVHIGHLOW m2, m3, m4
      shufps      m3, m3, m2, 0x4e

      ; ---- step 4: butterflies ----
      ; Here tmp = {halves-swapped m0, halves-swapped m7, m3, m4, m5}.
      BUTTERF     m0, m1, 0
      BUTTERF     m7, m2, 16
      BUTTERF     m3, m6, 32
      BUTTERF     m4, m1, 48

      ; Last (partial) row, done inline with ps_cosh row 4.
      mulps       m5, m5, [ps_cosh + 64]
      PSHUFD      m1, m5, 0xe1                ; swap lanes 0 and 1
      xorps       m5, m5, [ps_p1m1p1m1]
      addps       m5, m5, m1

      ; ---- step 5: permute ----
      ; source reg / values held   =>  values wanted / destination reg
      ; m0   0  1  2  3            =>   2  6 10 14   m1
      ; m7   4  5  6  7            =>   3  7 11 15   m2
      ; m3   8  9 10 11            =>  17 13  9  5   m3
      ; m4  12 13 14 15            =>  16 12  8  4   m5
      ; m5  16 17 xx xx            =>   0  1 xx xx   m0
      unpckhps    m1, m0, m7
      unpckhps    m6, m3, m4
      movhlps     m2, m6, m1
      movlhps     m1, m1, m6

      unpcklps    m5, m5, m4
      unpcklps    m3, m3, m7
      movhlps     m4, m3, m5
      movlhps     m5, m5, m3
      SWAP        m4, m3
      ; permutation done

      ; ---- step 6: out = result * win[0..19] + buf (old contents) ----
      PSHUFD      m6, m2, 0xb1
      movlps      m7, [bufq + 64]             ; only lanes 0,1 are stored below
      mulps       m6, m6, [winq + 16*4]
      addps       m6, m6, m7
      movss       [outq + 64*SBLIMIT], m6
      shufps      m6, m6, m6, 0xb1
      movss       [outq + 68*SBLIMIT], m6

      mulps       m6, m3, [winq + 4*4]
      LOADA64     m4, bufq + 16
      addps       m6, m6, m4
      STORE       m6, m7, outq + 16*SBLIMIT

      shufps      m4, m0, m3, 0xb5
      mulps       m4, m4, [winq + 8*4]
      LOADA64     m7, bufq + 32
      addps       m4, m4, m7
      STORE       m4, m6, outq + 32*SBLIMIT

      shufps      m3, m3, m2, 0xb1
      mulps       m3, m3, [winq + 12*4]
      LOADA64     m7, bufq + 48
      addps       m3, m3, m7
      STORE       m3, m7, outq + 48*SBLIMIT

      mulps       m2, m2, [winq]
      LOADA64     m6, bufq
      addps       m2, m2, m6
      STORE       m2, m7, outq

      ; ---- step 7: buf = result * win[20..39] (all buf reads are done) ----
      mulps       m4, m1, [winq + 20*4]
      STOREA64    bufq, m4

      mulps       m3, m5, [winq + 24*4]
      STOREA64    bufq + 16, m3

      shufps      m0, m0, m5, 0xb0
      mulps       m0, m0, [winq + 28*4]
      STOREA64    bufq + 32, m0

      shufps      m5, m5, m1, 0xb1
      mulps       m5, m5, [winq + 32*4]
      STOREA64    bufq + 48, m5

      shufps      m1, m1, m1, 0xb1
      mulps       m1, m1, [winq + 36*4]
      movlps      [bufq + 64], m1             ; last two floats of buf
      RET
  %endmacro
  ; ---------------------------------------------------------------------------
  ; Instantiate imdct36_float_<isa> once per instruction set.
  ;
  ; Each DEFINE_IMDCT expansion picks up the helper aliases as they are set at
  ; that point, so every group below only re-%defines the helpers that change
  ; for the next instruction set. The order of the groups therefore matters.
  ; ---------------------------------------------------------------------------

  ; SSE: float shuffles and movhlps/movlhps only.
  %define PSHUFD          PSHUFD_SSE_AVX
  %define INVERTHL        INVERTHL_SSE1
  %define BUTTERF         BUTTERF_SSE12
  %define BUILDINVHIGHLOW BUILDINVHIGHLOW_SSE
  %define ROTLEFT         ROTLEFT_SSE
  INIT_XMM
  DEFINE_IMDCT sse

  ; SSE2: pshufd becomes available.
  %define PSHUFD          PSHUFD_SSE2
  %define INVERTHL        INVERTHL_SSE2
  DEFINE_IMDCT sse2

  ; SSE3: butterfly finishes with addsubps.
  %define BUTTERF         BUTTERF_SSE3
  DEFINE_IMDCT sse3

  ; SSSE3: rotation via palignr.
  %define ROTLEFT         ROTLEFT_SSSE3
  DEFINE_IMDCT ssse3

  ; AVX: single-shuffle BUILDINVHIGHLOW, and back to the shufps-based PSHUFD.
  %define BUILDINVHIGHLOW BUILDINVHIGHLOW_AVX
  %define PSHUFD          PSHUFD_SSE_AVX
  INIT_AVX
  DEFINE_IMDCT avx