You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

297 lines
12KB

  1. /*
  2. * 32 point SSE-optimized DCT transform
  3. * Copyright (c) 2010 Vitor Sessak
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include <stdint.h>
  22. #include "libavutil/x86_cpu.h"
  23. #include "libavutil/mem.h"
  24. #include "libavcodec/dsputil.h"
  25. #include "fft.h"
  26. DECLARE_ALIGNED(16, static const float, b1)[] = {
  27. 0.500603, 0.505471, 0.515447, 0.531043,
  28. 0.553104, 0.582935, 0.622504, 0.674808,
  29. -1.169440, -0.972568, -0.839350, -0.744536,
  30. -10.190008, -3.407609, -2.057781, -1.484165,
  31. 0.502419, 0.522499, 0.566944, 0.646822,
  32. 0.788155, 1.060678, 1.722447, 5.101149,
  33. 0.509796, 0.601345, 0.899976, 2.562916,
  34. 1.000000, 1.000000, 1.306563, 0.541196,
  35. 1.000000, 0.707107, 1.000000, -0.707107
  36. };
  37. DECLARE_ALIGNED(16, static const int32_t, smask)[4] = {
  38. 0, 0, 0x80000000, 0x80000000
  39. };
  /*
   * Inline-asm building blocks (AT&T syntax, source operand first).  Each
   * macro expands to adjacent string literals meant to be pasted into an
   * extended-asm template, hence the doubled "%%" in front of register names.
   */

  /*
   * Butterfly operator on two distinct XMM registers:
   *
   *     tmp = a;
   *     a   = (a - b) * c;
   *     b   = tmp + b;          // i.e. old a + b
   *
   * a, b, tmp are bare register names (xmm0 ...).  c is pasted verbatim as
   * the mulps source, so the caller supplies either a memory operand such as
   * 48(%2) or an already-escaped register such as %%xmm2.
   */
  #define BUTTERFLY(a,b,c,tmp)                 \
      "movaps %%" #a ", %%" #tmp " \n\t"       \
      "subps %%" #b ", %%" #a " \n\t"          \
      "addps %%" #tmp ", %%" #b " \n\t"        \
      "mulps " #c ", %%" #a " \n\t"

  /*
   * Same as BUTTERFLY when vectors a and b overlap, i.e. both halves of the
   * butterfly live in the single register val:
   *
   *     tmp = val;
   *     val = shuffle(val, shuf);   // lane permutation selected by shuf
   *     tmp ^= mask;                // negate lanes whose mask sign bit is set
   *     val = (val + tmp) * cos;
   *
   * val, mask, cos, tmp are bare register names; shuf is the shufps
   * immediate including its '$' prefix.
   */
  #define BUTTERFLY0(val, mask, cos, tmp, shuf)                      \
      "movaps %%" #val ", %%" #tmp " \n\t"                           \
      "shufps " #shuf ", %%" #val ",%%" #val " \n\t"                 \
      "xorps %%" #mask ", %%" #tmp " \n\t" /* flip signs */          \
      "addps %%" #tmp ", %%" #val " \n\t"                            \
      "mulps %%" #cos ", %%" #val " \n\t"

  /* $0x1b reverses the four lanes: {v0,v1,v2,v3} -> {v3,v2,v1,v0}. */
  #define BUTTERFLY2(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0x1b)
  /* $0xb1 swaps the lanes of each adjacent pair: {v0,v1,v2,v3} -> {v1,v0,v3,v2}. */
  #define BUTTERFLY3(val, mask, cos, tmp) BUTTERFLY0(val, mask, cos, tmp, $0xb1)
  55. void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
  56. {
  57. int32_t tmp1 = 0;
  58. __asm__ volatile(
  59. /* pass 1 */
  60. "movaps (%4), %%xmm0 \n\t"
  61. "movaps 112(%4), %%xmm1 \n\t"
  62. "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
  63. BUTTERFLY(xmm0, xmm1, (%2), xmm3)
  64. "movaps 64(%4), %%xmm7 \n\t"
  65. "movaps 48(%4), %%xmm4 \n\t"
  66. "shufps $0x1b, %%xmm4, %%xmm4 \n\t"
  67. BUTTERFLY(xmm7, xmm4, 48(%2), xmm3)
  68. /* pass 2 */
  69. "movaps 64(%2), %%xmm2 \n\t"
  70. BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)
  71. "movaps %%xmm1, 48(%1) \n\t"
  72. "movaps %%xmm4, (%1) \n\t"
  73. /* pass 1 */
  74. "movaps 16(%4), %%xmm1 \n\t"
  75. "movaps 96(%4), %%xmm6 \n\t"
  76. "shufps $0x1b, %%xmm6, %%xmm6 \n\t"
  77. BUTTERFLY(xmm1, xmm6, 16(%2), xmm3)
  78. "movaps 80(%4), %%xmm4 \n\t"
  79. "movaps 32(%4), %%xmm5 \n\t"
  80. "shufps $0x1b, %%xmm5, %%xmm5 \n\t"
  81. BUTTERFLY(xmm4, xmm5, 32(%2), xmm3)
  82. /* pass 2 */
  83. BUTTERFLY(xmm0, xmm7, %%xmm2, xmm3)
  84. "movaps 80(%2), %%xmm2 \n\t"
  85. BUTTERFLY(xmm6, xmm5, %%xmm2, xmm3)
  86. BUTTERFLY(xmm1, xmm4, %%xmm2, xmm3)
  87. /* pass 3 */
  88. "movaps 96(%2), %%xmm2 \n\t"
  89. "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
  90. BUTTERFLY(xmm0, xmm1, %%xmm2, xmm3)
  91. "movaps %%xmm0, 112(%1) \n\t"
  92. "movaps %%xmm1, 96(%1) \n\t"
  93. "movaps 0(%1), %%xmm0 \n\t"
  94. "shufps $0x1b, %%xmm5, %%xmm5 \n\t"
  95. BUTTERFLY(xmm0, xmm5, %%xmm2, xmm3)
  96. "movaps 48(%1), %%xmm1 \n\t"
  97. "shufps $0x1b, %%xmm6, %%xmm6 \n\t"
  98. BUTTERFLY(xmm1, xmm6, %%xmm2, xmm3)
  99. "movaps %%xmm1, 48(%1) \n\t"
  100. "shufps $0x1b, %%xmm4, %%xmm4 \n\t"
  101. BUTTERFLY(xmm7, xmm4, %%xmm2, xmm3)
  102. /* pass 4 */
  103. "movaps (%3), %%xmm3 \n\t"
  104. "movaps 112(%2), %%xmm2 \n\t"
  105. BUTTERFLY2(xmm5, xmm3, xmm2, xmm1)
  106. BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
  107. "movaps %%xmm0, 16(%1) \n\t"
  108. BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)
  109. "movaps %%xmm6, 32(%1) \n\t"
  110. "movaps 48(%1), %%xmm0 \n\t"
  111. BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
  112. "movaps %%xmm0, 48(%1) \n\t"
  113. BUTTERFLY2(xmm4, xmm3, xmm2, xmm1)
  114. BUTTERFLY2(xmm7, xmm3, xmm2, xmm1)
  115. "movaps 96(%1), %%xmm6 \n\t"
  116. BUTTERFLY2(xmm6, xmm3, xmm2, xmm1)
  117. "movaps 112(%1), %%xmm0 \n\t"
  118. BUTTERFLY2(xmm0, xmm3, xmm2, xmm1)
  119. /* pass 5 */
  120. "movaps 128(%2), %%xmm2 \n\t"
  121. "shufps $0xCC, %%xmm3,%%xmm3 \n\t"
  122. BUTTERFLY3(xmm5, xmm3, xmm2, xmm1)
  123. "movaps %%xmm5, (%1) \n\t"
  124. "movaps 16(%1), %%xmm1 \n\t"
  125. BUTTERFLY3(xmm1, xmm3, xmm2, xmm5)
  126. "movaps %%xmm1, 16(%1) \n\t"
  127. BUTTERFLY3(xmm4, xmm3, xmm2, xmm5)
  128. "movaps %%xmm4, 64(%1) \n\t"
  129. BUTTERFLY3(xmm7, xmm3, xmm2, xmm5)
  130. "movaps %%xmm7, 80(%1) \n\t"
  131. "movaps 32(%1), %%xmm5 \n\t"
  132. BUTTERFLY3(xmm5, xmm3, xmm2, xmm7)
  133. "movaps %%xmm5, 32(%1) \n\t"
  134. "movaps 48(%1), %%xmm4 \n\t"
  135. BUTTERFLY3(xmm4, xmm3, xmm2, xmm7)
  136. "movaps %%xmm4, 48(%1) \n\t"
  137. BUTTERFLY3(xmm6, xmm3, xmm2, xmm7)
  138. "movaps %%xmm6, 96(%1) \n\t"
  139. BUTTERFLY3(xmm0, xmm3, xmm2, xmm7)
  140. "movaps %%xmm0, 112(%1) \n\t"
  141. /* pass 6, no SIMD... */
  142. "movss 56(%1), %%xmm3 \n\t"
  143. "movl 4(%1), %0 \n\t"
  144. "addss 60(%1), %%xmm3 \n\t"
  145. "movss 72(%1), %%xmm7 \n\t"
  146. "addss %%xmm3, %%xmm4 \n\t"
  147. "movss 52(%1), %%xmm2 \n\t"
  148. "addss %%xmm3, %%xmm2 \n\t"
  149. "movss 24(%1), %%xmm3 \n\t"
  150. "addss 28(%1), %%xmm3 \n\t"
  151. "addss 76(%1), %%xmm7 \n\t"
  152. "addss %%xmm3, %%xmm1 \n\t"
  153. "addss %%xmm4, %%xmm5 \n\t"
  154. "movss %%xmm1, 16(%1) \n\t"
  155. "movss 20(%1), %%xmm1 \n\t"
  156. "addss %%xmm3, %%xmm1 \n\t"
  157. "movss 40(%1), %%xmm3 \n\t"
  158. "movss %%xmm1, 48(%1) \n\t"
  159. "addss 44(%1), %%xmm3 \n\t"
  160. "movss 20(%1), %%xmm1 \n\t"
  161. "addss %%xmm3, %%xmm4 \n\t"
  162. "addss %%xmm2, %%xmm3 \n\t"
  163. "addss 28(%1), %%xmm1 \n\t"
  164. "movss %%xmm3, 40(%1) \n\t"
  165. "addss 36(%1), %%xmm2 \n\t"
  166. "movss 8(%1), %%xmm3 \n\t"
  167. "movss %%xmm2, 56(%1) \n\t"
  168. "addss 12(%1), %%xmm3 \n\t"
  169. "movss %%xmm5, 8(%1) \n\t"
  170. "movss %%xmm3, 32(%1) \n\t"
  171. "movss 52(%1), %%xmm2 \n\t"
  172. "movss 80(%1), %%xmm3 \n\t"
  173. "movss 120(%1), %%xmm5 \n\t"
  174. "movss %%xmm1, 80(%1) \n\t"
  175. "movss %%xmm4, 24(%1) \n\t"
  176. "addss 124(%1), %%xmm5 \n\t"
  177. "movss 64(%1), %%xmm1 \n\t"
  178. "addss 60(%1), %%xmm2 \n\t"
  179. "addss %%xmm5, %%xmm0 \n\t"
  180. "addss 116(%1), %%xmm5 \n\t"
  181. "movl %0, 64(%1) \n\t"
  182. "addss %%xmm0, %%xmm6 \n\t"
  183. "addss %%xmm6, %%xmm1 \n\t"
  184. "movl 12(%1), %0 \n\t"
  185. "movss %%xmm1, 4(%1) \n\t"
  186. "movss 88(%1), %%xmm1 \n\t"
  187. "movl %0, 96(%1) \n\t"
  188. "addss 92(%1), %%xmm1 \n\t"
  189. "movss 104(%1), %%xmm4 \n\t"
  190. "movl 28(%1), %0 \n\t"
  191. "addss 108(%1), %%xmm4 \n\t"
  192. "addss %%xmm4, %%xmm0 \n\t"
  193. "addss %%xmm1, %%xmm3 \n\t"
  194. "addss 84(%1), %%xmm1 \n\t"
  195. "addss %%xmm5, %%xmm4 \n\t"
  196. "addss %%xmm3, %%xmm6 \n\t"
  197. "addss %%xmm0, %%xmm3 \n\t"
  198. "addss %%xmm7, %%xmm0 \n\t"
  199. "addss 100(%1), %%xmm5 \n\t"
  200. "addss %%xmm4, %%xmm7 \n\t"
  201. "movl %0, 112(%1) \n\t"
  202. "movss %%xmm0, 28(%1) \n\t"
  203. "movss 36(%1), %%xmm0 \n\t"
  204. "movss %%xmm7, 36(%1) \n\t"
  205. "addss %%xmm1, %%xmm4 \n\t"
  206. "movss 116(%1), %%xmm7 \n\t"
  207. "addss %%xmm2, %%xmm0 \n\t"
  208. "addss 124(%1), %%xmm7 \n\t"
  209. "movss %%xmm0, 72(%1) \n\t"
  210. "movss 44(%1), %%xmm0 \n\t"
  211. "movss %%xmm6, 12(%1) \n\t"
  212. "movss %%xmm3, 20(%1) \n\t"
  213. "addss %%xmm0, %%xmm2 \n\t"
  214. "movss %%xmm4, 44(%1) \n\t"
  215. "movss %%xmm2, 88(%1) \n\t"
  216. "addss 60(%1), %%xmm0 \n\t"
  217. "movl 60(%1), %0 \n\t"
  218. "movl %0, 120(%1) \n\t"
  219. "movss %%xmm0, 104(%1) \n\t"
  220. "addss %%xmm5, %%xmm1 \n\t"
  221. "addss 68(%1), %%xmm5 \n\t"
  222. "movss %%xmm1, 52(%1) \n\t"
  223. "movss %%xmm5, 60(%1) \n\t"
  224. "movss 68(%1), %%xmm1 \n\t"
  225. "movss 100(%1), %%xmm5 \n\t"
  226. "addss %%xmm7, %%xmm5 \n\t"
  227. "addss 108(%1), %%xmm7 \n\t"
  228. "addss %%xmm5, %%xmm1 \n\t"
  229. "movss 84(%1), %%xmm2 \n\t"
  230. "addss 92(%1), %%xmm2 \n\t"
  231. "addss %%xmm2, %%xmm5 \n\t"
  232. "movss %%xmm1, 68(%1) \n\t"
  233. "addss %%xmm7, %%xmm2 \n\t"
  234. "movss 76(%1), %%xmm1 \n\t"
  235. "movss %%xmm2, 84(%1) \n\t"
  236. "movss %%xmm5, 76(%1) \n\t"
  237. "movss 108(%1), %%xmm2 \n\t"
  238. "addss %%xmm1, %%xmm7 \n\t"
  239. "addss 124(%1), %%xmm2 \n\t"
  240. "addss %%xmm2, %%xmm1 \n\t"
  241. "addss 92(%1), %%xmm2 \n\t"
  242. "movss %%xmm1, 100(%1) \n\t"
  243. "movss %%xmm2, 108(%1) \n\t"
  244. "movss 92(%1), %%xmm2 \n\t"
  245. "movss %%xmm7, 92(%1) \n\t"
  246. "addss 124(%1), %%xmm2 \n\t"
  247. "movss %%xmm2, 116(%1) \n\t"
  248. :"+&r"(tmp1)
  249. :"r"(out), "r"(b1), "r"(smask), "r"(in)
  250. :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3",
  251. "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
  252. "memory"
  253. );
  254. }