You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1174 lines
66KB

  1. /*
  2. * Simple IDCT MMX
  3. *
  4. * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * This file is part of Libav.
  7. *
  8. * Libav is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * Libav is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with Libav; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. #include "libavutil/internal.h"
  23. #include "libavutil/mem.h"
  24. #include "libavutil/x86/asm.h"
  25. #include "libavcodec/idctdsp.h"
  26. #include "idctdsp.h"
  27. #include "simple_idct.h"
  28. #if HAVE_INLINE_ASM
  29. /*
  30. 23170.475006
  31. 22725.260826
  32. 21406.727617
  33. 19265.545870
  34. 16384.000000
  35. 12872.826198
  36. 8866.956905
  37. 4520.335430
  38. */
  39. #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  40. #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  41. #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  42. #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  43. #define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
  44. #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  45. #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  46. #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  47. #define ROW_SHIFT 11
  48. #define COL_SHIFT 20 // 6
  49. DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
  50. DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
  51. DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
  52. 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
  53. // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
  54. // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
  55. 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
  56. // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
  57. // 0, 0, 0, 0,
  58. // 0, 0, 0, 0,
  59. C4, C4, C4, C4,
  60. C4, -C4, C4, -C4,
  61. C2, C6, C2, C6,
  62. C6, -C2, C6, -C2,
  63. C1, C3, C1, C3,
  64. C5, C7, C5, C7,
  65. C3, -C7, C3, -C7,
  66. -C1, -C5, -C1, -C5,
  67. C5, -C1, C5, -C1,
  68. C7, C3, C7, C3,
  69. C7, -C5, C7, -C5,
  70. C3, -C1, C3, -C1
  71. };
  72. static inline void idct(int16_t *block)
  73. {
  74. DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
  75. int16_t * const temp= (int16_t*)align_tmp;
  76. __asm__ volatile(
  77. #if 0 //Alternative, simpler variant
  78. #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  79. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  80. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  81. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  82. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  83. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  84. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  85. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  86. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  87. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  88. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  89. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  90. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  91. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  92. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  93. #rounder ", %%mm4 \n\t"\
  94. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  95. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  96. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  97. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  98. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  99. #rounder ", %%mm0 \n\t"\
  100. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  101. "paddd %%mm0, %%mm0 \n\t" \
  102. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  103. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  104. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  105. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  106. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  107. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  108. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  109. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  110. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  111. "psrad $" #shift ", %%mm7 \n\t"\
  112. "psrad $" #shift ", %%mm4 \n\t"\
  113. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  114. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  115. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  116. "psrad $" #shift ", %%mm1 \n\t"\
  117. "psrad $" #shift ", %%mm2 \n\t"\
  118. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  119. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  120. "movq %%mm7, " #dst " \n\t"\
  121. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  122. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  123. "movq %%mm2, 24+" #dst " \n\t"\
  124. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  125. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  126. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  127. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  128. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  129. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  130. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  131. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  132. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  133. "psrad $" #shift ", %%mm2 \n\t"\
  134. "psrad $" #shift ", %%mm0 \n\t"\
  135. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  136. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  137. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  138. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  139. "psrad $" #shift ", %%mm6 \n\t"\
  140. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  141. "movq %%mm2, 8+" #dst " \n\t"\
  142. "psrad $" #shift ", %%mm4 \n\t"\
  143. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  144. "movq %%mm4, 16+" #dst " \n\t"\
  145. #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
  146. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  147. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  148. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  149. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  150. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  151. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  152. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  153. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  154. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  155. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  156. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  157. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  158. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  159. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  160. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  161. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  162. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  163. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  164. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  165. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  166. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  167. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  168. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  169. "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
  170. "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
  171. "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  172. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  173. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  174. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  175. "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
  176. "psrad $" #shift ", %%mm7 \n\t"\
  177. "psrad $" #shift ", %%mm4 \n\t"\
  178. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  179. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  180. "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  181. "psrad $" #shift ", %%mm0 \n\t"\
  182. "psrad $" #shift ", %%mm2 \n\t"\
  183. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  184. "movd %%mm7, " #dst " \n\t"\
  185. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  186. "movd %%mm0, 16+" #dst " \n\t"\
  187. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  188. "movd %%mm2, 96+" #dst " \n\t"\
  189. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  190. "movd %%mm4, 112+" #dst " \n\t"\
  191. "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
  192. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  193. "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  194. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  195. "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  196. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  197. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  198. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  199. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  200. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  201. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  202. "psrad $" #shift ", %%mm2 \n\t"\
  203. "psrad $" #shift ", %%mm5 \n\t"\
  204. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  205. "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
  206. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  207. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  208. "psrad $" #shift ", %%mm6 \n\t"\
  209. "psrad $" #shift ", %%mm4 \n\t"\
  210. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  211. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  212. "movd %%mm2, 32+" #dst " \n\t"\
  213. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  214. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  215. "movd %%mm6, 48+" #dst " \n\t"\
  216. "movd %%mm4, 64+" #dst " \n\t"\
  217. "movd %%mm5, 80+" #dst " \n\t"\
  218. #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  219. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  220. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  221. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  222. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  223. "movq "MANGLE(wm1010)", %%mm4 \n\t"\
  224. "pand %%mm0, %%mm4 \n\t"\
  225. "por %%mm1, %%mm4 \n\t"\
  226. "por %%mm2, %%mm4 \n\t"\
  227. "por %%mm3, %%mm4 \n\t"\
  228. "packssdw %%mm4,%%mm4 \n\t"\
  229. "movd %%mm4, %%eax \n\t"\
  230. "orl %%eax, %%eax \n\t"\
  231. "jz 1f \n\t"\
  232. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  233. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  234. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  235. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  236. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  237. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  238. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  239. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  240. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  241. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  242. #rounder ", %%mm4 \n\t"\
  243. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  244. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  245. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  246. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  247. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  248. #rounder ", %%mm0 \n\t"\
  249. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  250. "paddd %%mm0, %%mm0 \n\t" \
  251. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  252. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  253. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  254. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  255. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  256. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  257. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  258. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  259. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  260. "psrad $" #shift ", %%mm7 \n\t"\
  261. "psrad $" #shift ", %%mm4 \n\t"\
  262. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  263. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  264. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  265. "psrad $" #shift ", %%mm1 \n\t"\
  266. "psrad $" #shift ", %%mm2 \n\t"\
  267. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  268. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  269. "movq %%mm7, " #dst " \n\t"\
  270. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  271. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  272. "movq %%mm2, 24+" #dst " \n\t"\
  273. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  274. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  275. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  276. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  277. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  278. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  279. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  280. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  281. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  282. "psrad $" #shift ", %%mm2 \n\t"\
  283. "psrad $" #shift ", %%mm0 \n\t"\
  284. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  285. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  286. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  287. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  288. "psrad $" #shift ", %%mm6 \n\t"\
  289. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  290. "movq %%mm2, 8+" #dst " \n\t"\
  291. "psrad $" #shift ", %%mm4 \n\t"\
  292. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  293. "movq %%mm4, 16+" #dst " \n\t"\
  294. "jmp 2f \n\t"\
  295. "1: \n\t"\
  296. "pslld $16, %%mm0 \n\t"\
  297. "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
  298. "psrad $13, %%mm0 \n\t"\
  299. "packssdw %%mm0, %%mm0 \n\t"\
  300. "movq %%mm0, " #dst " \n\t"\
  301. "movq %%mm0, 8+" #dst " \n\t"\
  302. "movq %%mm0, 16+" #dst " \n\t"\
  303. "movq %%mm0, 24+" #dst " \n\t"\
  304. "2: \n\t"
  305. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  306. ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
  307. /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
  308. ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
  309. ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
  310. DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
  311. DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
  312. DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
  313. //IDCT( src0, src4, src1, src5, dst, shift)
  314. COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  315. COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  316. COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  317. COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  318. #else
  319. #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  320. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  321. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  322. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  323. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  324. "movq "MANGLE(wm1010)", %%mm4 \n\t"\
  325. "pand %%mm0, %%mm4 \n\t"\
  326. "por %%mm1, %%mm4 \n\t"\
  327. "por %%mm2, %%mm4 \n\t"\
  328. "por %%mm3, %%mm4 \n\t"\
  329. "packssdw %%mm4,%%mm4 \n\t"\
  330. "movd %%mm4, %%eax \n\t"\
  331. "orl %%eax, %%eax \n\t"\
  332. "jz 1f \n\t"\
  333. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  334. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  335. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  336. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  337. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  338. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  339. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  340. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  341. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  342. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  343. #rounder ", %%mm4 \n\t"\
  344. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  345. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  346. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  347. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  348. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  349. #rounder ", %%mm0 \n\t"\
  350. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  351. "paddd %%mm0, %%mm0 \n\t" \
  352. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  353. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  354. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  355. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  356. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  357. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  358. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  359. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  360. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  361. "psrad $" #shift ", %%mm7 \n\t"\
  362. "psrad $" #shift ", %%mm4 \n\t"\
  363. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  364. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  365. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  366. "psrad $" #shift ", %%mm1 \n\t"\
  367. "psrad $" #shift ", %%mm2 \n\t"\
  368. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  369. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  370. "movq %%mm7, " #dst " \n\t"\
  371. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  372. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  373. "movq %%mm2, 24+" #dst " \n\t"\
  374. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  375. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  376. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  377. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  378. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  379. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  380. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  381. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  382. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  383. "psrad $" #shift ", %%mm2 \n\t"\
  384. "psrad $" #shift ", %%mm0 \n\t"\
  385. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  386. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  387. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  388. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  389. "psrad $" #shift ", %%mm6 \n\t"\
  390. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  391. "movq %%mm2, 8+" #dst " \n\t"\
  392. "psrad $" #shift ", %%mm4 \n\t"\
  393. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  394. "movq %%mm4, 16+" #dst " \n\t"\
  395. "jmp 2f \n\t"\
  396. "1: \n\t"\
  397. "pslld $16, %%mm0 \n\t"\
  398. "paddd "MANGLE(d40000)", %%mm0 \n\t"\
  399. "psrad $13, %%mm0 \n\t"\
  400. "packssdw %%mm0, %%mm0 \n\t"\
  401. "movq %%mm0, " #dst " \n\t"\
  402. "movq %%mm0, 8+" #dst " \n\t"\
  403. "movq %%mm0, 16+" #dst " \n\t"\
  404. "movq %%mm0, 24+" #dst " \n\t"\
  405. "2: \n\t"
  406. #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
  407. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  408. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  409. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  410. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  411. "movq %%mm0, %%mm4 \n\t"\
  412. "por %%mm1, %%mm4 \n\t"\
  413. "por %%mm2, %%mm4 \n\t"\
  414. "por %%mm3, %%mm4 \n\t"\
  415. "packssdw %%mm4,%%mm4 \n\t"\
  416. "movd %%mm4, %%eax \n\t"\
  417. "orl %%eax, %%eax \n\t"\
  418. "jz " #bt " \n\t"\
  419. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  420. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  421. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  422. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  423. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  424. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  425. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  426. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  427. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  428. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  429. #rounder ", %%mm4 \n\t"\
  430. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  431. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  432. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  433. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  434. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  435. #rounder ", %%mm0 \n\t"\
  436. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  437. "paddd %%mm0, %%mm0 \n\t" \
  438. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  439. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  440. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  441. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  442. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  443. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  444. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  445. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  446. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  447. "psrad $" #shift ", %%mm7 \n\t"\
  448. "psrad $" #shift ", %%mm4 \n\t"\
  449. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  450. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  451. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  452. "psrad $" #shift ", %%mm1 \n\t"\
  453. "psrad $" #shift ", %%mm2 \n\t"\
  454. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  455. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  456. "movq %%mm7, " #dst " \n\t"\
  457. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  458. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  459. "movq %%mm2, 24+" #dst " \n\t"\
  460. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  461. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  462. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  463. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  464. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  465. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  466. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  467. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  468. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  469. "psrad $" #shift ", %%mm2 \n\t"\
  470. "psrad $" #shift ", %%mm0 \n\t"\
  471. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  472. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  473. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  474. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  475. "psrad $" #shift ", %%mm6 \n\t"\
  476. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  477. "movq %%mm2, 8+" #dst " \n\t"\
  478. "psrad $" #shift ", %%mm4 \n\t"\
  479. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  480. "movq %%mm4, 16+" #dst " \n\t"\
  481. #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  482. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  483. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  484. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  485. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  486. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  487. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  488. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  489. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  490. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  491. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  492. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  493. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  494. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  495. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  496. #rounder ", %%mm4 \n\t"\
  497. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  498. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  499. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  500. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  501. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  502. #rounder ", %%mm0 \n\t"\
  503. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  504. "paddd %%mm0, %%mm0 \n\t" \
  505. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  506. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  507. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  508. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  509. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  510. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  511. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  512. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  513. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  514. "psrad $" #shift ", %%mm7 \n\t"\
  515. "psrad $" #shift ", %%mm4 \n\t"\
  516. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  517. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  518. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  519. "psrad $" #shift ", %%mm1 \n\t"\
  520. "psrad $" #shift ", %%mm2 \n\t"\
  521. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  522. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  523. "movq %%mm7, " #dst " \n\t"\
  524. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  525. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  526. "movq %%mm2, 24+" #dst " \n\t"\
  527. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  528. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  529. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  530. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  531. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  532. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  533. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  534. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  535. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  536. "psrad $" #shift ", %%mm2 \n\t"\
  537. "psrad $" #shift ", %%mm0 \n\t"\
  538. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  539. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  540. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  541. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  542. "psrad $" #shift ", %%mm6 \n\t"\
  543. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  544. "movq %%mm2, 8+" #dst " \n\t"\
  545. "psrad $" #shift ", %%mm4 \n\t"\
  546. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  547. "movq %%mm4, 16+" #dst " \n\t"\
  548. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  549. DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
  550. Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
  551. Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
  552. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
  553. #undef IDCT
  554. #define IDCT(src0, src4, src1, src5, dst, shift) \
  555. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  556. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  557. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  558. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  559. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  560. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  561. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  562. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  563. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  564. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  565. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  566. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  567. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  568. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  569. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  570. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  571. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  572. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  573. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  574. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  575. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  576. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  577. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  578. "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
  579. "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
  580. "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  581. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  582. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  583. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  584. "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
  585. "psrad $" #shift ", %%mm7 \n\t"\
  586. "psrad $" #shift ", %%mm4 \n\t"\
  587. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  588. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  589. "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  590. "psrad $" #shift ", %%mm0 \n\t"\
  591. "psrad $" #shift ", %%mm2 \n\t"\
  592. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  593. "movd %%mm7, " #dst " \n\t"\
  594. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  595. "movd %%mm0, 16+" #dst " \n\t"\
  596. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  597. "movd %%mm2, 96+" #dst " \n\t"\
  598. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  599. "movd %%mm4, 112+" #dst " \n\t"\
  600. "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
  601. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  602. "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  603. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  604. "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  605. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  606. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  607. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  608. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  609. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  610. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  611. "psrad $" #shift ", %%mm2 \n\t"\
  612. "psrad $" #shift ", %%mm5 \n\t"\
  613. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  614. "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
  615. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  616. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  617. "psrad $" #shift ", %%mm6 \n\t"\
  618. "psrad $" #shift ", %%mm4 \n\t"\
  619. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  620. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  621. "movd %%mm2, 32+" #dst " \n\t"\
  622. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  623. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  624. "movd %%mm6, 48+" #dst " \n\t"\
  625. "movd %%mm4, 64+" #dst " \n\t"\
  626. "movd %%mm5, 80+" #dst " \n\t"
  627. //IDCT( src0, src4, src1, src5, dst, shift)
  628. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  629. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  630. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  631. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  632. "jmp 9f \n\t"
  633. "# .p2align 4 \n\t"\
  634. "4: \n\t"
  635. Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
  636. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
  637. #undef IDCT
  638. #define IDCT(src0, src4, src1, src5, dst, shift) \
  639. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  640. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  641. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  642. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  643. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  644. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  645. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  646. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  647. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  648. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  649. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  650. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  651. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  652. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  653. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  654. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  655. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  656. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  657. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  658. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  659. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  660. "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  661. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  662. "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  663. "psrad $" #shift ", %%mm1 \n\t"\
  664. "psrad $" #shift ", %%mm4 \n\t"\
  665. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  666. "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  667. "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  668. "psrad $" #shift ", %%mm0 \n\t"\
  669. "psrad $" #shift ", %%mm2 \n\t"\
  670. "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  671. "movd %%mm1, " #dst " \n\t"\
  672. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  673. "movd %%mm0, 16+" #dst " \n\t"\
  674. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  675. "movd %%mm2, 96+" #dst " \n\t"\
  676. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  677. "movd %%mm4, 112+" #dst " \n\t"\
  678. "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
  679. "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  680. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  681. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  682. "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  683. "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  684. "psrad $" #shift ", %%mm2 \n\t"\
  685. "psrad $" #shift ", %%mm5 \n\t"\
  686. "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
  687. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  688. "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
  689. "psrad $" #shift ", %%mm6 \n\t"\
  690. "psrad $" #shift ", %%mm1 \n\t"\
  691. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  692. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  693. "movd %%mm2, 32+" #dst " \n\t"\
  694. "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
  695. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  696. "movd %%mm6, 48+" #dst " \n\t"\
  697. "movd %%mm1, 64+" #dst " \n\t"\
  698. "movd %%mm5, 80+" #dst " \n\t"
  699. //IDCT( src0, src4, src1, src5, dst, shift)
  700. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  701. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  702. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  703. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  704. "jmp 9f \n\t"
  705. "# .p2align 4 \n\t"\
  706. "6: \n\t"
  707. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
  708. #undef IDCT
  709. #define IDCT(src0, src4, src1, src5, dst, shift) \
  710. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  711. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  712. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  713. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  714. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  715. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  716. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  717. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  718. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  719. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  720. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  721. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  722. "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  723. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  724. "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  725. "psrad $" #shift ", %%mm1 \n\t"\
  726. "psrad $" #shift ", %%mm4 \n\t"\
  727. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  728. "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  729. "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  730. "psrad $" #shift ", %%mm0 \n\t"\
  731. "psrad $" #shift ", %%mm2 \n\t"\
  732. "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  733. "movd %%mm1, " #dst " \n\t"\
  734. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  735. "movd %%mm0, 16+" #dst " \n\t"\
  736. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  737. "movd %%mm2, 96+" #dst " \n\t"\
  738. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  739. "movd %%mm4, 112+" #dst " \n\t"\
  740. "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
  741. "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  742. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  743. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  744. "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  745. "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  746. "psrad $" #shift ", %%mm2 \n\t"\
  747. "psrad $" #shift ", %%mm5 \n\t"\
  748. "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
  749. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  750. "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
  751. "psrad $" #shift ", %%mm6 \n\t"\
  752. "psrad $" #shift ", %%mm1 \n\t"\
  753. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  754. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  755. "movd %%mm2, 32+" #dst " \n\t"\
  756. "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
  757. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  758. "movd %%mm6, 48+" #dst " \n\t"\
  759. "movd %%mm1, 64+" #dst " \n\t"\
  760. "movd %%mm5, 80+" #dst " \n\t"
  761. //IDCT( src0, src4, src1, src5, dst, shift)
  762. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  763. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  764. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  765. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  766. "jmp 9f \n\t"
  767. "# .p2align 4 \n\t"\
  768. "2: \n\t"
  769. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
  770. #undef IDCT
  771. #define IDCT(src0, src4, src1, src5, dst, shift) \
  772. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  773. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  774. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  775. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  776. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  777. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  778. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  779. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  780. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  781. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  782. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  783. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  784. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  785. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  786. "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
  787. "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
  788. "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  789. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  790. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  791. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  792. "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
  793. "psrad $" #shift ", %%mm7 \n\t"\
  794. "psrad $" #shift ", %%mm4 \n\t"\
  795. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  796. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  797. "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  798. "psrad $" #shift ", %%mm0 \n\t"\
  799. "psrad $" #shift ", %%mm2 \n\t"\
  800. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  801. "movd %%mm7, " #dst " \n\t"\
  802. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  803. "movd %%mm0, 16+" #dst " \n\t"\
  804. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  805. "movd %%mm2, 96+" #dst " \n\t"\
  806. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  807. "movd %%mm4, 112+" #dst " \n\t"\
  808. "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
  809. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  810. "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  811. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  812. "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  813. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  814. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  815. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  816. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  817. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  818. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  819. "psrad $" #shift ", %%mm2 \n\t"\
  820. "psrad $" #shift ", %%mm5 \n\t"\
  821. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  822. "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
  823. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  824. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  825. "psrad $" #shift ", %%mm6 \n\t"\
  826. "psrad $" #shift ", %%mm4 \n\t"\
  827. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  828. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  829. "movd %%mm2, 32+" #dst " \n\t"\
  830. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  831. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  832. "movd %%mm6, 48+" #dst " \n\t"\
  833. "movd %%mm4, 64+" #dst " \n\t"\
  834. "movd %%mm5, 80+" #dst " \n\t"
  835. //IDCT( src0, src4, src1, src5, dst, shift)
  836. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  837. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  838. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  839. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  840. "jmp 9f \n\t"
  841. "# .p2align 4 \n\t"\
  842. "3: \n\t"
  843. #undef IDCT
  844. #define IDCT(src0, src4, src1, src5, dst, shift) \
  845. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  846. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  847. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  848. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  849. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  850. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  851. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  852. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  853. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  854. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  855. "movq 64(%2), %%mm3 \n\t"\
  856. "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  857. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  858. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  859. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  860. "psrad $" #shift ", %%mm7 \n\t"\
  861. "psrad $" #shift ", %%mm4 \n\t"\
  862. "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
  863. "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  864. "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
  865. "psrad $" #shift ", %%mm0 \n\t"\
  866. "psrad $" #shift ", %%mm1 \n\t"\
  867. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  868. "movd %%mm7, " #dst " \n\t"\
  869. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  870. "movd %%mm0, 16+" #dst " \n\t"\
  871. "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
  872. "movd %%mm1, 96+" #dst " \n\t"\
  873. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  874. "movd %%mm4, 112+" #dst " \n\t"\
  875. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  876. "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  877. "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  878. "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
  879. "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
  880. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  881. "psrad $" #shift ", %%mm1 \n\t"\
  882. "psrad $" #shift ", %%mm5 \n\t"\
  883. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  884. "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  885. "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  886. "psrad $" #shift ", %%mm6 \n\t"\
  887. "psrad $" #shift ", %%mm4 \n\t"\
  888. "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
  889. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  890. "movd %%mm1, 32+" #dst " \n\t"\
  891. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  892. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  893. "movd %%mm6, 48+" #dst " \n\t"\
  894. "movd %%mm4, 64+" #dst " \n\t"\
  895. "movd %%mm5, 80+" #dst " \n\t"
  896. //IDCT( src0, src4, src1, src5, dst, shift)
  897. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  898. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  899. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  900. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  901. "jmp 9f \n\t"
  902. "# .p2align 4 \n\t"\
  903. "5: \n\t"
  904. #undef IDCT
  905. #define IDCT(src0, src4, src1, src5, dst, shift) \
  906. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  907. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  908. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  909. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  910. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  911. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  912. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  913. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  914. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  915. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  916. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  917. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  918. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  919. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  920. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  921. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  922. "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
  923. "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
  924. "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
  925. "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  926. "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
  927. "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  928. "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
  929. "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  930. "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  931. "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
  932. "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
  933. "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
  934. "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
  935. "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
  936. "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
  937. "psrad $" #shift ", %%mm4 \n\t"\
  938. "psrad $" #shift ", %%mm7 \n\t"\
  939. "psrad $" #shift ", %%mm3 \n\t"\
  940. "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
  941. "movq %%mm4, " #dst " \n\t"\
  942. "psrad $" #shift ", %%mm0 \n\t"\
  943. "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
  944. "movq %%mm0, 16+" #dst " \n\t"\
  945. "movq %%mm0, 96+" #dst " \n\t"\
  946. "movq %%mm4, 112+" #dst " \n\t"\
  947. "psrad $" #shift ", %%mm5 \n\t"\
  948. "psrad $" #shift ", %%mm6 \n\t"\
  949. "psrad $" #shift ", %%mm2 \n\t"\
  950. "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  951. "movq %%mm5, 32+" #dst " \n\t"\
  952. "psrad $" #shift ", %%mm1 \n\t"\
  953. "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  954. "movq %%mm6, 48+" #dst " \n\t"\
  955. "movq %%mm6, 64+" #dst " \n\t"\
  956. "movq %%mm5, 80+" #dst " \n\t"
  957. //IDCT( src0, src4, src1, src5, dst, shift)
  958. IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  959. //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  960. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  961. //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  962. "jmp 9f \n\t"
  963. "# .p2align 4 \n\t"\
  964. "1: \n\t"
  965. #undef IDCT
  966. #define IDCT(src0, src4, src1, src5, dst, shift) \
  967. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  968. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  969. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  970. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  971. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  972. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  973. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  974. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  975. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  976. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  977. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  978. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  979. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  980. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  981. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  982. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  983. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  984. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  985. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  986. "movq 64(%2), %%mm1 \n\t"\
  987. "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  988. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  989. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  990. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  991. "psrad $" #shift ", %%mm7 \n\t"\
  992. "psrad $" #shift ", %%mm4 \n\t"\
  993. "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
  994. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  995. "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
  996. "psrad $" #shift ", %%mm0 \n\t"\
  997. "psrad $" #shift ", %%mm3 \n\t"\
  998. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  999. "movd %%mm7, " #dst " \n\t"\
  1000. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  1001. "movd %%mm0, 16+" #dst " \n\t"\
  1002. "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
  1003. "movd %%mm3, 96+" #dst " \n\t"\
  1004. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  1005. "movd %%mm4, 112+" #dst " \n\t"\
  1006. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  1007. "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  1008. "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  1009. "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
  1010. "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
  1011. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  1012. "psrad $" #shift ", %%mm3 \n\t"\
  1013. "psrad $" #shift ", %%mm5 \n\t"\
  1014. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  1015. "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  1016. "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  1017. "psrad $" #shift ", %%mm6 \n\t"\
  1018. "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
  1019. "movd %%mm3, 32+" #dst " \n\t"\
  1020. "psrad $" #shift ", %%mm4 \n\t"\
  1021. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  1022. "movd %%mm6, 48+" #dst " \n\t"\
  1023. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  1024. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  1025. "movd %%mm4, 64+" #dst " \n\t"\
  1026. "movd %%mm5, 80+" #dst " \n\t"
  1027. //IDCT( src0, src4, src1, src5, dst, shift)
  1028. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  1029. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  1030. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  1031. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  1032. "jmp 9f \n\t"
  1033. "# .p2align 4 \n\t"
  1034. "7: \n\t"
  1035. #undef IDCT
  1036. #define IDCT(src0, src4, src1, src5, dst, shift) \
  1037. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  1038. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  1039. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1040. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  1041. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1042. "psrad $" #shift ", %%mm4 \n\t"\
  1043. "psrad $" #shift ", %%mm0 \n\t"\
  1044. "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
  1045. "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
  1046. "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1047. "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
  1048. "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1049. "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
  1050. "psrad $" #shift ", %%mm1 \n\t"\
  1051. "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
  1052. "movq %%mm4, " #dst " \n\t"\
  1053. "psrad $" #shift ", %%mm2 \n\t"\
  1054. "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
  1055. "movq %%mm0, 16+" #dst " \n\t"\
  1056. "movq %%mm0, 96+" #dst " \n\t"\
  1057. "movq %%mm4, 112+" #dst " \n\t"\
  1058. "movq %%mm0, 32+" #dst " \n\t"\
  1059. "movq %%mm4, 48+" #dst " \n\t"\
  1060. "movq %%mm4, 64+" #dst " \n\t"\
  1061. "movq %%mm0, 80+" #dst " \n\t"
  1062. //IDCT( src0, src4, src1, src5, dst, shift)
  1063. IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
  1064. //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
  1065. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
  1066. //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
  1067. #endif
  1068. /*
  1069. Input
  1070. 00 40 04 44 20 60 24 64
  1071. 10 30 14 34 50 70 54 74
  1072. 01 41 03 43 21 61 23 63
  1073. 11 31 13 33 51 71 53 73
  1074. 02 42 06 46 22 62 26 66
  1075. 12 32 16 36 52 72 56 76
  1076. 05 45 07 47 25 65 27 67
  1077. 15 35 17 37 55 75 57 77
  1078. Temp
  1079. 00 04 10 14 20 24 30 34
  1080. 40 44 50 54 60 64 70 74
  1081. 01 03 11 13 21 23 31 33
  1082. 41 43 51 53 61 63 71 73
  1083. 02 06 12 16 22 26 32 36
  1084. 42 46 52 56 62 66 72 76
  1085. 05 07 15 17 25 27 35 37
  1086. 45 47 55 57 65 67 75 77
  1087. */
  1088. "9: \n\t"
  1089. :: "r" (block), "r" (temp), "r" (coeffs)
  1090. : "%eax"
  1091. );
  1092. }
/**
 * Public entry point: MMX inverse DCT of one 8x8 block.
 *
 * @param block  pointer to 64 int16_t DCT coefficients; transformed
 *               in place into spatial-domain samples by idct().
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
//FIXME: merge the put/add pixel stores into the IDCT itself so the block
//       is not traversed a second time after the transform.
/**
 * Inverse-transform one 8x8 block and store it into a picture.
 *
 * Runs the in-place MMX IDCT on @p block, then writes the result to
 * @p dest clamped to the 0..255 unsigned 8-bit pixel range.
 *
 * @param dest       destination pixel plane
 * @param line_size  byte stride between rows of @p dest
 * @param block      64 int16_t coefficients; clobbered by the transform
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
{
    idct(block);
    ff_put_pixels_clamped(block, dest, line_size);
}
/**
 * Inverse-transform one 8x8 block and add it to a picture.
 *
 * Runs the in-place MMX IDCT on @p block, then adds the result to the
 * existing pixels in @p dest (motion-compensation residual path),
 * clamping each sum to the 0..255 unsigned 8-bit pixel range.
 *
 * @param dest       destination pixel plane (read-modify-write)
 * @param line_size  byte stride between rows of @p dest
 * @param block      64 int16_t coefficients; clobbered by the transform
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
{
    idct(block);
    ff_add_pixels_clamped(block, dest, line_size);
}
  1108. #endif /* HAVE_INLINE_ASM */