You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1298 lines
52KB

  1. /*
  2. Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  14. */
  15. #include <inttypes.h>
  16. #include "../dsputil.h"
  17. #include "../mangle.h"
/*
exact values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7,
from which the C0..C7 constants below are rounded:
23170.475006
22725.260826
21406.727617
19265.545870
16384.000000
12872.826198
8866.956905
4520.335430
*/
/* 14-bit fixed-point DCT basis constants:
   Ci = round(cos(i*M_PI/16)*sqrt(2)*(1<<14)), i = 0..7. */
#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#if 0
#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#else
/* NOTE: C4 is deliberately rounded DOWN (16383 instead of 16384);
   the "#else" branch is the active one. */
#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
#endif
#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

/* right-shifts that undo the fixed-point scaling after each pass */
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6

/* word mask: zeroes words 0 and 2 of an MMX register so the DC
   coefficient is excluded from the "is everything else zero?" test
   (pand/por sequence) in the asm below */
static const uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
/* constant added (between "pslld $16" and "psrad $13") in the DC-only
   shortcut of the conditional row IDCT */
static const uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
/* scratch buffer holding the row-pass result between the two passes */
static int16_t __attribute__((aligned(8))) temp[64];
/* rounder/coefficient table, addressed from the asm by byte offset off
   (%2): offsets 0 and 8 hold the row rounders, 16.. hold the packed
   cosine pairs in the order the IDCT macros consume them
   (C4|C4, C4|-C4, C2|C6, C6|-C2, ...) */
static int16_t __attribute__((aligned(8))) coeffs[]= {
        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
//      1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
//      0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
//      the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
//      0, 0, 0, 0,
//      0, 0, 0, 0,
        C4,  C4,  C4,  C4,
        C4, -C4,  C4, -C4,
        C2,  C6,  C2,  C6,
        C6, -C2,  C6, -C2,
        C1,  C3,  C1,  C3,
        C5,  C7,  C5,  C7,
        C3, -C7,  C3, -C7,
       -C1, -C5, -C1, -C5,
        C5, -C1,  C5, -C1,
        C7,  C3,  C7,  C3,
        C7, -C5,  C7, -C5,
        C3, -C1,  C3, -C1
};
  66. #if 0
/* References the file-scope constants so the compiler does not warn
   about them being unused while the code that uses them is disabled.
   NOTE(review): this whole region sits inside "#if 0", so the function
   itself is never compiled. */
static void unused_var_killer(){
        int a= wm1010 + d40000;
        temp[0]=a;
}
  71. static void inline idctCol (int16_t * col, int16_t *input)
  72. {
  73. #undef C0
  74. #undef C1
  75. #undef C2
  76. #undef C3
  77. #undef C4
  78. #undef C5
  79. #undef C6
  80. #undef C7
  81. int a0, a1, a2, a3, b0, b1, b2, b3;
  82. const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  83. const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  84. const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  85. const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  86. const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  87. const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  88. const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  89. const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  90. /*
  91. if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
  92. col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
  93. col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
  94. return;
  95. }*/
  96. col[8*0] = input[8*0 + 0];
  97. col[8*1] = input[8*2 + 0];
  98. col[8*2] = input[8*0 + 1];
  99. col[8*3] = input[8*2 + 1];
  100. col[8*4] = input[8*4 + 0];
  101. col[8*5] = input[8*6 + 0];
  102. col[8*6] = input[8*4 + 1];
  103. col[8*7] = input[8*6 + 1];
  104. a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
  105. a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
  106. a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
  107. a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
  108. b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
  109. b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
  110. b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
  111. b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
  112. col[8*0] = (a0 + b0) >> COL_SHIFT;
  113. col[8*1] = (a1 + b1) >> COL_SHIFT;
  114. col[8*2] = (a2 + b2) >> COL_SHIFT;
  115. col[8*3] = (a3 + b3) >> COL_SHIFT;
  116. col[8*4] = (a3 - b3) >> COL_SHIFT;
  117. col[8*5] = (a2 - b2) >> COL_SHIFT;
  118. col[8*6] = (a1 - b1) >> COL_SHIFT;
  119. col[8*7] = (a0 - b0) >> COL_SHIFT;
  120. }
  121. static void inline idctRow (int16_t * output, int16_t * input)
  122. {
  123. int16_t row[8];
  124. int a0, a1, a2, a3, b0, b1, b2, b3;
  125. const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  126. const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  127. const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  128. const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  129. const int C4 = 16383; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  130. const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  131. const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  132. const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  133. row[0] = input[0];
  134. row[2] = input[1];
  135. row[4] = input[4];
  136. row[6] = input[5];
  137. row[1] = input[8];
  138. row[3] = input[9];
  139. row[5] = input[12];
  140. row[7] = input[13];
  141. if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
  142. row[0] = row[1] = row[2] = row[3] = row[4] =
  143. row[5] = row[6] = row[7] = row[0]<<3;
  144. output[0] = row[0];
  145. output[2] = row[1];
  146. output[4] = row[2];
  147. output[6] = row[3];
  148. output[8] = row[4];
  149. output[10] = row[5];
  150. output[12] = row[6];
  151. output[14] = row[7];
  152. return;
  153. }
  154. a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
  155. a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
  156. a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
  157. a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
  158. b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
  159. b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
  160. b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
  161. b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
  162. row[0] = (a0 + b0) >> ROW_SHIFT;
  163. row[1] = (a1 + b1) >> ROW_SHIFT;
  164. row[2] = (a2 + b2) >> ROW_SHIFT;
  165. row[3] = (a3 + b3) >> ROW_SHIFT;
  166. row[4] = (a3 - b3) >> ROW_SHIFT;
  167. row[5] = (a2 - b2) >> ROW_SHIFT;
  168. row[6] = (a1 - b1) >> ROW_SHIFT;
  169. row[7] = (a0 - b0) >> ROW_SHIFT;
  170. output[0] = row[0];
  171. output[2] = row[1];
  172. output[4] = row[2];
  173. output[6] = row[3];
  174. output[8] = row[4];
  175. output[10] = row[5];
  176. output[12] = row[6];
  177. output[14] = row[7];
  178. }
  179. #endif
  180. static inline void idct(int16_t *block)
  181. {
  182. asm volatile(
  183. #if 0 //Alternative, simpler variant
  184. #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  185. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  186. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  187. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  188. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  189. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  190. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  191. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  192. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  193. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  194. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  195. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  196. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  197. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  198. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  199. #rounder ", %%mm4 \n\t"\
  200. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  201. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  202. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  203. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  204. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  205. #rounder ", %%mm0 \n\t"\
  206. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  207. "paddd %%mm0, %%mm0 \n\t" \
  208. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  209. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  210. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  211. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  212. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  213. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  214. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  215. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  216. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  217. "psrad $" #shift ", %%mm7 \n\t"\
  218. "psrad $" #shift ", %%mm4 \n\t"\
  219. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  220. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  221. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  222. "psrad $" #shift ", %%mm1 \n\t"\
  223. "psrad $" #shift ", %%mm2 \n\t"\
  224. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  225. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  226. "movq %%mm7, " #dst " \n\t"\
  227. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  228. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  229. "movq %%mm2, 24+" #dst " \n\t"\
  230. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  231. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  232. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  233. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  234. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  235. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  236. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  237. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  238. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  239. "psrad $" #shift ", %%mm2 \n\t"\
  240. "psrad $" #shift ", %%mm0 \n\t"\
  241. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  242. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  243. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  244. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  245. "psrad $" #shift ", %%mm6 \n\t"\
  246. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  247. "movq %%mm2, 8+" #dst " \n\t"\
  248. "psrad $" #shift ", %%mm4 \n\t"\
  249. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  250. "movq %%mm4, 16+" #dst " \n\t"\
  251. #define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  252. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  253. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  254. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  255. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  256. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  257. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  258. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  259. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  260. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  261. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  262. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  263. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  264. #rounder ", %%mm4 \n\t"\
  265. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  266. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  267. #rounder ", %%mm0 \n\t"\
  268. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  269. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  270. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  271. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  272. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  273. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  274. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  275. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  276. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  277. "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
  278. "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
  279. "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  280. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  281. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  282. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  283. "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
  284. "psrad $" #shift ", %%mm7 \n\t"\
  285. "psrad $" #shift ", %%mm4 \n\t"\
  286. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  287. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  288. "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  289. "psrad $" #shift ", %%mm0 \n\t"\
  290. "psrad $" #shift ", %%mm2 \n\t"\
  291. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  292. "movd %%mm7, " #dst " \n\t"\
  293. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  294. "movd %%mm0, 16+" #dst " \n\t"\
  295. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  296. "movd %%mm2, 96+" #dst " \n\t"\
  297. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  298. "movd %%mm4, 112+" #dst " \n\t"\
  299. "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
  300. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  301. "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  302. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  303. "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  304. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  305. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  306. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  307. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  308. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  309. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  310. "psrad $" #shift ", %%mm2 \n\t"\
  311. "psrad $" #shift ", %%mm5 \n\t"\
  312. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  313. "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
  314. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  315. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  316. "psrad $" #shift ", %%mm6 \n\t"\
  317. "psrad $" #shift ", %%mm4 \n\t"\
  318. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  319. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  320. "movd %%mm2, 32+" #dst " \n\t"\
  321. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  322. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  323. "movd %%mm6, 48+" #dst " \n\t"\
  324. "movd %%mm4, 64+" #dst " \n\t"\
  325. "movd %%mm5, 80+" #dst " \n\t"\
  326. #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  327. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  328. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  329. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  330. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  331. "movq "MANGLE(wm1010)", %%mm4 \n\t"\
  332. "pand %%mm0, %%mm4 \n\t"\
  333. "por %%mm1, %%mm4 \n\t"\
  334. "por %%mm2, %%mm4 \n\t"\
  335. "por %%mm3, %%mm4 \n\t"\
  336. "packssdw %%mm4,%%mm4 \n\t"\
  337. "movd %%mm4, %%eax \n\t"\
  338. "orl %%eax, %%eax \n\t"\
  339. "jz 1f \n\t"\
  340. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  341. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  342. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  343. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  344. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  345. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  346. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  347. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  348. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  349. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  350. #rounder ", %%mm4 \n\t"\
  351. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  352. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  353. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  354. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  355. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  356. #rounder ", %%mm0 \n\t"\
  357. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  358. "paddd %%mm0, %%mm0 \n\t" \
  359. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  360. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  361. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  362. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  363. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  364. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  365. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  366. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  367. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  368. "psrad $" #shift ", %%mm7 \n\t"\
  369. "psrad $" #shift ", %%mm4 \n\t"\
  370. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  371. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  372. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  373. "psrad $" #shift ", %%mm1 \n\t"\
  374. "psrad $" #shift ", %%mm2 \n\t"\
  375. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  376. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  377. "movq %%mm7, " #dst " \n\t"\
  378. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  379. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  380. "movq %%mm2, 24+" #dst " \n\t"\
  381. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  382. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  383. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  384. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  385. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  386. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  387. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  388. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  389. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  390. "psrad $" #shift ", %%mm2 \n\t"\
  391. "psrad $" #shift ", %%mm0 \n\t"\
  392. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  393. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  394. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  395. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  396. "psrad $" #shift ", %%mm6 \n\t"\
  397. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  398. "movq %%mm2, 8+" #dst " \n\t"\
  399. "psrad $" #shift ", %%mm4 \n\t"\
  400. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  401. "movq %%mm4, 16+" #dst " \n\t"\
  402. "jmp 2f \n\t"\
  403. "1: \n\t"\
  404. "pslld $16, %%mm0 \n\t"\
  405. "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
  406. "psrad $13, %%mm0 \n\t"\
  407. "packssdw %%mm0, %%mm0 \n\t"\
  408. "movq %%mm0, " #dst " \n\t"\
  409. "movq %%mm0, 8+" #dst " \n\t"\
  410. "movq %%mm0, 16+" #dst " \n\t"\
  411. "movq %%mm0, 24+" #dst " \n\t"\
  412. "2: \n\t"
  413. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  414. ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
  415. /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
  416. ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
  417. ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
  418. DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
  419. DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
  420. DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
  421. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  422. COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  423. COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  424. COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  425. COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  426. #else
  427. #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  428. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  429. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  430. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  431. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  432. "movq "MANGLE(wm1010)", %%mm4 \n\t"\
  433. "pand %%mm0, %%mm4 \n\t"\
  434. "por %%mm1, %%mm4 \n\t"\
  435. "por %%mm2, %%mm4 \n\t"\
  436. "por %%mm3, %%mm4 \n\t"\
  437. "packssdw %%mm4,%%mm4 \n\t"\
  438. "movd %%mm4, %%eax \n\t"\
  439. "orl %%eax, %%eax \n\t"\
  440. "jz 1f \n\t"\
  441. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  442. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  443. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  444. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  445. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  446. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  447. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  448. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  449. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  450. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  451. #rounder ", %%mm4 \n\t"\
  452. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  453. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  454. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  455. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  456. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  457. #rounder ", %%mm0 \n\t"\
  458. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  459. "paddd %%mm0, %%mm0 \n\t" \
  460. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  461. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  462. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  463. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  464. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  465. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  466. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  467. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  468. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  469. "psrad $" #shift ", %%mm7 \n\t"\
  470. "psrad $" #shift ", %%mm4 \n\t"\
  471. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  472. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  473. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  474. "psrad $" #shift ", %%mm1 \n\t"\
  475. "psrad $" #shift ", %%mm2 \n\t"\
  476. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  477. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  478. "movq %%mm7, " #dst " \n\t"\
  479. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  480. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  481. "movq %%mm2, 24+" #dst " \n\t"\
  482. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  483. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  484. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  485. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  486. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  487. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  488. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  489. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  490. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  491. "psrad $" #shift ", %%mm2 \n\t"\
  492. "psrad $" #shift ", %%mm0 \n\t"\
  493. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  494. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  495. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  496. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  497. "psrad $" #shift ", %%mm6 \n\t"\
  498. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  499. "movq %%mm2, 8+" #dst " \n\t"\
  500. "psrad $" #shift ", %%mm4 \n\t"\
  501. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  502. "movq %%mm4, 16+" #dst " \n\t"\
  503. "jmp 2f \n\t"\
  504. "1: \n\t"\
  505. "pslld $16, %%mm0 \n\t"\
  506. "paddd "MANGLE(d40000)", %%mm0 \n\t"\
  507. "psrad $13, %%mm0 \n\t"\
  508. "packssdw %%mm0, %%mm0 \n\t"\
  509. "movq %%mm0, " #dst " \n\t"\
  510. "movq %%mm0, 8+" #dst " \n\t"\
  511. "movq %%mm0, 16+" #dst " \n\t"\
  512. "movq %%mm0, 24+" #dst " \n\t"\
  513. "2: \n\t"
  514. #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
  515. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  516. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  517. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  518. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  519. "movq %%mm0, %%mm4 \n\t"\
  520. "por %%mm1, %%mm4 \n\t"\
  521. "por %%mm2, %%mm4 \n\t"\
  522. "por %%mm3, %%mm4 \n\t"\
  523. "packssdw %%mm4,%%mm4 \n\t"\
  524. "movd %%mm4, %%eax \n\t"\
  525. "orl %%eax, %%eax \n\t"\
  526. "jz " #bt " \n\t"\
  527. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  528. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  529. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  530. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  531. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  532. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  533. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  534. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  535. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  536. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  537. #rounder ", %%mm4 \n\t"\
  538. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  539. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  540. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  541. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  542. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  543. #rounder ", %%mm0 \n\t"\
  544. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  545. "paddd %%mm0, %%mm0 \n\t" \
  546. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  547. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  548. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  549. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  550. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  551. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  552. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  553. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  554. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  555. "psrad $" #shift ", %%mm7 \n\t"\
  556. "psrad $" #shift ", %%mm4 \n\t"\
  557. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  558. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  559. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  560. "psrad $" #shift ", %%mm1 \n\t"\
  561. "psrad $" #shift ", %%mm2 \n\t"\
  562. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  563. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  564. "movq %%mm7, " #dst " \n\t"\
  565. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  566. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  567. "movq %%mm2, 24+" #dst " \n\t"\
  568. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  569. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  570. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  571. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  572. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  573. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  574. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  575. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  576. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  577. "psrad $" #shift ", %%mm2 \n\t"\
  578. "psrad $" #shift ", %%mm0 \n\t"\
  579. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  580. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  581. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  582. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  583. "psrad $" #shift ", %%mm6 \n\t"\
  584. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  585. "movq %%mm2, 8+" #dst " \n\t"\
  586. "psrad $" #shift ", %%mm4 \n\t"\
  587. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  588. "movq %%mm4, 16+" #dst " \n\t"\
  589. #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  590. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  591. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  592. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  593. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  594. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  595. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  596. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  597. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  598. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  599. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  600. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  601. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  602. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  603. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  604. #rounder ", %%mm4 \n\t"\
  605. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  606. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  607. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  608. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  609. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  610. #rounder ", %%mm0 \n\t"\
  611. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  612. "paddd %%mm0, %%mm0 \n\t" \
  613. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  614. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  615. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  616. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  617. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  618. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  619. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  620. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  621. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  622. "psrad $" #shift ", %%mm7 \n\t"\
  623. "psrad $" #shift ", %%mm4 \n\t"\
  624. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  625. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  626. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  627. "psrad $" #shift ", %%mm1 \n\t"\
  628. "psrad $" #shift ", %%mm2 \n\t"\
  629. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  630. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  631. "movq %%mm7, " #dst " \n\t"\
  632. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  633. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  634. "movq %%mm2, 24+" #dst " \n\t"\
  635. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  636. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  637. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  638. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  639. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  640. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  641. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  642. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  643. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  644. "psrad $" #shift ", %%mm2 \n\t"\
  645. "psrad $" #shift ", %%mm0 \n\t"\
  646. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  647. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  648. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  649. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  650. "psrad $" #shift ", %%mm6 \n\t"\
  651. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  652. "movq %%mm2, 8+" #dst " \n\t"\
  653. "psrad $" #shift ", %%mm4 \n\t"\
  654. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  655. "movq %%mm4, 16+" #dst " \n\t"\
  656. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  657. DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
  658. Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
  659. Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
  660. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
  661. #undef IDCT
  662. #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  663. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  664. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  665. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  666. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  667. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  668. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  669. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  670. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  671. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  672. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  673. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  674. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  675. #rounder ", %%mm4 \n\t"\
  676. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  677. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  678. #rounder ", %%mm0 \n\t"\
  679. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  680. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  681. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  682. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  683. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  684. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  685. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  686. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  687. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  688. "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
  689. "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
  690. "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  691. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  692. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  693. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  694. "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
  695. "psrad $" #shift ", %%mm7 \n\t"\
  696. "psrad $" #shift ", %%mm4 \n\t"\
  697. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  698. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  699. "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  700. "psrad $" #shift ", %%mm0 \n\t"\
  701. "psrad $" #shift ", %%mm2 \n\t"\
  702. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  703. "movd %%mm7, " #dst " \n\t"\
  704. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  705. "movd %%mm0, 16+" #dst " \n\t"\
  706. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  707. "movd %%mm2, 96+" #dst " \n\t"\
  708. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  709. "movd %%mm4, 112+" #dst " \n\t"\
  710. "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
  711. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  712. "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  713. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  714. "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  715. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  716. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  717. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  718. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  719. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  720. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  721. "psrad $" #shift ", %%mm2 \n\t"\
  722. "psrad $" #shift ", %%mm5 \n\t"\
  723. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  724. "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
  725. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  726. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  727. "psrad $" #shift ", %%mm6 \n\t"\
  728. "psrad $" #shift ", %%mm4 \n\t"\
  729. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  730. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  731. "movd %%mm2, 32+" #dst " \n\t"\
  732. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  733. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  734. "movd %%mm6, 48+" #dst " \n\t"\
  735. "movd %%mm4, 64+" #dst " \n\t"\
  736. "movd %%mm5, 80+" #dst " \n\t"
  737. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  738. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  739. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  740. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  741. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  742. "jmp 9f \n\t"
  743. "#.balign 16 \n\t"\
  744. "4: \n\t"
  745. Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
  746. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
  747. #undef IDCT
  748. #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  749. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  750. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  751. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  752. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  753. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  754. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  755. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  756. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  757. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  758. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  759. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  760. #rounder ", %%mm4 \n\t"\
  761. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  762. #rounder ", %%mm0 \n\t"\
  763. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  764. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  765. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  766. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  767. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  768. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  769. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  770. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  771. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  772. "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  773. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  774. "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  775. "psrad $" #shift ", %%mm1 \n\t"\
  776. "psrad $" #shift ", %%mm4 \n\t"\
  777. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  778. "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  779. "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  780. "psrad $" #shift ", %%mm0 \n\t"\
  781. "psrad $" #shift ", %%mm2 \n\t"\
  782. "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  783. "movd %%mm1, " #dst " \n\t"\
  784. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  785. "movd %%mm0, 16+" #dst " \n\t"\
  786. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  787. "movd %%mm2, 96+" #dst " \n\t"\
  788. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  789. "movd %%mm4, 112+" #dst " \n\t"\
  790. "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
  791. "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  792. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  793. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  794. "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  795. "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  796. "psrad $" #shift ", %%mm2 \n\t"\
  797. "psrad $" #shift ", %%mm5 \n\t"\
  798. "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
  799. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  800. "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
  801. "psrad $" #shift ", %%mm6 \n\t"\
  802. "psrad $" #shift ", %%mm1 \n\t"\
  803. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  804. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  805. "movd %%mm2, 32+" #dst " \n\t"\
  806. "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
  807. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  808. "movd %%mm6, 48+" #dst " \n\t"\
  809. "movd %%mm1, 64+" #dst " \n\t"\
  810. "movd %%mm5, 80+" #dst " \n\t"
  811. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  812. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  813. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  814. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  815. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  816. "jmp 9f \n\t"
  817. "#.balign 16 \n\t"\
  818. "6: \n\t"
  819. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
  820. #undef IDCT
  821. #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  822. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  823. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  824. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  825. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  826. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  827. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  828. #rounder ", %%mm4 \n\t"\
  829. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  830. #rounder ", %%mm0 \n\t"\
  831. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  832. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  833. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  834. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  835. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  836. "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  837. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  838. "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  839. "psrad $" #shift ", %%mm1 \n\t"\
  840. "psrad $" #shift ", %%mm4 \n\t"\
  841. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  842. "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  843. "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  844. "psrad $" #shift ", %%mm0 \n\t"\
  845. "psrad $" #shift ", %%mm2 \n\t"\
  846. "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
  847. "movd %%mm1, " #dst " \n\t"\
  848. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  849. "movd %%mm0, 16+" #dst " \n\t"\
  850. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  851. "movd %%mm2, 96+" #dst " \n\t"\
  852. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  853. "movd %%mm4, 112+" #dst " \n\t"\
  854. "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
  855. "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  856. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  857. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  858. "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  859. "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  860. "psrad $" #shift ", %%mm2 \n\t"\
  861. "psrad $" #shift ", %%mm5 \n\t"\
  862. "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
  863. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  864. "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
  865. "psrad $" #shift ", %%mm6 \n\t"\
  866. "psrad $" #shift ", %%mm1 \n\t"\
  867. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  868. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  869. "movd %%mm2, 32+" #dst " \n\t"\
  870. "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
  871. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  872. "movd %%mm6, 48+" #dst " \n\t"\
  873. "movd %%mm1, 64+" #dst " \n\t"\
  874. "movd %%mm5, 80+" #dst " \n\t"
  875. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  876. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  877. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  878. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  879. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  880. "jmp 9f \n\t"
  881. "#.balign 16 \n\t"\
  882. "2: \n\t"
  883. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
  884. #undef IDCT
  885. #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  886. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  887. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  888. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  889. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  890. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  891. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  892. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  893. #rounder ", %%mm4 \n\t"\
  894. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  895. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  896. #rounder ", %%mm0 \n\t"\
  897. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  898. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  899. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  900. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  901. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  902. "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
  903. "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
  904. "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  905. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  906. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  907. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  908. "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
  909. "psrad $" #shift ", %%mm7 \n\t"\
  910. "psrad $" #shift ", %%mm4 \n\t"\
  911. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  912. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  913. "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  914. "psrad $" #shift ", %%mm0 \n\t"\
  915. "psrad $" #shift ", %%mm2 \n\t"\
  916. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  917. "movd %%mm7, " #dst " \n\t"\
  918. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  919. "movd %%mm0, 16+" #dst " \n\t"\
  920. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  921. "movd %%mm2, 96+" #dst " \n\t"\
  922. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  923. "movd %%mm4, 112+" #dst " \n\t"\
  924. "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
  925. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  926. "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  927. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  928. "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  929. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  930. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  931. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  932. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  933. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  934. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  935. "psrad $" #shift ", %%mm2 \n\t"\
  936. "psrad $" #shift ", %%mm5 \n\t"\
  937. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  938. "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
  939. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  940. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  941. "psrad $" #shift ", %%mm6 \n\t"\
  942. "psrad $" #shift ", %%mm4 \n\t"\
  943. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  944. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  945. "movd %%mm2, 32+" #dst " \n\t"\
  946. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  947. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  948. "movd %%mm6, 48+" #dst " \n\t"\
  949. "movd %%mm4, 64+" #dst " \n\t"\
  950. "movd %%mm5, 80+" #dst " \n\t"
  951. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  952. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  953. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  954. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  955. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  956. "jmp 9f \n\t"
  957. "#.balign 16 \n\t"\
  958. "3: \n\t"
  959. #undef IDCT
  960. #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  961. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  962. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  963. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  964. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  965. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  966. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  967. #rounder ", %%mm4 \n\t"\
  968. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  969. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  970. #rounder ", %%mm0 \n\t"\
  971. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  972. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  973. "movq 64(%2), %%mm3 \n\t"\
  974. "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  975. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  976. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  977. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  978. "psrad $" #shift ", %%mm7 \n\t"\
  979. "psrad $" #shift ", %%mm4 \n\t"\
  980. "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
  981. "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  982. "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
  983. "psrad $" #shift ", %%mm0 \n\t"\
  984. "psrad $" #shift ", %%mm1 \n\t"\
  985. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  986. "movd %%mm7, " #dst " \n\t"\
  987. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  988. "movd %%mm0, 16+" #dst " \n\t"\
  989. "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
  990. "movd %%mm1, 96+" #dst " \n\t"\
  991. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  992. "movd %%mm4, 112+" #dst " \n\t"\
  993. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  994. "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  995. "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  996. "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
  997. "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
  998. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  999. "psrad $" #shift ", %%mm1 \n\t"\
  1000. "psrad $" #shift ", %%mm5 \n\t"\
  1001. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  1002. "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  1003. "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  1004. "psrad $" #shift ", %%mm6 \n\t"\
  1005. "psrad $" #shift ", %%mm4 \n\t"\
  1006. "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
  1007. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  1008. "movd %%mm1, 32+" #dst " \n\t"\
  1009. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  1010. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  1011. "movd %%mm6, 48+" #dst " \n\t"\
  1012. "movd %%mm4, 64+" #dst " \n\t"\
  1013. "movd %%mm5, 80+" #dst " \n\t"
  1014. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  1015. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1016. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1017. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1018. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1019. "jmp 9f \n\t"
  1020. "#.balign 16 \n\t"\
  1021. "5: \n\t"
  1022. #undef IDCT
  1023. #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  1024. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  1025. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  1026. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  1027. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1028. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  1029. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1030. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  1031. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  1032. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  1033. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  1034. #rounder ", %%mm4 \n\t"\
  1035. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1036. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  1037. #rounder ", %%mm0 \n\t"\
  1038. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  1039. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1040. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  1041. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  1042. "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
  1043. "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
  1044. "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
  1045. "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1046. "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
  1047. "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1048. "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
  1049. "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  1050. "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  1051. #rounder ", %%mm1 \n\t"\
  1052. "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
  1053. "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
  1054. #rounder ", %%mm2 \n\t"\
  1055. "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
  1056. "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
  1057. "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
  1058. "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
  1059. "psrad $" #shift ", %%mm4 \n\t"\
  1060. "psrad $" #shift ", %%mm7 \n\t"\
  1061. "psrad $" #shift ", %%mm3 \n\t"\
  1062. "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
  1063. "movq %%mm4, " #dst " \n\t"\
  1064. "psrad $" #shift ", %%mm0 \n\t"\
  1065. "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
  1066. "movq %%mm0, 16+" #dst " \n\t"\
  1067. "movq %%mm0, 96+" #dst " \n\t"\
  1068. "movq %%mm4, 112+" #dst " \n\t"\
  1069. "psrad $" #shift ", %%mm5 \n\t"\
  1070. "psrad $" #shift ", %%mm6 \n\t"\
  1071. "psrad $" #shift ", %%mm2 \n\t"\
  1072. "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  1073. "movq %%mm5, 32+" #dst " \n\t"\
  1074. "psrad $" #shift ", %%mm1 \n\t"\
  1075. "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  1076. "movq %%mm6, 48+" #dst " \n\t"\
  1077. "movq %%mm6, 64+" #dst " \n\t"\
  1078. "movq %%mm5, 80+" #dst " \n\t"
  1079. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  1080. IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1081. //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1082. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1083. //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1084. "jmp 9f \n\t"
  1085. "#.balign 16 \n\t"\
  1086. "1: \n\t"
  1087. #undef IDCT
  1088. #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  1089. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  1090. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  1091. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  1092. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  1093. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1094. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  1095. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1096. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  1097. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  1098. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  1099. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  1100. #rounder ", %%mm4 \n\t"\
  1101. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1102. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  1103. #rounder ", %%mm0 \n\t"\
  1104. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  1105. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  1106. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  1107. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1108. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  1109. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  1110. "movq 64(%2), %%mm1 \n\t"\
  1111. "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  1112. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  1113. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  1114. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  1115. "psrad $" #shift ", %%mm7 \n\t"\
  1116. "psrad $" #shift ", %%mm4 \n\t"\
  1117. "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
  1118. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  1119. "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
  1120. "psrad $" #shift ", %%mm0 \n\t"\
  1121. "psrad $" #shift ", %%mm3 \n\t"\
  1122. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  1123. "movd %%mm7, " #dst " \n\t"\
  1124. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  1125. "movd %%mm0, 16+" #dst " \n\t"\
  1126. "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
  1127. "movd %%mm3, 96+" #dst " \n\t"\
  1128. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  1129. "movd %%mm4, 112+" #dst " \n\t"\
  1130. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  1131. "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  1132. "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  1133. "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
  1134. "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
  1135. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  1136. "psrad $" #shift ", %%mm3 \n\t"\
  1137. "psrad $" #shift ", %%mm5 \n\t"\
  1138. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  1139. "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  1140. "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  1141. "psrad $" #shift ", %%mm6 \n\t"\
  1142. "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
  1143. "movd %%mm3, 32+" #dst " \n\t"\
  1144. "psrad $" #shift ", %%mm4 \n\t"\
  1145. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  1146. "movd %%mm6, 48+" #dst " \n\t"\
  1147. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  1148. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  1149. "movd %%mm4, 64+" #dst " \n\t"\
  1150. "movd %%mm5, 80+" #dst " \n\t"
  1151. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  1152. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1153. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1154. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1155. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1156. "jmp 9f \n\t"
  1157. "#.balign 16 \n\t"
  1158. "7: \n\t"
  1159. #undef IDCT
  1160. #define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  1161. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  1162. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  1163. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1164. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  1165. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1166. #rounder ", %%mm4 \n\t"\
  1167. #rounder ", %%mm0 \n\t"\
  1168. "psrad $" #shift ", %%mm4 \n\t"\
  1169. "psrad $" #shift ", %%mm0 \n\t"\
  1170. "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
  1171. "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
  1172. "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  1173. "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
  1174. "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  1175. "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
  1176. #rounder ", %%mm1 \n\t"\
  1177. #rounder ", %%mm2 \n\t"\
  1178. "psrad $" #shift ", %%mm1 \n\t"\
  1179. "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
  1180. "movq %%mm4, " #dst " \n\t"\
  1181. "psrad $" #shift ", %%mm2 \n\t"\
  1182. "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
  1183. "movq %%mm0, 16+" #dst " \n\t"\
  1184. "movq %%mm0, 96+" #dst " \n\t"\
  1185. "movq %%mm4, 112+" #dst " \n\t"\
  1186. "movq %%mm0, 32+" #dst " \n\t"\
  1187. "movq %%mm4, 48+" #dst " \n\t"\
  1188. "movq %%mm4, 64+" #dst " \n\t"\
  1189. "movq %%mm0, 80+" #dst " \n\t"
  1190. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  1191. IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1192. //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1193. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1194. //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1195. #endif
  1196. /*
  1197. Input
  1198. 00 40 04 44 20 60 24 64
  1199. 10 30 14 34 50 70 54 74
  1200. 01 41 03 43 21 61 23 63
  1201. 11 31 13 33 51 71 53 73
  1202. 02 42 06 46 22 62 26 66
  1203. 12 32 16 36 52 72 56 76
  1204. 05 45 07 47 25 65 27 67
  1205. 15 35 17 37 55 75 57 77
  1206. Temp
  1207. 00 04 10 14 20 24 30 34
  1208. 40 44 50 54 60 64 70 74
  1209. 01 03 11 13 21 23 31 33
  1210. 41 43 51 53 61 63 71 73
  1211. 02 06 12 16 22 26 32 36
  1212. 42 46 52 56 62 66 72 76
  1213. 05 07 15 17 25 27 35 37
  1214. 45 47 55 57 65 67 75 77
  1215. */
  1216. "9: \n\t"
  1217. :: "r" (block), "r" (temp), "r" (coeffs)
  1218. : "%eax"
  1219. );
  1220. }
  1221. void simple_idct_mmx(int16_t *block)
  1222. {
  1223. idct(block);
  1224. }