You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1285 lines
52KB

  1. /*
  2. Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  14. */
  15. #include <inttypes.h>
  16. #include "../dsputil.h"
  17. #include "../mangle.h"
  18. #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  19. #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  20. #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  21. #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  22. #define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  23. #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  24. #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  25. #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  26. #define ROW_SHIFT 11
  27. #define COL_SHIFT 20 // 6
  28. static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
  29. static uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
  30. static int16_t __attribute__((aligned(8))) temp[64];
  31. static int16_t __attribute__((aligned(8))) coeffs[]= {
  32. 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
  33. // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
  34. // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
  35. 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
  36. // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
  37. // 0, 0, 0, 0,
  38. // 0, 0, 0, 0,
  39. C4, C4, C4, C4,
  40. C4, -C4, C4, -C4,
  41. C2, C6, C2, C6,
  42. C6, -C2, C6, -C2,
  43. C1, C3, C1, C3,
  44. C5, C7, C5, C7,
  45. C3, -C7, C3, -C7,
  46. -C1, -C5, -C1, -C5,
  47. C5, -C1, C5, -C1,
  48. C7, C3, C7, C3,
  49. C7, -C5, C7, -C5,
  50. C3, -C1, C3, -C1
  51. };
  52. static void unused_var_killer(){
  53. int a= wm1010 + d40000;
  54. temp[0]=a;
  55. }
  56. #if 0
/*
 * Plain-C reference implementation of the column (second) 1-D IDCT pass.
 * This whole section is compiled out (#if 0) and kept only as documentation
 * for the MMX version below.
 *
 * col:   one column of the 8x8 block, accessed with stride 8
 * input: row-pass output; the gather below un-interleaves the word order
 *        produced by the MMX row pass.  NOTE(review): the exact layout is
 *        inferred from the index pattern — confirm against the asm stores.
 */
static void inline idctCol (int16_t * col, int16_t *input)
{
/* Replace the file-level coefficient macros with local constants so the
   arithmetic below reads naturally. */
#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
	int a0, a1, a2, a3, b0, b1, b2, b3;
	const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/*
	if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
		col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
			col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
		return;
	}*/
	/* Gather: un-interleave the row-pass output into natural coefficient order. */
	col[8*0] = input[8*0 + 0];
	col[8*1] = input[8*2 + 0];
	col[8*2] = input[8*0 + 1];
	col[8*3] = input[8*2 + 1];
	col[8*4] = input[8*4 + 0];
	col[8*5] = input[8*6 + 0];
	col[8*6] = input[8*4 + 1];
	col[8*7] = input[8*6 + 1];
	/* Even part (rounding folded in) ... */
	a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
	a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
	a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
	a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
	/* ... and odd part of the 8-point transform. */
	b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
	b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
	b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
	b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
	/* Butterfly and descale back to 16 bits. */
	col[8*0] = (a0 + b0) >> COL_SHIFT;
	col[8*1] = (a1 + b1) >> COL_SHIFT;
	col[8*2] = (a2 + b2) >> COL_SHIFT;
	col[8*3] = (a3 + b3) >> COL_SHIFT;
	col[8*4] = (a3 - b3) >> COL_SHIFT;
	col[8*5] = (a2 - b2) >> COL_SHIFT;
	col[8*6] = (a1 - b1) >> COL_SHIFT;
	col[8*7] = (a0 - b0) >> COL_SHIFT;
}
  107. static void inline idctRow (int16_t * output, int16_t * input)
  108. {
  109. int16_t row[8];
  110. int a0, a1, a2, a3, b0, b1, b2, b3;
  111. const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  112. const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  113. const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  114. const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  115. const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  116. const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  117. const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  118. const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  119. row[0] = input[0];
  120. row[2] = input[1];
  121. row[4] = input[4];
  122. row[6] = input[5];
  123. row[1] = input[8];
  124. row[3] = input[9];
  125. row[5] = input[12];
  126. row[7] = input[13];
  127. if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
  128. row[0] = row[1] = row[2] = row[3] = row[4] =
  129. row[5] = row[6] = row[7] = row[0]<<3;
  130. output[0] = row[0];
  131. output[2] = row[1];
  132. output[4] = row[2];
  133. output[6] = row[3];
  134. output[8] = row[4];
  135. output[10] = row[5];
  136. output[12] = row[6];
  137. output[14] = row[7];
  138. return;
  139. }
  140. a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
  141. a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
  142. a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
  143. a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
  144. b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
  145. b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
  146. b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
  147. b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
  148. row[0] = (a0 + b0) >> ROW_SHIFT;
  149. row[1] = (a1 + b1) >> ROW_SHIFT;
  150. row[2] = (a2 + b2) >> ROW_SHIFT;
  151. row[3] = (a3 + b3) >> ROW_SHIFT;
  152. row[4] = (a3 - b3) >> ROW_SHIFT;
  153. row[5] = (a2 - b2) >> ROW_SHIFT;
  154. row[6] = (a1 - b1) >> ROW_SHIFT;
  155. row[7] = (a0 - b0) >> ROW_SHIFT;
  156. output[0] = row[0];
  157. output[2] = row[1];
  158. output[4] = row[2];
  159. output[6] = row[3];
  160. output[8] = row[4];
  161. output[10] = row[5];
  162. output[12] = row[6];
  163. output[14] = row[7];
  164. }
  165. #endif
  166. static inline void idct(int16_t *block)
  167. {
  168. asm volatile(
  169. #if 0 //Alternative, simpler variant
  170. #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  171. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  172. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  173. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  174. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  175. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  176. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  177. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  178. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  179. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  180. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  181. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  182. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  183. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  184. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  185. #rounder ", %%mm4 \n\t"\
  186. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  187. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  188. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  189. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  190. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  191. #rounder ", %%mm0 \n\t"\
  192. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  193. "paddd %%mm0, %%mm0 \n\t" \
  194. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  195. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  196. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  197. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  198. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  199. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  200. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  201. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  202. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  203. "psrad $" #shift ", %%mm7 \n\t"\
  204. "psrad $" #shift ", %%mm4 \n\t"\
  205. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  206. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  207. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  208. "psrad $" #shift ", %%mm1 \n\t"\
  209. "psrad $" #shift ", %%mm2 \n\t"\
  210. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  211. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  212. "movq %%mm7, " #dst " \n\t"\
  213. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  214. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  215. "movq %%mm2, 24+" #dst " \n\t"\
  216. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  217. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  218. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  219. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  220. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  221. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  222. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  223. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  224. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  225. "psrad $" #shift ", %%mm2 \n\t"\
  226. "psrad $" #shift ", %%mm0 \n\t"\
  227. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  228. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  229. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  230. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  231. "psrad $" #shift ", %%mm6 \n\t"\
  232. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  233. "movq %%mm2, 8+" #dst " \n\t"\
  234. "psrad $" #shift ", %%mm4 \n\t"\
  235. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  236. "movq %%mm4, 16+" #dst " \n\t"\
  237. #define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  238. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  239. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  240. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  241. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  242. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  243. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  244. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  245. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  246. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  247. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  248. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  249. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  250. #rounder ", %%mm4 \n\t"\
  251. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  252. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  253. #rounder ", %%mm0 \n\t"\
  254. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  255. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  256. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  257. "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  258. "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
  259. "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
  260. "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
  261. "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  262. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  263. "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
  264. "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
  265. "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  266. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  267. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  268. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  269. "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
  270. "psrad $" #shift ", %%mm7 \n\t"\
  271. "psrad $" #shift ", %%mm4 \n\t"\
  272. "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
  273. "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  274. "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  275. "psrad $" #shift ", %%mm0 \n\t"\
  276. "psrad $" #shift ", %%mm2 \n\t"\
  277. "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  278. "movd %%mm7, " #dst " \n\t"\
  279. "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
  280. "movd %%mm0, 16+" #dst " \n\t"\
  281. "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  282. "movd %%mm2, 96+" #dst " \n\t"\
  283. "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  284. "movd %%mm4, 112+" #dst " \n\t"\
  285. "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
  286. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  287. "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  288. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  289. "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  290. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  291. "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
  292. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  293. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  294. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  295. "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
  296. "psrad $" #shift ", %%mm2 \n\t"\
  297. "psrad $" #shift ", %%mm5 \n\t"\
  298. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  299. "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
  300. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  301. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  302. "psrad $" #shift ", %%mm6 \n\t"\
  303. "psrad $" #shift ", %%mm4 \n\t"\
  304. "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  305. "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  306. "movd %%mm2, 32+" #dst " \n\t"\
  307. "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
  308. "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
  309. "movd %%mm6, 48+" #dst " \n\t"\
  310. "movd %%mm4, 64+" #dst " \n\t"\
  311. "movd %%mm5, 80+" #dst " \n\t"\
  312. #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  313. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  314. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  315. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  316. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  317. "movq "MANGLE(wm1010)", %%mm4 \n\t"\
  318. "pand %%mm0, %%mm4 \n\t"\
  319. "por %%mm1, %%mm4 \n\t"\
  320. "por %%mm2, %%mm4 \n\t"\
  321. "por %%mm3, %%mm4 \n\t"\
  322. "packssdw %%mm4,%%mm4 \n\t"\
  323. "movd %%mm4, %%eax \n\t"\
  324. "orl %%eax, %%eax \n\t"\
  325. "jz 1f \n\t"\
  326. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  327. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  328. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  329. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  330. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  331. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  332. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  333. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  334. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  335. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  336. #rounder ", %%mm4 \n\t"\
  337. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  338. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  339. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  340. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  341. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  342. #rounder ", %%mm0 \n\t"\
  343. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  344. "paddd %%mm0, %%mm0 \n\t" \
  345. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  346. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  347. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  348. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  349. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  350. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  351. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  352. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  353. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  354. "psrad $" #shift ", %%mm7 \n\t"\
  355. "psrad $" #shift ", %%mm4 \n\t"\
  356. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  357. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  358. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  359. "psrad $" #shift ", %%mm1 \n\t"\
  360. "psrad $" #shift ", %%mm2 \n\t"\
  361. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  362. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  363. "movq %%mm7, " #dst " \n\t"\
  364. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  365. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  366. "movq %%mm2, 24+" #dst " \n\t"\
  367. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  368. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  369. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  370. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  371. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  372. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  373. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  374. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  375. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  376. "psrad $" #shift ", %%mm2 \n\t"\
  377. "psrad $" #shift ", %%mm0 \n\t"\
  378. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  379. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  380. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  381. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  382. "psrad $" #shift ", %%mm6 \n\t"\
  383. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  384. "movq %%mm2, 8+" #dst " \n\t"\
  385. "psrad $" #shift ", %%mm4 \n\t"\
  386. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  387. "movq %%mm4, 16+" #dst " \n\t"\
  388. "jmp 2f \n\t"\
  389. "1: \n\t"\
  390. "pslld $16, %%mm0 \n\t"\
  391. "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
  392. "psrad $13, %%mm0 \n\t"\
  393. "packssdw %%mm0, %%mm0 \n\t"\
  394. "movq %%mm0, " #dst " \n\t"\
  395. "movq %%mm0, 8+" #dst " \n\t"\
  396. "movq %%mm0, 16+" #dst " \n\t"\
  397. "movq %%mm0, 24+" #dst " \n\t"\
  398. "2: \n\t"
  399. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  400. ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
  401. /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
  402. ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
  403. ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
  404. DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
  405. DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
  406. DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
  407. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  408. COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  409. COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  410. COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  411. COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  412. #else
  413. #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  414. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  415. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  416. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  417. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  418. "movq "MANGLE(wm1010)", %%mm4 \n\t"\
  419. "pand %%mm0, %%mm4 \n\t"\
  420. "por %%mm1, %%mm4 \n\t"\
  421. "por %%mm2, %%mm4 \n\t"\
  422. "por %%mm3, %%mm4 \n\t"\
  423. "packssdw %%mm4,%%mm4 \n\t"\
  424. "movd %%mm4, %%eax \n\t"\
  425. "orl %%eax, %%eax \n\t"\
  426. "jz 1f \n\t"\
  427. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  428. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  429. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  430. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  431. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  432. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  433. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  434. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  435. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  436. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  437. #rounder ", %%mm4 \n\t"\
  438. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  439. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  440. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  441. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  442. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  443. #rounder ", %%mm0 \n\t"\
  444. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  445. "paddd %%mm0, %%mm0 \n\t" \
  446. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  447. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  448. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  449. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  450. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  451. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  452. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  453. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  454. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  455. "psrad $" #shift ", %%mm7 \n\t"\
  456. "psrad $" #shift ", %%mm4 \n\t"\
  457. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  458. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  459. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  460. "psrad $" #shift ", %%mm1 \n\t"\
  461. "psrad $" #shift ", %%mm2 \n\t"\
  462. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  463. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  464. "movq %%mm7, " #dst " \n\t"\
  465. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  466. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  467. "movq %%mm2, 24+" #dst " \n\t"\
  468. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  469. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  470. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  471. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  472. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  473. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  474. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  475. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  476. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  477. "psrad $" #shift ", %%mm2 \n\t"\
  478. "psrad $" #shift ", %%mm0 \n\t"\
  479. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  480. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  481. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  482. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  483. "psrad $" #shift ", %%mm6 \n\t"\
  484. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  485. "movq %%mm2, 8+" #dst " \n\t"\
  486. "psrad $" #shift ", %%mm4 \n\t"\
  487. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  488. "movq %%mm4, 16+" #dst " \n\t"\
  489. "jmp 2f \n\t"\
  490. "1: \n\t"\
  491. "pslld $16, %%mm0 \n\t"\
  492. "paddd "MANGLE(d40000)", %%mm0 \n\t"\
  493. "psrad $13, %%mm0 \n\t"\
  494. "packssdw %%mm0, %%mm0 \n\t"\
  495. "movq %%mm0, " #dst " \n\t"\
  496. "movq %%mm0, 8+" #dst " \n\t"\
  497. "movq %%mm0, 16+" #dst " \n\t"\
  498. "movq %%mm0, 24+" #dst " \n\t"\
  499. "2: \n\t"
  500. #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
  501. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  502. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  503. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  504. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  505. "movq %%mm0, %%mm4 \n\t"\
  506. "por %%mm1, %%mm4 \n\t"\
  507. "por %%mm2, %%mm4 \n\t"\
  508. "por %%mm3, %%mm4 \n\t"\
  509. "packssdw %%mm4,%%mm4 \n\t"\
  510. "movd %%mm4, %%eax \n\t"\
  511. "orl %%eax, %%eax \n\t"\
  512. "jz " #bt " \n\t"\
  513. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  514. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  515. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  516. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  517. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  518. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  519. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  520. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  521. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  522. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  523. #rounder ", %%mm4 \n\t"\
  524. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  525. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  526. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  527. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  528. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  529. #rounder ", %%mm0 \n\t"\
  530. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  531. "paddd %%mm0, %%mm0 \n\t" \
  532. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  533. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  534. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  535. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  536. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  537. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  538. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  539. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  540. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  541. "psrad $" #shift ", %%mm7 \n\t"\
  542. "psrad $" #shift ", %%mm4 \n\t"\
  543. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  544. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  545. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  546. "psrad $" #shift ", %%mm1 \n\t"\
  547. "psrad $" #shift ", %%mm2 \n\t"\
  548. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  549. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  550. "movq %%mm7, " #dst " \n\t"\
  551. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  552. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  553. "movq %%mm2, 24+" #dst " \n\t"\
  554. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  555. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  556. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  557. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  558. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  559. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  560. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  561. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  562. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  563. "psrad $" #shift ", %%mm2 \n\t"\
  564. "psrad $" #shift ", %%mm0 \n\t"\
  565. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  566. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  567. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  568. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  569. "psrad $" #shift ", %%mm6 \n\t"\
  570. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  571. "movq %%mm2, 8+" #dst " \n\t"\
  572. "psrad $" #shift ", %%mm4 \n\t"\
  573. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  574. "movq %%mm4, 16+" #dst " \n\t"\
  575. #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
  576. "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
  577. "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
  578. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  579. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  580. "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
  581. "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  582. "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
  583. "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
  584. "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
  585. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
  586. "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
  587. "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
  588. "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
  589. "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  590. #rounder ", %%mm4 \n\t"\
  591. "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
  592. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  593. "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
  594. "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
  595. "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  596. #rounder ", %%mm0 \n\t"\
  597. "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
  598. "paddd %%mm0, %%mm0 \n\t" \
  599. "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
  600. "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  601. "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
  602. "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
  603. "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  604. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  605. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  606. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  607. "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
  608. "psrad $" #shift ", %%mm7 \n\t"\
  609. "psrad $" #shift ", %%mm4 \n\t"\
  610. "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
  611. "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
  612. "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
  613. "psrad $" #shift ", %%mm1 \n\t"\
  614. "psrad $" #shift ", %%mm2 \n\t"\
  615. "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
  616. "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
  617. "movq %%mm7, " #dst " \n\t"\
  618. "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
  619. "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
  620. "movq %%mm2, 24+" #dst " \n\t"\
  621. "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  622. "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  623. "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  624. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  625. "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
  626. "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  627. "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
  628. "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
  629. "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
  630. "psrad $" #shift ", %%mm2 \n\t"\
  631. "psrad $" #shift ", %%mm0 \n\t"\
  632. "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
  633. "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
  634. "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
  635. "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
  636. "psrad $" #shift ", %%mm6 \n\t"\
  637. "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
  638. "movq %%mm2, 8+" #dst " \n\t"\
  639. "psrad $" #shift ", %%mm4 \n\t"\
  640. "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
  641. "movq %%mm4, 16+" #dst " \n\t"\
  642. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  643. DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
  644. Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
  645. Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
  646. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
  647. #undef IDCT
/* Column-pass IDCT, general case: reads all four packed source pairs
   (#src0 = R4/R0, #src4 = R6/R2, #src1 = R3/R1, #src5 = R7/R5), forms the
   even (A0..A3) and odd (B0..B3) butterflies with pmaddwd against the
   coefficient table at (%2), then stores eight saturated 16-bit results
   with movd at byte offsets 0,16,...,112 from #dst (one output column). */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
#rounder ", %%mm0 \n\t"\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
  723. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  724. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  725. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  726. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  727. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  728. "jmp 9f \n\t"
  729. "#.balign 16 \n\t"\
  730. "4: \n\t"
  731. Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
  732. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
  733. #undef IDCT
/* Column-pass IDCT specialized for #src1 (the R3/R1 pair) == 0: #src1 is
   never loaded, so the odd-part sums B0..B3 use only the R7/R5 products
   from #src5.  Same output layout as the general variant (movd stores at
   #dst + 0,16,...,112). */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
#rounder ", %%mm0 \n\t"\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm1, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm1, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
  797. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  798. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  799. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  800. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  801. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  802. "jmp 9f \n\t"
  803. "#.balign 16 \n\t"\
  804. "6: \n\t"
  805. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
  806. #undef IDCT
/* Column-pass IDCT specialized for #src4 and #src1 both == 0: only #src0
   (R4/R0) and #src5 (R7/R5) are read.  The even part collapses to the C4
   products of #src0 and the odd part to the #src5 products; outputs go to
   the usual eight movd slots at #dst + 0,16,...,112. */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
#rounder ", %%mm0 \n\t"\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm1, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm1, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
  861. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  862. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  863. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  864. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  865. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  866. "jmp 9f \n\t"
  867. "#.balign 16 \n\t"\
  868. "2: \n\t"
  869. Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
  870. #undef IDCT
/* Column-pass IDCT specialized for #src4 (the R6/R2 pair) == 0: the even
   part A0..A3 uses only the C4 products of #src0, while the odd part is
   computed in full from #src1 and #src5.  Output layout matches the
   general variant (movd stores at #dst + 0,16,...,112). */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
#rounder ", %%mm0 \n\t"\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
  937. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  938. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  939. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  940. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  941. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  942. "jmp 9f \n\t"
  943. "#.balign 16 \n\t"\
  944. "3: \n\t"
  945. #undef IDCT
/* Column-pass IDCT specialized for #src4 and #src5 both == 0: only #src0
   (R4/R0) and #src1 (R3/R1) are read.  Even part reduces to the C4
   products; odd part uses only the R3/R1 coefficient products.  Usual
   eight movd stores at #dst + 0,16,...,112. */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
#rounder ", %%mm0 \n\t"\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 64(%2), %%mm3 \n\t"\
"pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
"paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm1, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
"paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm1, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
  1000. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  1001. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1002. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1003. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1004. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1005. "jmp 9f \n\t"
  1006. "#.balign 16 \n\t"\
  1007. "5: \n\t"
  1008. #undef IDCT
/* Column-pass IDCT specialized for both odd pairs (#src1, #src5) == 0:
   the B terms vanish, so sum and difference outputs coincide and each
   even result is computed once, then stored with movq to mirrored offsets
   (0/112, 16/96, 32/80, 48/64 from #dst).  Processes two quads at once by
   also reading the second halves at 8+#src0 / 8+#src4. */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
#rounder ", %%mm0 \n\t"\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
"movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
#rounder ", %%mm1 \n\t"\
"paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
"paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
#rounder ", %%mm2 \n\t"\
"psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
"paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
"paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
"psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
"psrad $" #shift ", %%mm4 \n\t"\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm3 \n\t"\
"packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
"movq %%mm4, " #dst " \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 96+" #dst " \n\t"\
"movq %%mm4, 112+" #dst " \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movq %%mm5, 32+" #dst " \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movq %%mm6, 48+" #dst " \n\t"\
"movq %%mm6, 64+" #dst " \n\t"\
"movq %%mm5, 80+" #dst " \n\t"
  1065. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  1066. IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1067. //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1068. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1069. //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1070. "jmp 9f \n\t"
  1071. "#.balign 16 \n\t"\
  1072. "1: \n\t"
  1073. #undef IDCT
/* Column-pass IDCT specialized for #src5 (the R7/R5 pair) == 0: #src5 is
   never loaded, so the odd-part sums B0..B3 use only the #src1 (R3/R1)
   products.  Even part is computed in full from #src0 and #src4; the
   eight results go to the usual movd slots at #dst + 0,16,...,112. */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
#rounder ", %%mm0 \n\t"\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 64(%2), %%mm1 \n\t"\
"pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm3 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm3, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
"paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm3 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
"movd %%mm3, 32+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm6, 48+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
  1137. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  1138. IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1139. IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1140. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1141. IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1142. "jmp 9f \n\t"
  1143. "#.balign 16 \n\t"
  1144. "7: \n\t"
  1145. #undef IDCT
/* Column-pass IDCT for the sparsest case: only #src0 (R4/R0) is nonzero.
   Every output reduces to a C4 product of #src0, so two quads are computed
   (second half read from 8+#src0) and the two packed results are simply
   duplicated with movq to all eight row slots of #dst (offsets 0..112). */
#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
#rounder ", %%mm4 \n\t"\
#rounder ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
#rounder ", %%mm1 \n\t"\
#rounder ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
"movq %%mm4, " #dst " \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 96+" #dst " \n\t"\
"movq %%mm4, 112+" #dst " \n\t"\
"movq %%mm0, 32+" #dst " \n\t"\
"movq %%mm4, 48+" #dst " \n\t"\
"movq %%mm4, 64+" #dst " \n\t"\
"movq %%mm0, 80+" #dst " \n\t"
  1176. //IDCT( src0, src4, src1, src5, dst, rounder, shift)
  1177. IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1178. //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1179. IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1180. //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1181. #endif
  1182. /*
  1183. Input
  1184. 00 40 04 44 20 60 24 64
  1185. 10 30 14 34 50 70 54 74
  1186. 01 41 03 43 21 61 23 63
  1187. 11 31 13 33 51 71 53 73
  1188. 02 42 06 46 22 62 26 66
  1189. 12 32 16 36 52 72 56 76
  1190. 05 45 07 47 25 65 27 67
  1191. 15 35 17 37 55 75 57 77
  1192. Temp
  1193. 00 04 10 14 20 24 30 34
  1194. 40 44 50 54 60 64 70 74
  1195. 01 03 11 13 21 23 31 33
  1196. 41 43 51 53 61 63 71 73
  1197. 02 06 12 16 22 26 32 36
  1198. 42 46 52 56 62 66 72 76
  1199. 05 07 15 17 25 27 35 37
  1200. 45 47 55 57 65 67 75 77
  1201. */
  1202. "9: \n\t"
  1203. :: "r" (block), "r" (temp), "r" (coeffs)
  1204. : "%eax"
  1205. );
  1206. }
  1207. void simple_idct_mmx(int16_t *block)
  1208. {
  1209. idct(block);
  1210. }