You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1456 lines
53KB

  1. /*
  2. Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at)
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  14. */
#include <inttypes.h>
#include "../dsputil.h"

/* 16.14 fixed-point DCT basis constants: Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
#define C0 23170 // cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C4 16384 // cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6 8867 // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7 4520 // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5

#define ROW_SHIFT 11 // fixed-point downshift after the row (horizontal) pass
#define COL_SHIFT 20 // 6 -- fixed-point downshift after the column pass

/* 8-byte-aligned data referenced by symbol name from the inline asm below. */
static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL; // word mask keeping words 1 and 3 (used in the DC/zero tests)
static uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL; // rounder added in the DC-only shortcut path
/* Scratch block between row and column pass.
 * NOTE(review): file-scope static scratch makes the IDCT non-reentrant --
 * presumably only ever called from a single decoding thread; confirm. */
static int16_t __attribute__((aligned(8))) temp[64];
/* Coefficient table consumed via pmaddwd by the asm (operand %2):
 * rounders first, then pairs of constants duplicated to fill a 4x16-bit qword. */
static int16_t __attribute__((aligned(8))) coeffs[]= {
	1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
//	1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
//	0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
	1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
	// the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
//	0, 0, 0, 0,
//	0, 0, 0, 0,
	C4, C2, C4, C2,
	C4, C6, C4, C6,
	C1, C3, C1, C3,
	C5, C7, C5, C7,
	C4, C6, C4, C6,
	-C4, -C2, -C4, -C2,
	C3, -C7, C3, -C7,
	-C1, -C5, -C1, -C5,
	C4, -C6, C4, -C6,
	-C4, C2, -C4, C2,
	C5, -C1, C5, -C1,
	C7, C3, C7, C3,
	C4, -C2, C4, -C2,
	C4, -C6, C4, -C6,
	C7, -C5, C7, -C5,
	C3, -C1, C3, -C1
};
  55. #if 0
/*
 * Column (vertical) pass of the plain-C reference IDCT (dead code: inside #if 0).
 * Gathers one column's eight coefficients from input[] -- note the shuffled
 * indices, which presumably undo the interleaved layout produced by the MMX
 * row pass (verify against the asm WRITE macros) -- runs the even/odd
 * butterfly, and stores the result into col[] with stride 8.
 */
static void inline idctCol (int16_t * col, int16_t *input)
{
/* The global C0..C7 macros are #undef'd so the same names can be reused
 * below as typed local constants. */
#undef C0
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
	int a0, a1, a2, a3, b0, b1, b2, b3;
	const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
	const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
/* Disabled DC-only shortcut, kept for reference:
	if( !(col[8*1] | col[8*2] |col[8*3] |col[8*4] |col[8*5] |col[8*6] | col[8*7])) {
		col[8*0] = col[8*1] = col[8*2] = col[8*3] = col[8*4] =
			col[8*5] = col[8*6] = col[8*7] = col[8*0]<<3;
		return;
	}*/
	/* De-interleave the row-pass output into a straight column. */
	col[8*0] = input[8*0 + 0];
	col[8*1] = input[8*2 + 0];
	col[8*2] = input[8*0 + 1];
	col[8*3] = input[8*2 + 1];
	col[8*4] = input[8*4 + 0];
	col[8*5] = input[8*6 + 0];
	col[8*6] = input[8*4 + 1];
	col[8*7] = input[8*6 + 1];
	/* Even part (a*) with rounding built in; odd part (b*) unrounded. */
	a0 = C4*col[8*0] + C2*col[8*2] + C4*col[8*4] + C6*col[8*6] + (1<<(COL_SHIFT-1));
	a1 = C4*col[8*0] + C6*col[8*2] - C4*col[8*4] - C2*col[8*6] + (1<<(COL_SHIFT-1));
	a2 = C4*col[8*0] - C6*col[8*2] - C4*col[8*4] + C2*col[8*6] + (1<<(COL_SHIFT-1));
	a3 = C4*col[8*0] - C2*col[8*2] + C4*col[8*4] - C6*col[8*6] + (1<<(COL_SHIFT-1));
	b0 = C1*col[8*1] + C3*col[8*3] + C5*col[8*5] + C7*col[8*7];
	b1 = C3*col[8*1] - C7*col[8*3] - C1*col[8*5] - C5*col[8*7];
	b2 = C5*col[8*1] - C1*col[8*3] + C7*col[8*5] + C3*col[8*7];
	b3 = C7*col[8*1] - C5*col[8*3] + C3*col[8*5] - C1*col[8*7];
	/* Butterfly: outputs 0..3 are a+b, outputs 7..4 mirror as a-b. */
	col[8*0] = (a0 + b0) >> COL_SHIFT;
	col[8*1] = (a1 + b1) >> COL_SHIFT;
	col[8*2] = (a2 + b2) >> COL_SHIFT;
	col[8*3] = (a3 + b3) >> COL_SHIFT;
	col[8*4] = (a3 - b3) >> COL_SHIFT;
	col[8*5] = (a2 - b2) >> COL_SHIFT;
	col[8*6] = (a1 - b1) >> COL_SHIFT;
	col[8*7] = (a0 - b0) >> COL_SHIFT;
}
  106. static void inline idctRow (int16_t * output, int16_t * input)
  107. {
  108. int16_t row[8];
  109. int a0, a1, a2, a3, b0, b1, b2, b3;
  110. const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  111. const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  112. const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  113. const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  114. const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  115. const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  116. const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  117. const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
  118. row[0] = input[0];
  119. row[2] = input[1];
  120. row[4] = input[4];
  121. row[6] = input[5];
  122. row[1] = input[8];
  123. row[3] = input[9];
  124. row[5] = input[12];
  125. row[7] = input[13];
  126. if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) {
  127. row[0] = row[1] = row[2] = row[3] = row[4] =
  128. row[5] = row[6] = row[7] = row[0]<<3;
  129. output[0] = row[0];
  130. output[2] = row[1];
  131. output[4] = row[2];
  132. output[6] = row[3];
  133. output[8] = row[4];
  134. output[10] = row[5];
  135. output[12] = row[6];
  136. output[14] = row[7];
  137. return;
  138. }
  139. a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1));
  140. a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1));
  141. a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1));
  142. a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1));
  143. b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7];
  144. b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7];
  145. b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7];
  146. b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7];
  147. row[0] = (a0 + b0) >> ROW_SHIFT;
  148. row[1] = (a1 + b1) >> ROW_SHIFT;
  149. row[2] = (a2 + b2) >> ROW_SHIFT;
  150. row[3] = (a3 + b3) >> ROW_SHIFT;
  151. row[4] = (a3 - b3) >> ROW_SHIFT;
  152. row[5] = (a2 - b2) >> ROW_SHIFT;
  153. row[6] = (a1 - b1) >> ROW_SHIFT;
  154. row[7] = (a0 - b0) >> ROW_SHIFT;
  155. output[0] = row[0];
  156. output[2] = row[1];
  157. output[4] = row[2];
  158. output[6] = row[3];
  159. output[8] = row[4];
  160. output[10] = row[5];
  161. output[12] = row[6];
  162. output[14] = row[7];
  163. }
  164. #endif
  165. static inline void idct(int16_t *block)
  166. {
  167. int i;
  168. //for(i=0; i<64; i++) temp[i]= block[ block_permute_op(i) ];
  169. //for(i=0; i<64; i++) temp[block_permute_op(i)]= block[ i ];
  170. //for(i=0; i<64; i++) block[i]= temp[i];
  171. //block_permute(block);
  172. /*
  173. idctRow(temp, block);
  174. idctRow(temp+16, block+16);
  175. idctRow(temp+1, block+2);
  176. idctRow(temp+17, block+18);
  177. idctRow(temp+32, block+32);
  178. idctRow(temp+48, block+48);
  179. idctRow(temp+33, block+34);
  180. idctRow(temp+49, block+50);
  181. */
  182. asm volatile(
  183. // "lea 64(%0), %%eax \n\t"
  184. //r0,r2,R0,R2 r4,r6,R4,R6 r1,r3,R1,R3 r5,r7,R5,R7
  185. //src0 src4 src1 src5
  186. //r0,R0,r7,R7 r1,R1,r6,R6 r2,R2,r5,R5 r3,R3,r4,R4
  187. //dst0 dst1 dst2 dst3
  188. #if 0 //Alternative, simpler variant
  189. #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  190. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  191. "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
  192. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  193. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  194. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  195. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  196. "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  197. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
  198. "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
  199. "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  200. "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
  201. "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  202. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  203. #rounder ", %%mm4 \n\t"\
  204. \
  205. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  206. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  207. "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
  208. "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
  209. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  210. "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  211. "psrad $" #shift ", %%mm6 \n\t"\
  212. "psrad $" #shift ", %%mm4 \n\t"\
  213. WRITE0(%%mm6, %%mm4, dst) \
  214. \
  215. "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
  216. "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
  217. "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
  218. "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  219. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  220. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  221. "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
  222. #rounder ", %%mm4 \n\t"\
  223. \
  224. "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
  225. "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  226. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  227. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  228. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  229. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  230. "psrad $" #shift ", %%mm6 \n\t"\
  231. "psrad $" #shift ", %%mm4 \n\t"\
  232. WRITE1(%%mm6, %%mm4, dst, %%mm7) \
  233. \
  234. "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
  235. "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
  236. "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
  237. "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  238. "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  239. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  240. "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
  241. #rounder ", %%mm4 \n\t"\
  242. \
  243. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  244. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  245. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  246. "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
  247. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  248. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  249. "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  250. "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  251. "psrad $" #shift ", %%mm6 \n\t"\
  252. "psrad $" #shift ", %%mm4 \n\t"\
  253. \
  254. "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
  255. #rounder ", %%mm0 \n\t"\
  256. "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
  257. "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
  258. "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
  259. "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
  260. "psrad $" #shift ", %%mm2 \n\t"\
  261. "psrad $" #shift ", %%mm0 \n\t"\
  262. WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)
  263. #define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  264. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  265. "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
  266. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  267. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  268. "movq wm1010, %%mm4 \n\t"\
  269. "pand %%mm0, %%mm4 \n\t"\
  270. "por %%mm1, %%mm4 \n\t"\
  271. "por %%mm2, %%mm4 \n\t"\
  272. "por %%mm3, %%mm4 \n\t"\
  273. "packssdw %%mm4,%%mm4 \n\t"\
  274. "movd %%mm4, %%eax \n\t"\
  275. "orl %%eax, %%eax \n\t"\
  276. "jz 1f \n\t"\
  277. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  278. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  279. "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  280. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
  281. "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
  282. "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  283. "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
  284. "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  285. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  286. #rounder ", %%mm4 \n\t"\
  287. \
  288. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  289. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  290. "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
  291. "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
  292. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  293. "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  294. "psrad $" #shift ", %%mm6 \n\t"\
  295. "psrad $" #shift ", %%mm4 \n\t"\
  296. WRITE0(%%mm6, %%mm4, dst) \
  297. \
  298. "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
  299. "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
  300. "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
  301. "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  302. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  303. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  304. "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
  305. #rounder ", %%mm4 \n\t"\
  306. \
  307. "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
  308. "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  309. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  310. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  311. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  312. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  313. "psrad $" #shift ", %%mm6 \n\t"\
  314. "psrad $" #shift ", %%mm4 \n\t"\
  315. WRITE1(%%mm6, %%mm4, dst, %%mm7) \
  316. \
  317. "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
  318. "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
  319. "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
  320. "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  321. "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  322. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  323. "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
  324. #rounder ", %%mm4 \n\t"\
  325. \
  326. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  327. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  328. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  329. "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
  330. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  331. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  332. "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  333. "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  334. "psrad $" #shift ", %%mm6 \n\t"\
  335. "psrad $" #shift ", %%mm4 \n\t"\
  336. \
  337. "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
  338. #rounder ", %%mm0 \n\t"\
  339. "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
  340. "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
  341. "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
  342. "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
  343. "psrad $" #shift ", %%mm2 \n\t"\
  344. "psrad $" #shift ", %%mm0 \n\t"\
  345. WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
  346. "jmp 2f \n\t"\
  347. "1: \n\t"\
  348. WRITE3(%%mm0, dst)\
  349. "2: \n\t"\
  350. #define WRITE0(s0, s7, dst)\
  351. "movq " #s0 ", " #dst " \n\t" /* R0 r0 */\
  352. "movq " #s7 ", 24+" #dst " \n\t" /* R7 r7 */
  353. #define WRITE1(s1, s6, dst, tmp)\
  354. "movq " #dst ", " #tmp " \n\t" /* R0 r0 */\
  355. "packssdw " #s1 ", " #tmp " \n\t" /* R1 r1 R0 r0*/\
  356. "movq " #tmp ", " #dst " \n\t"\
  357. "movq 24+" #dst ", " #tmp " \n\t" /* R7 r7 */\
  358. "packssdw " #tmp ", " #s6 " \n\t" /* R7 r7 R6 r6*/\
  359. "movq " #s6 ", 24+" #dst " \n\t"
  360. #define WRITE2(s2, s5, s3, s4, dst)\
  361. "packssdw " #s3 ", " #s2 " \n\t" /* R3 r3 R2 r2*/\
  362. "packssdw " #s5 ", " #s4 " \n\t" /* R5 r5 R4 r4*/\
  363. "movq " #s2 ", 8+" #dst " \n\t"\
  364. "movq " #s4 ", 16+" #dst " \n\t"
  365. #define WRITE3(a, dst)\
  366. "pslld $16, " #a " \n\t"\
  367. "psrad $13, " #a " \n\t"\
  368. "packssdw " #a ", " #a " \n\t"\
  369. "movq " #a ", " #dst " \n\t"\
  370. "movq " #a ", 8+" #dst " \n\t"\
  371. "movq " #a ", 16+" #dst " \n\t"\
  372. "movq " #a ", 24+" #dst " \n\t"\
  373. //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
  374. IDCT_CORE( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
  375. /*
  376. DC_COND_IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
  377. DC_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
  378. DC_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
  379. */
  380. IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
  381. IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
  382. IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
  383. #undef WRITE0
  384. #undef WRITE1
  385. #undef WRITE2
  386. #define WRITE0(s0, s7, dst)\
  387. "packssdw " #s0 ", " #s0 " \n\t" /* C0, c0, C0, c0 */\
  388. "packssdw " #s7 ", " #s7 " \n\t" /* C7, c7, C7, c7 */\
  389. "movd " #s0 ", " #dst " \n\t" /* C0, c0 */\
  390. "movd " #s7 ", 112+" #dst " \n\t" /* C7, c7 */
  391. #define WRITE1(s1, s6, dst, tmp)\
  392. "packssdw " #s1 ", " #s1 " \n\t" /* C1, c1, C1, c1 */\
  393. "packssdw " #s6 ", " #s6 " \n\t" /* C6, c6, C6, c6 */\
  394. "movd " #s1 ", 16+" #dst " \n\t" /* C1, c1 */\
  395. "movd " #s6 ", 96+" #dst " \n\t" /* C6, c6 */
  396. #define WRITE2(s2, s5, s3, s4, dst)\
  397. "packssdw " #s2 ", " #s2 " \n\t" /* C2, c2, C2, c2 */\
  398. "packssdw " #s3 ", " #s3 " \n\t" /* C3, c3, C3, c3 */\
  399. "movd " #s2 ", 32+" #dst " \n\t" /* C2, c2 */\
  400. "movd " #s3 ", 48+" #dst " \n\t" /* C3, c3 */\
  401. "packssdw " #s4 ", " #s4 " \n\t" /* C4, c4, C4, c4 */\
  402. "packssdw " #s5 ", " #s5 " \n\t" /* C5, c5, C5, c5 */\
  403. "movd " #s4 ", 64+" #dst " \n\t" /* C4, c4 */\
  404. "movd " #s5 ", 80+" #dst " \n\t" /* C5, c5 */\
  405. //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
  406. IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  407. IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  408. IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  409. IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  410. #else
  411. #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  412. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  413. "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
  414. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  415. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  416. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  417. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  418. "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  419. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
  420. "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
  421. "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  422. "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
  423. "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  424. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  425. #rounder ", %%mm4 \n\t"\
  426. \
  427. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  428. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  429. "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
  430. "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
  431. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  432. "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  433. "psrad $" #shift ", %%mm6 \n\t"\
  434. "psrad $" #shift ", %%mm4 \n\t"\
  435. WRITE0(%%mm6, %%mm4, dst) \
  436. \
  437. "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
  438. "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
  439. "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
  440. "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  441. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  442. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  443. "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
  444. #rounder ", %%mm4 \n\t"\
  445. \
  446. "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
  447. "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  448. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  449. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  450. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  451. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  452. "psrad $" #shift ", %%mm6 \n\t"\
  453. "psrad $" #shift ", %%mm4 \n\t"\
  454. WRITE1(%%mm6, %%mm4, dst, %%mm7) \
  455. \
  456. "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
  457. "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
  458. "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
  459. "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  460. "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  461. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  462. "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
  463. #rounder ", %%mm4 \n\t"\
  464. \
  465. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  466. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  467. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  468. "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
  469. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  470. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  471. "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  472. "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  473. "psrad $" #shift ", %%mm6 \n\t"\
  474. "psrad $" #shift ", %%mm4 \n\t"\
  475. \
  476. "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
  477. #rounder ", %%mm0 \n\t"\
  478. "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
  479. "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
  480. "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
  481. "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
  482. "psrad $" #shift ", %%mm2 \n\t"\
  483. "psrad $" #shift ", %%mm0 \n\t"\
  484. WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)
  485. #define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  486. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  487. "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
  488. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  489. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  490. "movq wm1010, %%mm4 \n\t"\
  491. "pand %%mm0, %%mm4 \n\t"\
  492. "por %%mm1, %%mm4 \n\t"\
  493. "por %%mm2, %%mm4 \n\t"\
  494. "por %%mm3, %%mm4 \n\t"\
  495. "packssdw %%mm4,%%mm4 \n\t"\
  496. "movd %%mm4, %%eax \n\t"\
  497. "orl %%eax, %%eax \n\t"\
  498. "jz 1f \n\t"\
  499. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  500. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  501. "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  502. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
  503. "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
  504. "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  505. "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
  506. "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  507. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  508. #rounder ", %%mm4 \n\t"\
  509. \
  510. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  511. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  512. "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
  513. "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
  514. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  515. "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  516. "psrad $" #shift ", %%mm6 \n\t"\
  517. "psrad $" #shift ", %%mm4 \n\t"\
  518. WRITE0(%%mm6, %%mm4, dst) \
  519. \
  520. "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
  521. "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
  522. "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
  523. "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  524. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  525. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  526. "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
  527. #rounder ", %%mm4 \n\t"\
  528. \
  529. "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
  530. "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  531. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  532. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  533. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  534. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  535. "psrad $" #shift ", %%mm6 \n\t"\
  536. "psrad $" #shift ", %%mm4 \n\t"\
  537. WRITE1(%%mm6, %%mm4, dst, %%mm7) \
  538. \
  539. "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
  540. "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
  541. "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
  542. "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  543. "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  544. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  545. "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
  546. #rounder ", %%mm4 \n\t"\
  547. \
  548. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  549. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  550. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  551. "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
  552. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  553. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  554. "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  555. "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  556. "psrad $" #shift ", %%mm6 \n\t"\
  557. "psrad $" #shift ", %%mm4 \n\t"\
  558. \
  559. "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
  560. #rounder ", %%mm0 \n\t"\
  561. "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
  562. "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
  563. "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
  564. "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
  565. "psrad $" #shift ", %%mm2 \n\t"\
  566. "psrad $" #shift ", %%mm0 \n\t"\
  567. WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
  568. "jmp 2f \n\t"\
  569. "#.balign 16 \n\t"\
  570. "1: \n\t"\
  571. WRITE3(%%mm0, dst)\
  572. "2: \n\t"\
  573. #define Z_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift, bt) \
  574. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  575. "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
  576. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  577. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  578. "movq %%mm0, %%mm4 \n\t"\
  579. "por %%mm1, %%mm4 \n\t"\
  580. "por %%mm2, %%mm4 \n\t"\
  581. "por %%mm3, %%mm4 \n\t"\
  582. "packssdw %%mm4, %%mm4 \n\t"\
  583. "movd %%mm4, %%eax \n\t"\
  584. "orl %%eax, %%eax \n\t"\
  585. "jz " #bt " \n\t"\
  586. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  587. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  588. "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  589. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
  590. "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
  591. "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  592. "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
  593. "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  594. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  595. #rounder ", %%mm4 \n\t"\
  596. \
  597. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  598. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  599. "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
  600. "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
  601. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  602. "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  603. "psrad $" #shift ", %%mm6 \n\t"\
  604. "psrad $" #shift ", %%mm4 \n\t"\
  605. WRITE0(%%mm6, %%mm4, dst) \
  606. \
  607. "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
  608. "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
  609. "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
  610. "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  611. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  612. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  613. "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
  614. #rounder ", %%mm4 \n\t"\
  615. \
  616. "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
  617. "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  618. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  619. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  620. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  621. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  622. "psrad $" #shift ", %%mm6 \n\t"\
  623. "psrad $" #shift ", %%mm4 \n\t"\
  624. WRITE1(%%mm6, %%mm4, dst, %%mm7) \
  625. \
  626. "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
  627. "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
  628. "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
  629. "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  630. "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  631. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  632. "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
  633. #rounder ", %%mm4 \n\t"\
  634. \
  635. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  636. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  637. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  638. "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
  639. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  640. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  641. "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  642. "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  643. "psrad $" #shift ", %%mm6 \n\t"\
  644. "psrad $" #shift ", %%mm4 \n\t"\
  645. \
  646. "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
  647. #rounder ", %%mm0 \n\t"\
  648. "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
  649. "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
  650. "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
  651. "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
  652. "psrad $" #shift ", %%mm2 \n\t"\
  653. "psrad $" #shift ", %%mm0 \n\t"\
  654. WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\
  655. #define WRITE0(s0, s7, dst)\
  656. "movq " #s0 ", " #dst " \n\t" /* R0 r0 */\
  657. "movq " #s7 ", 24+" #dst " \n\t" /* R7 r7 */
  658. #define WRITE1(s1, s6, dst, tmp)\
  659. "movq " #dst ", " #tmp " \n\t" /* R0 r0 */\
  660. "packssdw " #s1 ", " #tmp " \n\t" /* R1 r1 R0 r0*/\
  661. "movq " #tmp ", " #dst " \n\t"\
  662. "movq 24+" #dst ", " #tmp " \n\t" /* R7 r7 */\
  663. "packssdw " #tmp ", " #s6 " \n\t" /* R7 r7 R6 r6*/\
  664. "movq " #s6 ", 24+" #dst " \n\t"
  665. #define WRITE2(s2, s5, s3, s4, dst)\
  666. "packssdw " #s3 ", " #s2 " \n\t" /* R3 r3 R2 r2*/\
  667. "packssdw " #s5 ", " #s4 " \n\t" /* R5 r5 R4 r4*/\
  668. "movq " #s2 ", 8+" #dst " \n\t"\
  669. "movq " #s4 ", 16+" #dst " \n\t"
  670. #define WRITE3(a, dst)\
  671. "pslld $16, " #a " \n\t"\
  672. "paddd d40000, " #a " \n\t"\
  673. "psrad $13, " #a " \n\t"\
  674. "packssdw " #a ", " #a " \n\t"\
  675. "movq " #a ", " #dst " \n\t"\
  676. "movq " #a ", 8+" #dst " \n\t"\
  677. "movq " #a ", 16+" #dst " \n\t"\
  678. "movq " #a ", 24+" #dst " \n\t"\
  679. #define WRITE0b(s0, s7, dst)\
  680. "packssdw " #s0 ", " #s0 " \n\t" /* C0, c0, C0, c0 */\
  681. "packssdw " #s7 ", " #s7 " \n\t" /* C7, c7, C7, c7 */\
  682. "movd " #s0 ", " #dst " \n\t" /* C0, c0 */\
  683. "movd " #s7 ", 112+" #dst " \n\t" /* C7, c7 */
  684. #define WRITE1b(s1, s6, dst, tmp)\
  685. "packssdw " #s1 ", " #s1 " \n\t" /* C1, c1, C1, c1 */\
  686. "packssdw " #s6 ", " #s6 " \n\t" /* C6, c6, C6, c6 */\
  687. "movd " #s1 ", 16+" #dst " \n\t" /* C1, c1 */\
  688. "movd " #s6 ", 96+" #dst " \n\t" /* C6, c6 */
  689. #define WRITE2b(s2, s5, s3, s4, dst)\
  690. "packssdw " #s2 ", " #s2 " \n\t" /* C2, c2, C2, c2 */\
  691. "packssdw " #s3 ", " #s3 " \n\t" /* C3, c3, C3, c3 */\
  692. "movd " #s2 ", 32+" #dst " \n\t" /* C2, c2 */\
  693. "movd " #s3 ", 48+" #dst " \n\t" /* C3, c3 */\
  694. "packssdw " #s4 ", " #s4 " \n\t" /* C4, c4, C4, c4 */\
  695. "packssdw " #s5 ", " #s5 " \n\t" /* C5, c5, C5, c5 */\
  696. "movd " #s4 ", 64+" #dst " \n\t" /* C4, c4 */\
  697. "movd " #s5 ", 80+" #dst " \n\t" /* C5, c5 */\
  698. //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
  699. DC_COND_IDCT_CORE( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
  700. Z_COND_IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
  701. Z_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
  702. Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
  703. #undef IDCT_CORE
  704. #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  705. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  706. "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
  707. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  708. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  709. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  710. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  711. "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  712. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
  713. "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
  714. "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  715. "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
  716. "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  717. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  718. \
  719. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  720. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  721. "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
  722. "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
  723. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  724. "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  725. "psrad $" #shift ", %%mm6 \n\t"\
  726. "psrad $" #shift ", %%mm4 \n\t"\
  727. WRITE0b(%%mm6, %%mm4, dst) \
  728. \
  729. "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
  730. "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
  731. "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
  732. "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  733. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  734. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  735. "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
  736. \
  737. "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
  738. "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  739. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  740. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  741. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  742. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  743. "psrad $" #shift ", %%mm6 \n\t"\
  744. "psrad $" #shift ", %%mm4 \n\t"\
  745. WRITE1b(%%mm6, %%mm4, dst, %%mm7) \
  746. \
  747. "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
  748. "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
  749. "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
  750. "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  751. "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  752. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  753. "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
  754. \
  755. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  756. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  757. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  758. "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
  759. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  760. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  761. "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  762. "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  763. "psrad $" #shift ", %%mm6 \n\t"\
  764. "psrad $" #shift ", %%mm4 \n\t"\
  765. \
  766. "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
  767. "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
  768. "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
  769. "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
  770. "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
  771. "psrad $" #shift ", %%mm2 \n\t"\
  772. "psrad $" #shift ", %%mm0 \n\t"\
  773. WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
  774. //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
  775. IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  776. IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  777. IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  778. IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  779. "jmp 9f \n\t"
  780. "#.balign 16 \n\t"\
  781. "4: \n\t"
  782. Z_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
  783. Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
  784. #undef IDCT_CORE
  785. #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  786. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  787. "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
  788. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  789. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  790. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  791. "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  792. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
  793. "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
  794. "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  795. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  796. \
  797. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  798. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  799. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  800. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  801. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  802. "psrad $" #shift ", %%mm7 \n\t"\
  803. "psrad $" #shift ", %%mm4 \n\t"\
  804. WRITE0b(%%mm7, %%mm4, dst) \
  805. \
  806. "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
  807. "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
  808. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  809. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  810. "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
  811. \
  812. "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
  813. "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  814. "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\
  815. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  816. "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  817. "psrad $" #shift ", %%mm7 \n\t"\
  818. "psrad $" #shift ", %%mm4 \n\t"\
  819. WRITE1b(%%mm7, %%mm4, dst, %%mm6) \
  820. \
  821. "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
  822. "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
  823. "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  824. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  825. "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
  826. \
  827. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  828. "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\
  829. "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
  830. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  831. "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  832. "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  833. "psrad $" #shift ", %%mm7 \n\t"\
  834. "psrad $" #shift ", %%mm4 \n\t"\
  835. \
  836. "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
  837. "paddd %%mm0, %%mm3 \n\t" /* A3+B3 a3+b3 */\
  838. "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
  839. "psubd %%mm3, %%mm0 \n\t" /* A3-B3 a3-b3 */\
  840. "psrad $" #shift ", %%mm3 \n\t"\
  841. "psrad $" #shift ", %%mm0 \n\t"\
  842. WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst)
  843. //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
  844. IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  845. IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  846. IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  847. IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  848. "jmp 9f \n\t"
  849. "#.balign 16 \n\t"\
  850. "6: \n\t"
  851. Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
  852. #undef IDCT_CORE
  853. #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  854. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  855. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  856. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  857. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  858. "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
  859. "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  860. \
  861. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  862. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  863. "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
  864. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  865. "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  866. "psrad $" #shift ", %%mm7 \n\t"\
  867. "psrad $" #shift ", %%mm4 \n\t"\
  868. WRITE0b(%%mm7, %%mm4, dst) \
  869. \
  870. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  871. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  872. \
  873. "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\
  874. "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  875. "paddd %%mm5, %%mm7 \n\t" /* A1+B1 a1+b1 */\
  876. "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\
  877. "psubd %%mm7, %%mm5 \n\t" /* A1-B1 a1-b1 */\
  878. "psrad $" #shift ", %%mm7 \n\t"\
  879. "psrad $" #shift ", %%mm5 \n\t"\
  880. WRITE1b(%%mm7, %%mm5, dst, %%mm6) \
  881. \
  882. "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  883. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  884. \
  885. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  886. "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\
  887. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  888. "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  889. "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  890. "psrad $" #shift ", %%mm7 \n\t"\
  891. "psrad $" #shift ", %%mm4 \n\t"\
  892. \
  893. "paddd %%mm0, %%mm3 \n\t" /* A3+B3 a3+b3 */\
  894. "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
  895. "psubd %%mm3, %%mm0 \n\t" /* A3-B3 a3-b3 */\
  896. "psrad $" #shift ", %%mm3 \n\t"\
  897. "psrad $" #shift ", %%mm0 \n\t"\
  898. WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst)
  899. //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
  900. IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  901. IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  902. IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  903. IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  904. "jmp 9f \n\t"
  905. "#.balign 16 \n\t"\
  906. "2: \n\t"
  907. Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
  908. #undef IDCT_CORE
  909. #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  910. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  911. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  912. "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
  913. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  914. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  915. "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
  916. "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  917. "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\
  918. "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
  919. \
  920. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  921. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  922. "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\
  923. "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
  924. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  925. "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  926. "psrad $" #shift ", %%mm6 \n\t"\
  927. "psrad $" #shift ", %%mm4 \n\t"\
  928. WRITE0b(%%mm6, %%mm4, dst) \
  929. \
  930. "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
  931. "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  932. "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
  933. "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
  934. \
  935. "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\
  936. "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  937. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  938. "paddd %%mm5, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  939. "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\
  940. "psubd %%mm6, %%mm5 \n\t" /* A1-B1 a1-b1 */\
  941. "psrad $" #shift ", %%mm6 \n\t"\
  942. "psrad $" #shift ", %%mm5 \n\t"\
  943. WRITE1b(%%mm6, %%mm5, dst, %%mm7) \
  944. \
  945. "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
  946. "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  947. "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
  948. "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
  949. \
  950. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  951. "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\
  952. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  953. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  954. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  955. "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  956. "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
  957. "psrad $" #shift ", %%mm6 \n\t"\
  958. "psrad $" #shift ", %%mm4 \n\t"\
  959. \
  960. "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\
  961. "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
  962. "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
  963. "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
  964. "psrad $" #shift ", %%mm2 \n\t"\
  965. "psrad $" #shift ", %%mm0 \n\t"\
  966. WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
  967. //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
  968. IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  969. IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  970. IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  971. IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  972. "jmp 9f \n\t"
  973. "#.balign 16 \n\t"\
  974. "3: \n\t"
  975. #undef IDCT_CORE
  976. #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  977. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  978. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  979. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  980. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  981. "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
  982. "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  983. \
  984. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  985. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  986. "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
  987. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  988. "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  989. "psrad $" #shift ", %%mm6 \n\t"\
  990. "psrad $" #shift ", %%mm4 \n\t"\
  991. WRITE0b(%%mm6, %%mm4, dst) \
  992. \
  993. "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
  994. "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  995. \
  996. "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\
  997. "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  998. "paddd %%mm5, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  999. "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\
  1000. "psubd %%mm6, %%mm5 \n\t" /* A1-B1 a1-b1 */\
  1001. "psrad $" #shift ", %%mm6 \n\t"\
  1002. "psrad $" #shift ", %%mm5 \n\t"\
  1003. WRITE1b(%%mm6, %%mm5, dst, %%mm7) \
  1004. \
  1005. "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
  1006. "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  1007. \
  1008. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  1009. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  1010. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  1011. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  1012. "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  1013. "psrad $" #shift ", %%mm6 \n\t"\
  1014. "psrad $" #shift ", %%mm4 \n\t"\
  1015. \
  1016. "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
  1017. "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
  1018. "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
  1019. "psrad $" #shift ", %%mm2 \n\t"\
  1020. "psrad $" #shift ", %%mm0 \n\t"\
  1021. WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
  1022. //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
  1023. IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1024. IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1025. IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1026. IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1027. "jmp 9f \n\t"
  1028. "#.balign 16 \n\t"\
  1029. "5: \n\t"
  1030. #undef IDCT_CORE
  1031. #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  1032. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  1033. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  1034. "movq %%mm4, %%mm6\n\t"\
  1035. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  1036. "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
  1037. "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  1038. "movq %%mm5, %%mm7\n\t"\
  1039. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
  1040. "movq 8+" #src0 ", %%mm2 \n\t" /*2R2 R0 r2 r0 */\
  1041. "pmaddwd %%mm2, %%mm6 \n\t" /*2C2R2+C4R0 C2r2+C4r0 */\
  1042. "movq 8+" #src4 ", %%mm3 \n\t" /*2R6 R4 r6 r4 */\
  1043. "pmaddwd %%mm3, %%mm7 \n\t" /*2C6R6+C4R4 C6r6+C4r4 */\
  1044. \
  1045. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  1046. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  1047. "psrad $" #shift ", %%mm4 \n\t"\
  1048. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  1049. \
  1050. "paddd %%mm7, %%mm6 \n\t" /*2A0 a0 */\
  1051. "movq 56(%2), %%mm7 \n\t" /* -C2 -C4 -C2 -C4 */\
  1052. "psrad $" #shift ", %%mm6 \n\t"\
  1053. "pmaddwd %%mm1, %%mm7 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
  1054. \
  1055. "packssdw %%mm6, %%mm4 \n\t" /* C0, c0, C0, c0 */\
  1056. "movq 48(%2), %%mm6 \n\t" /* C6 C4 C6 C4 */\
  1057. "movq %%mm4, " #dst " \n\t" /* C0, c0 */\
  1058. "pmaddwd %%mm2, %%mm6 \n\t" /*2C6R2+C4R0 C6r2+C4r0 */\
  1059. \
  1060. "movq %%mm4, 112+" #dst " \n\t" /* C0, c0 */\
  1061. "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
  1062. "pmaddwd %%mm3, %%mm4 \n\t" /*2-C2R6-C4R4 -C2r6-C4r4 */\
  1063. \
  1064. "paddd %%mm5, %%mm7 \n\t" /* A1 a1 */\
  1065. "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
  1066. "psrad $" #shift ", %%mm7 \n\t"\
  1067. "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  1068. \
  1069. "paddd %%mm4, %%mm6 \n\t" /*2A1 a1 */\
  1070. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  1071. \
  1072. "psrad $" #shift ", %%mm6 \n\t"\
  1073. "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
  1074. "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
  1075. \
  1076. "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
  1077. "packssdw %%mm6, %%mm7 \n\t" /* C1, c1, C1, c1 */\
  1078. \
  1079. "movq 80(%2), %%mm6 \n\t" /* -C6 C4 -C6 C4 */\
  1080. "movq %%mm7, 16+" #dst " \n\t" /* C1, c1 */\
  1081. "pmaddwd %%mm2, %%mm6 \n\t" /*2-C6R2+C4R0 -C6r2+C4r0 */\
  1082. \
  1083. "movq %%mm7, 96+" #dst " \n\t" /* C1, c1 */\
  1084. "movq 88(%2), %%mm7 \n\t" /* C2 -C4 C2 -C4 */\
  1085. "pmaddwd %%mm3, %%mm7 \n\t" /*2C2R6-C4R4 C2r6-C4r4 */\
  1086. \
  1087. "pmaddwd 112(%2), %%mm2 \n\t" /*2-C2R2+C4R0 -C2r2+C4r0 */\
  1088. "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
  1089. \
  1090. "pmaddwd 120(%2), %%mm3 \n\t" /*2-C6R6+C4R4 -C6r6+C4r4 */\
  1091. "psrad $" #shift ", %%mm4 \n\t"\
  1092. \
  1093. "paddd %%mm7, %%mm6 \n\t" /*2A2 a2 */\
  1094. "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
  1095. \
  1096. "psrad $" #shift ", %%mm6 \n\t"\
  1097. \
  1098. "packssdw %%mm6, %%mm4 \n\t" /* C2, c2, C2, c2 */\
  1099. "movq %%mm4, 32+" #dst " \n\t" /* C2, c2 */\
  1100. "psrad $" #shift ", %%mm0 \n\t"\
  1101. "paddd %%mm3, %%mm2 \n\t" /*2A3 a3 */\
  1102. \
  1103. "movq %%mm4, 80+" #dst " \n\t" /* C2, c2 */\
  1104. "psrad $" #shift ", %%mm2 \n\t"\
  1105. \
  1106. "packssdw %%mm2, %%mm0 \n\t" /* C3, c3, C3, c3 */\
  1107. "movq %%mm0, 48+" #dst " \n\t" /* C3, c3 */\
  1108. "movq %%mm0, 64+" #dst " \n\t" /* C3, c3 */\
  1109. //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
  1110. IDCT_CORE( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1111. //IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1112. IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1113. //IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1114. "jmp 9f \n\t"
  1115. "#.balign 16 \n\t"\
  1116. "1: \n\t"
  1117. #undef IDCT_CORE
  1118. #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  1119. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  1120. "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\
  1121. "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
  1122. "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\
  1123. "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  1124. "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  1125. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\
  1126. "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\
  1127. "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
  1128. "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
  1129. \
  1130. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  1131. "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  1132. "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\
  1133. "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
  1134. "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\
  1135. "psrad $" #shift ", %%mm6 \n\t"\
  1136. "psrad $" #shift ", %%mm4 \n\t"\
  1137. WRITE0b(%%mm6, %%mm4, dst) \
  1138. \
  1139. "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\
  1140. "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\
  1141. "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\
  1142. "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
  1143. "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\
  1144. \
  1145. "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\
  1146. "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  1147. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  1148. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  1149. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  1150. "psrad $" #shift ", %%mm6 \n\t"\
  1151. "psrad $" #shift ", %%mm4 \n\t"\
  1152. WRITE1b(%%mm6, %%mm4, dst, %%mm7) \
  1153. \
  1154. "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\
  1155. "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\
  1156. "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\
  1157. "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
  1158. "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\
  1159. \
  1160. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  1161. "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\
  1162. "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\
  1163. "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\
  1164. "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\
  1165. "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
  1166. "psrad $" #shift ", %%mm6 \n\t"\
  1167. "psrad $" #shift ", %%mm4 \n\t"\
  1168. \
  1169. "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\
  1170. "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\
  1171. "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\
  1172. "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\
  1173. "psrad $" #shift ", %%mm2 \n\t"\
  1174. "psrad $" #shift ", %%mm0 \n\t"\
  1175. WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst)
  1176. //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
  1177. IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1178. IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1179. IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1180. IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1181. "jmp 9f \n\t"
  1182. "#.balign 16 \n\t"
  1183. "7: \n\t"
  1184. #undef IDCT_CORE
  1185. #define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \
  1186. "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\
  1187. "movq 16(%2), %%mm2 \n\t" /* C2 C4 C2 C4 */\
  1188. "movq 8+" #src0 ", %%mm1 \n\t" /* R2 R0 r2 r0 */\
  1189. "pmaddwd %%mm0, %%mm2 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  1190. "movq 16(%2), %%mm3 \n\t" /* C2 C4 C2 C4 */\
  1191. "pmaddwd %%mm1, %%mm3 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\
  1192. \
  1193. "movq 48(%2), %%mm4 \n\t" /* C6 C4 C6 C4 */\
  1194. "pmaddwd %%mm0, %%mm4 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  1195. "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\
  1196. "pmaddwd %%mm1, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\
  1197. "movq 80(%2), %%mm6 \n\t" /* -C6 C4 -C6 C4 */\
  1198. "pmaddwd %%mm0, %%mm6 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  1199. "movq 80(%2), %%mm7 \n\t" /* -C6 C4 -C6 C4 */\
  1200. "pmaddwd %%mm1, %%mm7 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\
  1201. "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  1202. "psrad $" #shift ", %%mm2 \n\t"\
  1203. "psrad $" #shift ", %%mm3 \n\t"\
  1204. "pmaddwd 112(%2), %%mm1 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\
  1205. "packssdw %%mm3, %%mm2 \n\t" /* C0, c0, C0, c0 */\
  1206. "movq %%mm2, " #dst " \n\t" /* C0, c0 */\
  1207. "psrad $" #shift ", %%mm4 \n\t"\
  1208. "psrad $" #shift ", %%mm5 \n\t"\
  1209. "movq %%mm2, 112+" #dst " \n\t" /* C0, c0 */\
  1210. "packssdw %%mm5, %%mm4 \n\t" /* C1, c1, C1, c1 */\
  1211. "movq %%mm4, 16+" #dst " \n\t" /* C0, c0 */\
  1212. "psrad $" #shift ", %%mm7 \n\t"\
  1213. "psrad $" #shift ", %%mm6 \n\t"\
  1214. "movq %%mm4, 96+" #dst " \n\t" /* C0, c0 */\
  1215. "packssdw %%mm7, %%mm6 \n\t" /* C2, c2, C2, c2 */\
  1216. "movq %%mm6, 32+" #dst " \n\t" /* C0, c0 */\
  1217. "psrad $" #shift ", %%mm0 \n\t"\
  1218. "movq %%mm6, 80+" #dst " \n\t" /* C0, c0 */\
  1219. "psrad $" #shift ", %%mm1 \n\t"\
  1220. "packssdw %%mm1, %%mm0 \n\t" /* C3, c3, C3, c3 */\
  1221. "movq %%mm0, 48+" #dst " \n\t" /* C0, c0 */\
  1222. "movq %%mm0, 64+" #dst " \n\t" /* C0, c0 */\
  1223. //IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift)
  1224. IDCT_CORE( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20)
  1225. //IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20)
  1226. IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20)
  1227. //IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20)
  1228. #endif
  1229. /*
  1230. Input
  1231. 00 20 02 22 40 60 42 62
  1232. 10 30 12 32 50 70 52 72
  1233. 01 21 03 23 41 61 43 63
  1234. 11 31 13 33 51 71 53 73
  1235. 04 24 06 26 44 64 46 66
  1236. 14 34 16 36 54 74 56 76
  1237. ...
  1238. */
  1239. /*
  1240. Temp
  1241. 00 02 10 12 20 22 30 32
  1242. 40 42 50 52 60 62 70 72
  1243. 01 03 11 13 21 23 31 33
  1244. 41 43 51 53 61 63 71 73
  1245. 04 06 14 16 24 26 34 36
  1246. 44 46 54 56 64 66 74 76
  1247. 05 07 15 17 25 27 35 37
  1248. 45 47 55 57 65 67 75 77
  1249. */
  1250. /*
  1251. Output
  1252. 00 10 20 30 40 50 60 70
  1253. 01 11 21 31 41 51 61 71
  1254. ...
  1255. */
  1256. "9: \n\t"
  1257. :: "r" (block), "r" (temp), "r" (coeffs)
  1258. : "%eax"
  1259. );
  1260. /*
  1261. idctCol(block, temp);
  1262. idctCol(block+1, temp+2);
  1263. idctCol(block+2, temp+4);
  1264. idctCol(block+3, temp+6);
  1265. idctCol(block+4, temp+8);
  1266. idctCol(block+5, temp+10);
  1267. idctCol(block+6, temp+12);
  1268. idctCol(block+7, temp+14);
  1269. */
  1270. }
  1271. void simple_idct_mmx(int16_t *block)
  1272. {
  1273. static int imax=0, imin=0;
  1274. static int omax=0, omin=0;
  1275. int i, j;
  1276. /*
  1277. for(i=0; i<64; i++)
  1278. {
  1279. if(block[i] > imax)
  1280. {
  1281. imax= block[i];
  1282. printf("Input-Max: %d\n", imax);
  1283. printf("Input-Min: %d\n", imin);
  1284. printf("Output-Max: %d\n", omax);
  1285. printf("Output-Min: %d\n", omin);
  1286. }
  1287. if(block[i] < imin)
  1288. {
  1289. imin= block[i];
  1290. printf("Input-Max: %d\n", imax);
  1291. printf("Input-Min: %d\n", imin);
  1292. printf("Output-Max: %d\n", omax);
  1293. printf("Output-Min: %d\n", omin);
  1294. }
  1295. }*/
  1296. /* static int stat[64];
  1297. for(j=0; j<4; j++)
  1298. {
  1299. static int line[8]={0,2,1,3,4,6,5,7};
  1300. for(i=0; i<16; i++)
  1301. {
  1302. if(block[j*16+i])
  1303. {
  1304. stat[j*16+1]++;
  1305. break;
  1306. }
  1307. }
  1308. for(i=0; i<16; i++)
  1309. {
  1310. if(block[j*16+i] && i!=0 && i!=2)
  1311. {
  1312. stat[j*16+2]++;
  1313. break;
  1314. }
  1315. }
  1316. }
  1317. stat[0]++;*/
  1318. /* for(i=1; i<8; i++)
  1319. {
  1320. if(block[i] != 0)
  1321. {
  1322. stat[1]++;
  1323. break;
  1324. }
  1325. }
  1326. for(i=32; i<64; i++)
  1327. {
  1328. if(block[i] != 0)
  1329. {
  1330. stat[2]++;
  1331. break;
  1332. }
  1333. }
  1334. stat[0]++;
  1335. */
  1336. // return;
  1337. idct(block);
  1338. // memset(block, 0, 128);
  1339. /*
  1340. if(stat[0] > 100000)
  1341. for(i=0; i<64; i++)
  1342. {
  1343. if((i&7) == 0) printf("\n");
  1344. printf("%06d ", stat[i]);
  1345. }
  1346. */
  1347. /*
  1348. for(i=0; i<4; i++) printf("%d", stat[1+i*16]);
  1349. printf(" ");
  1350. for(i=0; i<4; i++) printf("%d", stat[2+i*16]);
  1351. printf("\n");
  1352. */
  1353. // printf("%d", stat[2]);
  1354. // memset(stat, 0, 256);
  1355. /*
  1356. for(i=0; i<64; i++)
  1357. {
  1358. if(block[i] > omax)
  1359. {
  1360. omax= block[i];
  1361. printf("Input-Max: %d\n", imax);
  1362. printf("Input-Min: %d\n", imin);
  1363. printf("Output-Max: %d\n", omax);
  1364. printf("Output-Min: %d\n", omin);
  1365. }
  1366. if(block[i] < omin)
  1367. {
  1368. omin= block[i];
  1369. printf("Input-Max: %d\n", imax);
  1370. printf("Input-Min: %d\n", imin);
  1371. printf("Output-Max: %d\n", omax);
  1372. printf("Output-Min: %d\n", omin);
  1373. }
  1374. }*/
  1375. }