/*
 * Simple IDCT MMX
 *
 * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/internal.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"

#include "libavcodec/idctdsp.h"

#include "idctdsp.h"
#include "simple_idct.h"

#if HAVE_INLINE_ASM

/*
23170.475006
22725.260826
21406.727617
19265.545870
16384.000000
12872.826198
 8866.956905
 4520.335430
*/
#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6  8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7  4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5

#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6
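
/*
 * The eight constants above are the values listed in the comment block,
 * i.e. cos(i*M_PI/16)*sqrt(2) in Q14, rounded to nearest; C4 alone is
 * rounded down (16384.0 -> 16383), as its "- 0.5" comment notes.
 * A minimal, illustrative sketch to regenerate them (not part of the
 * build, hypothetical helper only):
 *
 *     #include <math.h>
 *     #include <stdio.h>
 *
 *     int main(void)
 *     {
 *         for (int i = 0; i < 8; i++)
 *             printf("C%d = %d\n", i,
 *                    (int)(cos(i * M_PI / 16.0) * sqrt(2.0) * (1 << 14) + 0.5));
 *         return 0;
 *     }
 */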

DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;

DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
    1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
//  1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
//  0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
    1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
    // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
//  0, 0, 0, 0,
//  0, 0, 0, 0,
    C4,  C4,  C4,  C4,
    C4, -C4,  C4, -C4,
    C2,  C6,  C2,  C6,
    C6, -C2,  C6, -C2,
    C1,  C3,  C1,  C3,
    C5,  C7,  C5,  C7,
    C3, -C7,  C3, -C7,
   -C1, -C5, -C1, -C5,
    C5, -C1,  C5, -C1,
    C7,  C3,  C7,  C3,
    C7, -C5,  C7, -C5,
    C3, -C1,  C3, -C1
};
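
/*
 * For reference: the inline asm below reaches this table through operand %2,
 * 8 bytes per quadword, so the offsets used there line up as follows (the
 * register comments in the asm show each pair in reversed, high-to-low order):
 *    0(%2) /  8(%2)  row rounders
 *   16(%2)  C4  C4    24(%2)  C4 -C4
 *   32(%2)  C2  C6    40(%2)  C6 -C2
 *   48(%2)  C1  C3    56(%2)  C5  C7
 *   64(%2)  C3 -C7    72(%2) -C1 -C5
 *   80(%2)  C5 -C1    88(%2)  C7  C3
 *   96(%2)  C7 -C5   104(%2)  C3 -C1
 */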

static inline void idct(int16_t *block)
{
    DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
    int16_t * const temp= (int16_t*)align_tmp;

    __asm__ volatile(

#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
    "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
    "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
    "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
    "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
    "movq "MANGLE(wm1010)", %%mm4 \n\t"\
    "pand %%mm0, %%mm4 \n\t"\
    "por %%mm1, %%mm4 \n\t"\
    "por %%mm2, %%mm4 \n\t"\
    "por %%mm3, %%mm4 \n\t"\
    "packssdw %%mm4,%%mm4 \n\t"\
    "movd %%mm4, %%eax \n\t"\
    "orl %%eax, %%eax \n\t"\
    "jz 1f \n\t"\
    "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
    "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
    "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
    "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
    "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
    "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
    #rounder ", %%mm4 \n\t"\
    "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
    "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
    "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
    "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
    #rounder ", %%mm0 \n\t"\
    "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
    "paddd %%mm0, %%mm0 \n\t" \
    "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
    "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
    "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
    "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
    "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
    "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
    "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
    "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
    "psrad $" #shift ", %%mm7 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
    "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
    "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
    "psrad $" #shift ", %%mm1 \n\t"\
    "psrad $" #shift ", %%mm2 \n\t"\
    "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
    "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
    "movq %%mm7, " #dst " \n\t"\
    "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
    "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
    "movq %%mm2, 24+" #dst " \n\t"\
    "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
    "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
    "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
    "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
    "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
    "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
    "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
    "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
    "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
    "psrad $" #shift ", %%mm2 \n\t"\
    "psrad $" #shift ", %%mm0 \n\t"\
    "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
    "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
    "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
    "psrad $" #shift ", %%mm6 \n\t"\
    "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
    "movq %%mm2, 8+" #dst " \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
    "movq %%mm4, 16+" #dst " \n\t"\
    "jmp 2f \n\t"\
    "1: \n\t"\
    "pslld $16, %%mm0 \n\t"\
    "paddd "MANGLE(d40000)", %%mm0 \n\t"\
    "psrad $13, %%mm0 \n\t"\
    "packssdw %%mm0, %%mm0 \n\t"\
    "movq %%mm0, " #dst " \n\t"\
    "movq %%mm0, 8+" #dst " \n\t"\
    "movq %%mm0, 16+" #dst " \n\t"\
    "movq %%mm0, 24+" #dst " \n\t"\
    "2: \n\t"

#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
    "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
    "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
    "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
    "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
    "movq %%mm0, %%mm4 \n\t"\
    "por %%mm1, %%mm4 \n\t"\
    "por %%mm2, %%mm4 \n\t"\
    "por %%mm3, %%mm4 \n\t"\
    "packssdw %%mm4,%%mm4 \n\t"\
    "movd %%mm4, %%eax \n\t"\
    "orl %%eax, %%eax \n\t"\
    "jz " #bt " \n\t"\
    "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
    "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
    "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
    "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
    "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
    "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
    #rounder ", %%mm4 \n\t"\
    "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
    "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
    "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
    "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
    #rounder ", %%mm0 \n\t"\
    "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
    "paddd %%mm0, %%mm0 \n\t" \
    "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
    "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
    "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
    "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
    "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
    "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
    "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
    "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
    "psrad $" #shift ", %%mm7 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
    "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
    "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
    "psrad $" #shift ", %%mm1 \n\t"\
    "psrad $" #shift ", %%mm2 \n\t"\
    "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
    "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
    "movq %%mm7, " #dst " \n\t"\
    "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
    "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
    "movq %%mm2, 24+" #dst " \n\t"\
    "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
    "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
    "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
    "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
    "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
    "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
    "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
    "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
    "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
    "psrad $" #shift ", %%mm2 \n\t"\
    "psrad $" #shift ", %%mm0 \n\t"\
    "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
    "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
    "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
    "psrad $" #shift ", %%mm6 \n\t"\
    "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
    "movq %%mm2, 8+" #dst " \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
    "movq %%mm4, 16+" #dst " \n\t"\


#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
    "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
    "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
    "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
    "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
    "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
    "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
    "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
    "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
    "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
    "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
    #rounder ", %%mm4 \n\t"\
    "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
    "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
    "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
    "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
    #rounder ", %%mm0 \n\t"\
    "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
    "paddd %%mm0, %%mm0 \n\t" \
    "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
    "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
    "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
    "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
    "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
    "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
    "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
    "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
    "psrad $" #shift ", %%mm7 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
    "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
    "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
    "psrad $" #shift ", %%mm1 \n\t"\
    "psrad $" #shift ", %%mm2 \n\t"\
    "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
    "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
    "movq %%mm7, " #dst " \n\t"\
    "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
    "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
    "movq %%mm2, 24+" #dst " \n\t"\
    "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
    "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
    "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
    "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
    "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
    "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
    "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
    "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
    "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
    "psrad $" #shift ", %%mm2 \n\t"\
    "psrad $" #shift ", %%mm0 \n\t"\
    "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
    "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
    "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
    "psrad $" #shift ", %%mm6 \n\t"\
    "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
    "movq %%mm2, 8+" #dst " \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
    "movq %%mm4, 16+" #dst " \n\t"\

//IDCT( src0, src4, src1, src5, dst, rounder, shift)
    DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
    Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
    Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
    Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)

#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
    "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
    "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
    "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
    "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
    "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
    "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
    "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
    "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
    "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
    "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
    "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
    "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
    "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
    "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
    "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
    "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
    "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
    "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
    "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
    "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
    "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
    "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
    "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
    "psrad $" #shift ", %%mm7 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
    "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
    "psrad $" #shift ", %%mm0 \n\t"\
    "psrad $" #shift ", %%mm2 \n\t"\
    "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
    "movd %%mm7, " #dst " \n\t"\
    "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "movd %%mm0, 16+" #dst " \n\t"\
    "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
    "movd %%mm2, 96+" #dst " \n\t"\
    "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "movd %%mm4, 112+" #dst " \n\t"\
    "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
    "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
    "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
    "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
    "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
    "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
    "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
    "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
    "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
    "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
    "psrad $" #shift ", %%mm2 \n\t"\
    "psrad $" #shift ", %%mm5 \n\t"\
    "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
    "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
    "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
    "psrad $" #shift ", %%mm6 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
    "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "movd %%mm2, 32+" #dst " \n\t"\
    "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
    "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
    "movd %%mm6, 48+" #dst " \n\t"\
    "movd %%mm4, 64+" #dst " \n\t"\
    "movd %%mm5, 80+" #dst " \n\t"

//IDCT( src0, src4, src1, src5, dst, shift)
    IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
    IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
    IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
    IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
    "jmp 9f \n\t"

    "# .p2align 4 \n\t"\
    "4: \n\t"
    Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
    Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)

#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
    "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
    "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
    "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
    "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
    "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
    "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
    "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
    "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
    "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
    "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
    "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
    "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
    "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
    "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
    "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
    "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
    "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
    "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "psrad $" #shift ", %%mm1 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
    "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
    "psrad $" #shift ", %%mm0 \n\t"\
    "psrad $" #shift ", %%mm2 \n\t"\
    "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
    "movd %%mm1, " #dst " \n\t"\
    "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "movd %%mm0, 16+" #dst " \n\t"\
    "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
    "movd %%mm2, 96+" #dst " \n\t"\
    "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "movd %%mm4, 112+" #dst " \n\t"\
    "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
    "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
    "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
    "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
    "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
    "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
    "psrad $" #shift ", %%mm2 \n\t"\
    "psrad $" #shift ", %%mm5 \n\t"\
    "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
    "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
    "psrad $" #shift ", %%mm6 \n\t"\
    "psrad $" #shift ", %%mm1 \n\t"\
    "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
    "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "movd %%mm2, 32+" #dst " \n\t"\
    "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
    "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
    "movd %%mm6, 48+" #dst " \n\t"\
    "movd %%mm1, 64+" #dst " \n\t"\
    "movd %%mm5, 80+" #dst " \n\t"

//IDCT( src0, src4, src1, src5, dst, shift)
    IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
    IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
    IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
    IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
    "jmp 9f \n\t"

    "# .p2align 4 \n\t"\
    "6: \n\t"
    Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)

#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
    "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
    "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
    "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
    "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
    "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
    "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
    "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
    "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
    "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "psrad $" #shift ", %%mm1 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
    "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
    "psrad $" #shift ", %%mm0 \n\t"\
    "psrad $" #shift ", %%mm2 \n\t"\
    "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
    "movd %%mm1, " #dst " \n\t"\
    "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "movd %%mm0, 16+" #dst " \n\t"\
    "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
    "movd %%mm2, 96+" #dst " \n\t"\
    "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "movd %%mm4, 112+" #dst " \n\t"\
    "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
    "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
    "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
    "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
    "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
    "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
    "psrad $" #shift ", %%mm2 \n\t"\
    "psrad $" #shift ", %%mm5 \n\t"\
    "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
    "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
    "psrad $" #shift ", %%mm6 \n\t"\
    "psrad $" #shift ", %%mm1 \n\t"\
    "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
    "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "movd %%mm2, 32+" #dst " \n\t"\
    "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
    "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
    "movd %%mm6, 48+" #dst " \n\t"\
    "movd %%mm1, 64+" #dst " \n\t"\
    "movd %%mm5, 80+" #dst " \n\t"

//IDCT( src0, src4, src1, src5, dst, shift)
    IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
    IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
    IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
    IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
    "jmp 9f \n\t"

    "# .p2align 4 \n\t"\
    "2: \n\t"
    Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)

#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
    "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
    "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
    "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
    "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
    "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
    "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
    "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
    "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
    "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
    "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
    "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
    "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
    "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
    "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
    "psrad $" #shift ", %%mm7 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
    "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
    "psrad $" #shift ", %%mm0 \n\t"\
    "psrad $" #shift ", %%mm2 \n\t"\
    "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
    "movd %%mm7, " #dst " \n\t"\
    "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "movd %%mm0, 16+" #dst " \n\t"\
    "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
    "movd %%mm2, 96+" #dst " \n\t"\
    "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "movd %%mm4, 112+" #dst " \n\t"\
    "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
    "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
    "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
    "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
    "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
    "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
    "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
    "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
    "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
    "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
    "psrad $" #shift ", %%mm2 \n\t"\
    "psrad $" #shift ", %%mm5 \n\t"\
    "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
    "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
    "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
    "psrad $" #shift ", %%mm6 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
    "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "movd %%mm2, 32+" #dst " \n\t"\
    "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
    "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
    "movd %%mm6, 48+" #dst " \n\t"\
    "movd %%mm4, 64+" #dst " \n\t"\
    "movd %%mm5, 80+" #dst " \n\t"

//IDCT( src0, src4, src1, src5, dst, shift)
    IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
    IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
    IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
    IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
    "jmp 9f \n\t"

    "# .p2align 4 \n\t"\
    "3: \n\t"

#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
    "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
    "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
    "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
    "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
    "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 64(%2), %%mm3 \n\t"\
    "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
    "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
    "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
    "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "psrad $" #shift ", %%mm7 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
    "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
    "psrad $" #shift ", %%mm0 \n\t"\
    "psrad $" #shift ", %%mm1 \n\t"\
    "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
    "movd %%mm7, " #dst " \n\t"\
    "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "movd %%mm0, 16+" #dst " \n\t"\
    "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
    "movd %%mm1, 96+" #dst " \n\t"\
    "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "movd %%mm4, 112+" #dst " \n\t"\
    "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
    "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
    "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
    "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
    "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
    "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
    "psrad $" #shift ", %%mm1 \n\t"\
    "psrad $" #shift ", %%mm5 \n\t"\
    "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
    "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
    "psrad $" #shift ", %%mm6 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
    "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "movd %%mm1, 32+" #dst " \n\t"\
    "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
    "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
    "movd %%mm6, 48+" #dst " \n\t"\
    "movd %%mm4, 64+" #dst " \n\t"\
    "movd %%mm5, 80+" #dst " \n\t"

//IDCT( src0, src4, src1, src5, dst, shift)
    IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
    IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
    IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
    IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
    "jmp 9f \n\t"

    "# .p2align 4 \n\t"\
    "5: \n\t"

#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
    "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
    "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
    "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
    "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
    "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
    "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
    "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
    "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
    "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
    "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
    "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
    "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
    "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
    "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
    "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
    "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
    "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
    "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
    "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
    "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
    "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
    "psrad $" #shift ", %%mm4 \n\t"\
    "psrad $" #shift ", %%mm7 \n\t"\
    "psrad $" #shift ", %%mm3 \n\t"\
    "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
    "movq %%mm4, " #dst " \n\t"\
    "psrad $" #shift ", %%mm0 \n\t"\
    "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
    "movq %%mm0, 16+" #dst " \n\t"\
    "movq %%mm0, 96+" #dst " \n\t"\
    "movq %%mm4, 112+" #dst " \n\t"\
    "psrad $" #shift ", %%mm5 \n\t"\
    "psrad $" #shift ", %%mm6 \n\t"\
    "psrad $" #shift ", %%mm2 \n\t"\
    "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
    "movq %%mm5, 32+" #dst " \n\t"\
    "psrad $" #shift ", %%mm1 \n\t"\
    "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "movq %%mm6, 48+" #dst " \n\t"\
    "movq %%mm6, 64+" #dst " \n\t"\
    "movq %%mm5, 80+" #dst " \n\t"

//IDCT( src0, src4, src1, src5, dst, shift)
    IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
    IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
    "jmp 9f \n\t"

    "# .p2align 4 \n\t"\
    "1: \n\t"

#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
    "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
    "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
    "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
    "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
    "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
    "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
    "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
    "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
    "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
    "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
    "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
    "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
    "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
    "movq 64(%2), %%mm1 \n\t"\
    "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
    "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
    "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
    "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "psrad $" #shift ", %%mm7 \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
    "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
    "psrad $" #shift ", %%mm0 \n\t"\
    "psrad $" #shift ", %%mm3 \n\t"\
    "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
    "movd %%mm7, " #dst " \n\t"\
    "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
    "movd %%mm0, 16+" #dst " \n\t"\
    "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
    "movd %%mm3, 96+" #dst " \n\t"\
    "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
    "movd %%mm4, 112+" #dst " \n\t"\
    "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
    "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
    "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
    "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
    "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
    "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
    "psrad $" #shift ", %%mm3 \n\t"\
    "psrad $" #shift ", %%mm5 \n\t"\
    "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
    "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
    "psrad $" #shift ", %%mm6 \n\t"\
    "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
    "movd %%mm3, 32+" #dst " \n\t"\
    "psrad $" #shift ", %%mm4 \n\t"\
    "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
    "movd %%mm6, 48+" #dst " \n\t"\
    "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
    "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
    "movd %%mm4, 64+" #dst " \n\t"\
    "movd %%mm5, 80+" #dst " \n\t"

//IDCT( src0, src4, src1, src5, dst, shift)
    IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
    IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
    IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
    IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
    "jmp 9f \n\t"

    "# .p2align 4 \n\t"
    "7: \n\t"

#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
    "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
    "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "psrad $" #shift ", %%mm4 \n\t"\
    "psrad $" #shift ", %%mm0 \n\t"\
    "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
    "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
    "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
    "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
    "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
    "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
    "psrad $" #shift ", %%mm1 \n\t"\
    "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
    "movq %%mm4, " #dst " \n\t"\
    "psrad $" #shift ", %%mm2 \n\t"\
    "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
    "movq %%mm0, 16+" #dst " \n\t"\
    "movq %%mm0, 96+" #dst " \n\t"\
    "movq %%mm4, 112+" #dst " \n\t"\
    "movq %%mm0, 32+" #dst " \n\t"\
    "movq %%mm4, 48+" #dst " \n\t"\
    "movq %%mm4, 64+" #dst " \n\t"\
    "movq %%mm0, 80+" #dst " \n\t"

//IDCT( src0, src4, src1, src5, dst, shift)
    IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
    IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)

/*
Input
 00 40 04 44 20 60 24 64
 10 30 14 34 50 70 54 74
 01 41 03 43 21 61 23 63
 11 31 13 33 51 71 53 73
 02 42 06 46 22 62 26 66
 12 32 16 36 52 72 56 76
 05 45 07 47 25 65 27 67
 15 35 17 37 55 75 57 77

Temp
 00 04 10 14 20 24 30 34
 40 44 50 54 60 64 70 74
 01 03 11 13 21 23 31 33
 41 43 51 53 61 63 71 73
 02 06 12 16 22 26 32 36
 42 46 52 56 62 66 72 76
 05 07 15 17 25 27 35 37
 45 47 55 57 65 67 75 77
*/

    "9: \n\t"
    :: "r" (block), "r" (temp), "r" (coeffs)
    : "%eax"
    );
}

void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}

//FIXME merge add/put into the idct

void ff_simple_idct_put_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct(block);
    ff_put_pixels_clamped(block, dest, line_size);
}

void ff_simple_idct_add_mmx(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    idct(block);
    ff_add_pixels_clamped(block, dest, line_size);
}
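
/*
 * These three functions are the entry points the rest of the library uses;
 * selecting them is left to the IDCT init code. A rough sketch of how that
 * selection typically looks in Libav's x86 idctdsp init (names assumed here,
 * taken from the surrounding codebase rather than this file):
 *
 *     if (INLINE_MMX(cpu_flags) && avctx->idct_algo == FF_IDCT_SIMPLEMMX) {
 *         c->idct_put  = ff_simple_idct_put_mmx;
 *         c->idct_add  = ff_simple_idct_add_mmx;
 *         c->idct      = ff_simple_idct_mmx;
 *         c->perm_type = FF_IDCT_PERM_SIMPLE;
 *     }
 */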

#endif /* HAVE_INLINE_ASM */