You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1708 lines
66KB

  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. #include "../dsputil.h"
  /* CPU multimedia-extension flags.  The object has external linkage and is
     defined here; nothing in the visible part of this file reads or writes it. */
  22. int mm_flags; /* multimedia extension flags */
  /* Prototypes for block-comparison routines whose definitions are not in the
     visible part of this file.  NOTE(review): going by the names only, these
     are 16x16 / 8x8 comparisons with optional x2 / y2 / xy2 half-pel variants
     in an _mmx and an _mmx2 flavour -- confirm against the definitions. */
  23. /* FIXME use them in static form */
  24. int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  25. int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  26. int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  27. int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  28. int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  29. int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  30. int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  31. int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  32. int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  33. int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  34. int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  35. int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  36. int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  37. int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  38. int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  39. int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  /* Same idea with an extra leading context pointer (void *s). */
  40. int sad16x16_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
  41. int sad8x8_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
  42. int sad16x16_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
  43. int sad8x8_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
  44. /* pixel operations */
  /* 64-bit packed constants, 8-byte aligned, referenced from the inline
     assembly in this file as memory operands. */
  /* 0x01 in each of the eight bytes */
  45. static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
  /* 1 in each of the four 16-bit words */
  46. static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
  /* 2 in each of the four 16-bit words */
  47. static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
  /* 20 (0x14) in each word; multiplier used by the qpel low-pass macros */
  48. static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
  /* 3 in each word; multiplier used by the qpel low-pass macros */
  49. static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
  /* 16 in each word (not referenced in the visible part of the file) */
  50. static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
  /* 15 in each word (not referenced in the visible part of the file) */
  51. static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
  /* Emit a ".balign 8" directive as a stand-alone asm statement. */
  52. #define JUMPALIGN() __asm __volatile (".balign 8"::)
  /* Clear MMX register regd (pxor of a register with itself gives 0). */
  53. #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
  /* Load 0x0001 into every word of regd: pcmpeqd reg,reg sets all bits, the
     logical right shift by 15 leaves only bit 0 of each word. */
  54. #define MOVQ_WONE(regd) \
  55. __asm __volatile ( \
  56. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  57. "psrlw $15, %%" #regd ::)
  /* Load 0xFE into every byte of regd: all-ones, then each byte added to
     itself (0xFF + 0xFF wraps to 0xFE). */
  58. #define MOVQ_BFE(regd) \
  59. __asm __volatile ( \
  60. "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
  61. "paddb %%" #regd ", %%" #regd " \n\t" ::)
  /* Non-PIC build: fetch the byte-one / word-two constants from memory. */
  62. #ifndef PIC
  63. #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
  64. #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
  65. #else
  66. // for shared library it's better to use this way for accessing constants
  67. // pcmpeqd -> -1
  /* PIC build: synthesize 0x01 per byte in registers only -- words of 1
     (as in MOVQ_WONE) packed down to bytes with packuswb. */
  68. #define MOVQ_BONE(regd) \
  69. __asm __volatile ( \
  70. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  71. "psrlw $15, %%" #regd " \n\t" \
  72. "packuswb %%" #regd ", %%" #regd " \n\t" ::)
  /* PIC build: synthesize 0x0002 per word -- words of 1 shifted left by one. */
  73. #define MOVQ_WTWO(regd) \
  74. __asm __volatile ( \
  75. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  76. "psrlw $15, %%" #regd " \n\t" \
  77. "psllw $1, %%" #regd " \n\t"::)
  78. #endif
  /* Byte-wise average helpers for plain MMX, which has no pavgb instruction.
     Each macro expands to an asm string fragment, not to a statement. */
  79. // using regr as temporary and for the output result
  80. // first argument is unmodified and second is trashed
  81. // regfe is supposed to contain 0xfefefefefefefefe
  /* regr = (a & b) + (((a ^ b) & 0xfe) >> 1) per byte, i.e. the average
     rounded down.  The 0xfe mask keeps the 64-bit shift from leaking a bit
     into the neighbouring byte. */
  82. #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
  83. "movq " #rega ", " #regr " \n\t"\
  84. "pand " #regb ", " #regr " \n\t"\
  85. "pxor " #rega ", " #regb " \n\t"\
  86. "pand " #regfe "," #regb " \n\t"\
  87. "psrlq $1, " #regb " \n\t"\
  88. "paddb " #regb ", " #regr " \n\t"
  /* regr = (a | b) - (((a ^ b) & 0xfe) >> 1) per byte, i.e. the average
     rounded up. */
  89. #define PAVGB_MMX(rega, regb, regr, regfe) \
  90. "movq " #rega ", " #regr " \n\t"\
  91. "por " #regb ", " #regr " \n\t"\
  92. "pxor " #rega ", " #regb " \n\t"\
  93. "pand " #regfe "," #regb " \n\t"\
  94. "psrlq $1, " #regb " \n\t"\
  95. "psubb " #regb ", " #regr " \n\t"
  96. // mm6 is supposed to contain 0xfefefefefefefefe
  /* Two round-down averages interleaved: regr = avg(rega, regb) and
     regp = avg(regc, regd).  regb and regd are trashed; the mask is taken
     from mm6 instead of a macro argument. */
  97. #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
  98. "movq " #rega ", " #regr " \n\t"\
  99. "movq " #regc ", " #regp " \n\t"\
  100. "pand " #regb ", " #regr " \n\t"\
  101. "pand " #regd ", " #regp " \n\t"\
  102. "pxor " #rega ", " #regb " \n\t"\
  103. "pxor " #regc ", " #regd " \n\t"\
  104. "pand %%mm6, " #regb " \n\t"\
  105. "pand %%mm6, " #regd " \n\t"\
  106. "psrlq $1, " #regb " \n\t"\
  107. "psrlq $1, " #regd " \n\t"\
  108. "paddb " #regb ", " #regr " \n\t"\
  109. "paddb " #regd ", " #regp " \n\t"
  /* Two round-up averages interleaved; same register contract as
     PAVGBP_MMX_NO_RND. */
  110. #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
  111. "movq " #rega ", " #regr " \n\t"\
  112. "movq " #regc ", " #regp " \n\t"\
  113. "por " #regb ", " #regr " \n\t"\
  114. "por " #regd ", " #regp " \n\t"\
  115. "pxor " #rega ", " #regb " \n\t"\
  116. "pxor " #regc ", " #regd " \n\t"\
  117. "pand %%mm6, " #regb " \n\t"\
  118. "pand %%mm6, " #regd " \n\t"\
  119. "psrlq $1, " #regd " \n\t"\
  120. "psrlq $1, " #regb " \n\t"\
  121. "psubb " #regb ", " #regr " \n\t"\
  122. "psubb " #regd ", " #regp " \n\t"
  123. /***********************************/
  124. /* MMX no rounding */
  /* The two template headers below are included several times, each time with
     a different set of DEF / SET_RND / PAVGB / PAVGBP definitions, and the
     macros are undefined again afterwards.  NOTE(review): the headers are not
     part of this view; presumably they define the put/avg pixel routines
     named through DEF() -- confirm there. */
  /* Pass 1: names of the form x_no_rnd_y_mmx, round-down averaging,
     SET_RND loads words of 1. */
  125. #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
  126. #define SET_RND MOVQ_WONE
  127. #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
  128. #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
  129. #include "dsputil_mmx_rnd.h"
  130. #undef DEF
  131. #undef SET_RND
  132. #undef PAVGBP
  133. #undef PAVGB
  134. /***********************************/
  135. /* MMX rounding */
  /* Pass 2: names of the form x_y_mmx, round-up averaging, SET_RND loads
     words of 2. */
  136. #define DEF(x, y) x ## _ ## y ##_mmx
  137. #define SET_RND MOVQ_WTWO
  138. #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
  139. #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
  140. #include "dsputil_mmx_rnd.h"
  141. #undef DEF
  142. #undef SET_RND
  143. #undef PAVGBP
  144. #undef PAVGB
  145. /***********************************/
  146. /* 3Dnow specific */
  /* Pass 3: names with a _3dnow suffix; here PAVGB is the instruction
     mnemonic as a string, not a macro taking registers. */
  147. #define DEF(x) x ## _3dnow
  148. /* for Athlons PAVGUSB is preferred */
  149. #define PAVGB "pavgusb"
  150. #include "dsputil_mmx_avg.h"
  151. #undef DEF
  152. #undef PAVGB
  153. /***********************************/
  154. /* MMX2 specific */
  /* Pass 4: names with a _mmx2 suffix, using the pavgb mnemonic. */
  155. #define DEF(x) x ## _mmx2
  156. /* Introduced only in MMX2 set */
  157. #define PAVGB "pavgb"
  158. #include "dsputil_mmx_avg.h"
  159. #undef DEF
  160. #undef PAVGB
  161. /***********************************/
  162. /* standard MMX */
  163. static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
  164. {
  165. asm volatile(
  166. "movl $-128, %%eax \n\t"
  167. "pxor %%mm7, %%mm7 \n\t"
  168. ".balign 16 \n\t"
  169. "1: \n\t"
  170. "movq (%0), %%mm0 \n\t"
  171. "movq (%0, %2), %%mm2 \n\t"
  172. "movq %%mm0, %%mm1 \n\t"
  173. "movq %%mm2, %%mm3 \n\t"
  174. "punpcklbw %%mm7, %%mm0 \n\t"
  175. "punpckhbw %%mm7, %%mm1 \n\t"
  176. "punpcklbw %%mm7, %%mm2 \n\t"
  177. "punpckhbw %%mm7, %%mm3 \n\t"
  178. "movq %%mm0, (%1, %%eax)\n\t"
  179. "movq %%mm1, 8(%1, %%eax)\n\t"
  180. "movq %%mm2, 16(%1, %%eax)\n\t"
  181. "movq %%mm3, 24(%1, %%eax)\n\t"
  182. "addl %3, %0 \n\t"
  183. "addl $32, %%eax \n\t"
  184. "js 1b \n\t"
  185. : "+r" (pixels)
  186. : "r" (block+64), "r" (line_size), "r" (line_size*2)
  187. : "%eax"
  188. );
  189. }
  190. static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
  191. {
  192. asm volatile(
  193. "pxor %%mm7, %%mm7 \n\t"
  194. "movl $-128, %%eax \n\t"
  195. ".balign 16 \n\t"
  196. "1: \n\t"
  197. "movq (%0), %%mm0 \n\t"
  198. "movq (%1), %%mm2 \n\t"
  199. "movq %%mm0, %%mm1 \n\t"
  200. "movq %%mm2, %%mm3 \n\t"
  201. "punpcklbw %%mm7, %%mm0 \n\t"
  202. "punpckhbw %%mm7, %%mm1 \n\t"
  203. "punpcklbw %%mm7, %%mm2 \n\t"
  204. "punpckhbw %%mm7, %%mm3 \n\t"
  205. "psubw %%mm2, %%mm0 \n\t"
  206. "psubw %%mm3, %%mm1 \n\t"
  207. "movq %%mm0, (%2, %%eax)\n\t"
  208. "movq %%mm1, 8(%2, %%eax)\n\t"
  209. "addl %3, %0 \n\t"
  210. "addl %3, %1 \n\t"
  211. "addl $16, %%eax \n\t"
  212. "jnz 1b \n\t"
  213. : "+r" (s1), "+r" (s2)
  214. : "r" (block+64), "r" (stride)
  215. : "%eax"
  216. );
  217. }
  218. void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  219. {
  220. const DCTELEM *p;
  221. UINT8 *pix;
  222. /* read the pixels */
  223. p = block;
  224. pix = pixels;
  225. /* unrolled loop */
  226. __asm __volatile(
  227. "movq %3, %%mm0\n\t"
  228. "movq 8%3, %%mm1\n\t"
  229. "movq 16%3, %%mm2\n\t"
  230. "movq 24%3, %%mm3\n\t"
  231. "movq 32%3, %%mm4\n\t"
  232. "movq 40%3, %%mm5\n\t"
  233. "movq 48%3, %%mm6\n\t"
  234. "movq 56%3, %%mm7\n\t"
  235. "packuswb %%mm1, %%mm0\n\t"
  236. "packuswb %%mm3, %%mm2\n\t"
  237. "packuswb %%mm5, %%mm4\n\t"
  238. "packuswb %%mm7, %%mm6\n\t"
  239. "movq %%mm0, (%0)\n\t"
  240. "movq %%mm2, (%0, %1)\n\t"
  241. "movq %%mm4, (%0, %1, 2)\n\t"
  242. "movq %%mm6, (%0, %2)\n\t"
  243. ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
  244. :"memory");
  245. pix += line_size*4;
  246. p += 32;
  247. // if here would be an exact copy of the code above
  248. // compiler would generate some very strange code
  249. // thus using "r"
  250. __asm __volatile(
  251. "movq (%3), %%mm0\n\t"
  252. "movq 8(%3), %%mm1\n\t"
  253. "movq 16(%3), %%mm2\n\t"
  254. "movq 24(%3), %%mm3\n\t"
  255. "movq 32(%3), %%mm4\n\t"
  256. "movq 40(%3), %%mm5\n\t"
  257. "movq 48(%3), %%mm6\n\t"
  258. "movq 56(%3), %%mm7\n\t"
  259. "packuswb %%mm1, %%mm0\n\t"
  260. "packuswb %%mm3, %%mm2\n\t"
  261. "packuswb %%mm5, %%mm4\n\t"
  262. "packuswb %%mm7, %%mm6\n\t"
  263. "movq %%mm0, (%0)\n\t"
  264. "movq %%mm2, (%0, %1)\n\t"
  265. "movq %%mm4, (%0, %1, 2)\n\t"
  266. "movq %%mm6, (%0, %2)\n\t"
  267. ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
  268. :"memory");
  269. }
  270. void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  271. {
  272. const DCTELEM *p;
  273. UINT8 *pix;
  274. int i;
  275. /* read the pixels */
  276. p = block;
  277. pix = pixels;
  278. MOVQ_ZERO(mm7);
  279. i = 4;
  280. do {
  281. __asm __volatile(
  282. "movq (%2), %%mm0\n\t"
  283. "movq 8(%2), %%mm1\n\t"
  284. "movq 16(%2), %%mm2\n\t"
  285. "movq 24(%2), %%mm3\n\t"
  286. "movq %0, %%mm4\n\t"
  287. "movq %1, %%mm6\n\t"
  288. "movq %%mm4, %%mm5\n\t"
  289. "punpcklbw %%mm7, %%mm4\n\t"
  290. "punpckhbw %%mm7, %%mm5\n\t"
  291. "paddsw %%mm4, %%mm0\n\t"
  292. "paddsw %%mm5, %%mm1\n\t"
  293. "movq %%mm6, %%mm5\n\t"
  294. "punpcklbw %%mm7, %%mm6\n\t"
  295. "punpckhbw %%mm7, %%mm5\n\t"
  296. "paddsw %%mm6, %%mm2\n\t"
  297. "paddsw %%mm5, %%mm3\n\t"
  298. "packuswb %%mm1, %%mm0\n\t"
  299. "packuswb %%mm3, %%mm2\n\t"
  300. "movq %%mm0, %0\n\t"
  301. "movq %%mm2, %1\n\t"
  302. :"+m"(*pix), "+m"(*(pix+line_size))
  303. :"r"(p)
  304. :"memory");
  305. pix += line_size*2;
  306. p += 16;
  307. } while (--i);
  308. }
  /*
   * Copy h rows of 8 bytes from pixels to block; both use line_size as the
   * row pitch in bytes.
   *
   * Each loop pass copies four rows (two pairs) and then subtracts 4 from h,
   * looping until the result is zero -- so h has to be a positive multiple
   * of 4, otherwise the loop does not stop at the intended row.
   *
   * NOTE(review): "addl" on pointers and the eax scratch register make this
   * 32-bit x86 only.  No emms is issued here.
   */
  309. static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  310. {
  311. __asm __volatile(
  312. "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size */
  313. ".balign 8 \n\t"
  314. "1: \n\t"
  315. "movq (%1), %%mm0 \n\t" /* rows n and n+1 */
  316. "movq (%1, %3), %%mm1 \n\t"
  317. "movq %%mm0, (%2) \n\t"
  318. "movq %%mm1, (%2, %3) \n\t"
  319. "addl %%eax, %1 \n\t"
  320. "addl %%eax, %2 \n\t"
  321. "movq (%1), %%mm0 \n\t" /* rows n+2 and n+3 */
  322. "movq (%1, %3), %%mm1 \n\t"
  323. "movq %%mm0, (%2) \n\t"
  324. "movq %%mm1, (%2, %3) \n\t"
  325. "addl %%eax, %1 \n\t"
  326. "addl %%eax, %2 \n\t"
  327. "subl $4, %0 \n\t" /* four rows done */
  328. "jnz 1b \n\t"
  329. : "+g"(h), "+r" (pixels), "+r" (block)
  330. : "r"(line_size)
  331. : "%eax", "memory"
  332. );
  333. }
  /*
   * Copy h rows of 16 bytes from pixels to block; both use line_size as the
   * row pitch in bytes.
   *
   * Each loop pass copies four rows (two pairs, two quadwords per row) and
   * then subtracts 4 from h, looping until the result is zero -- so h has to
   * be a positive multiple of 4.
   *
   * NOTE(review): "addl" on pointers and the eax scratch register make this
   * 32-bit x86 only.  No emms is issued here.
   */
  334. static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  335. {
  336. __asm __volatile(
  337. "lea (%3, %3), %%eax \n\t" /* eax = 2 * line_size */
  338. ".balign 8 \n\t"
  339. "1: \n\t"
  340. "movq (%1), %%mm0 \n\t" /* rows n and n+1, left and right halves */
  341. "movq 8(%1), %%mm4 \n\t"
  342. "movq (%1, %3), %%mm1 \n\t"
  343. "movq 8(%1, %3), %%mm5 \n\t"
  344. "movq %%mm0, (%2) \n\t"
  345. "movq %%mm4, 8(%2) \n\t"
  346. "movq %%mm1, (%2, %3) \n\t"
  347. "movq %%mm5, 8(%2, %3) \n\t"
  348. "addl %%eax, %1 \n\t"
  349. "addl %%eax, %2 \n\t"
  350. "movq (%1), %%mm0 \n\t" /* rows n+2 and n+3 */
  351. "movq 8(%1), %%mm4 \n\t"
  352. "movq (%1, %3), %%mm1 \n\t"
  353. "movq 8(%1, %3), %%mm5 \n\t"
  354. "movq %%mm0, (%2) \n\t"
  355. "movq %%mm4, 8(%2) \n\t"
  356. "movq %%mm1, (%2, %3) \n\t"
  357. "movq %%mm5, 8(%2, %3) \n\t"
  358. "addl %%eax, %1 \n\t"
  359. "addl %%eax, %2 \n\t"
  360. "subl $4, %0 \n\t" /* four rows done */
  361. "jnz 1b \n\t"
  362. : "+g"(h), "+r" (pixels), "+r" (block)
  363. : "r"(line_size)
  364. : "%eax", "memory"
  365. );
  366. }
  367. static void clear_blocks_mmx(DCTELEM *blocks)
  368. {
  369. __asm __volatile(
  370. "pxor %%mm7, %%mm7 \n\t"
  371. "movl $-128*6, %%eax \n\t"
  372. "1: \n\t"
  373. "movq %%mm7, (%0, %%eax) \n\t"
  374. "movq %%mm7, 8(%0, %%eax) \n\t"
  375. "movq %%mm7, 16(%0, %%eax) \n\t"
  376. "movq %%mm7, 24(%0, %%eax) \n\t"
  377. "addl $32, %%eax \n\t"
  378. " js 1b \n\t"
  379. : : "r" (((int)blocks)+128*6)
  380. : "%eax"
  381. );
  382. }
  383. static int pix_sum16_mmx(UINT8 * pix, int line_size){
  384. const int h=16;
  385. int sum;
  386. int index= -line_size*h;
  387. __asm __volatile(
  388. "pxor %%mm7, %%mm7 \n\t"
  389. "pxor %%mm6, %%mm6 \n\t"
  390. "1: \n\t"
  391. "movq (%2, %1), %%mm0 \n\t"
  392. "movq (%2, %1), %%mm1 \n\t"
  393. "movq 8(%2, %1), %%mm2 \n\t"
  394. "movq 8(%2, %1), %%mm3 \n\t"
  395. "punpcklbw %%mm7, %%mm0 \n\t"
  396. "punpckhbw %%mm7, %%mm1 \n\t"
  397. "punpcklbw %%mm7, %%mm2 \n\t"
  398. "punpckhbw %%mm7, %%mm3 \n\t"
  399. "paddw %%mm0, %%mm1 \n\t"
  400. "paddw %%mm2, %%mm3 \n\t"
  401. "paddw %%mm1, %%mm3 \n\t"
  402. "paddw %%mm3, %%mm6 \n\t"
  403. "addl %3, %1 \n\t"
  404. " js 1b \n\t"
  405. "movq %%mm6, %%mm5 \n\t"
  406. "psrlq $32, %%mm6 \n\t"
  407. "paddw %%mm5, %%mm6 \n\t"
  408. "movq %%mm6, %%mm5 \n\t"
  409. "psrlq $16, %%mm6 \n\t"
  410. "paddw %%mm5, %%mm6 \n\t"
  411. "movd %%mm6, %0 \n\t"
  412. "andl $0xFFFF, %0 \n\t"
  413. : "=&r" (sum), "+r" (index)
  414. : "r" (pix - index), "r" (line_size)
  415. );
  416. return sum;
  417. }
  /**
   * dst[i] += src[i] for i in [0, w), with byte wrap-around.
   *
   * Full groups of 16 bytes are handled by the MMX loop (paddb), the
   * remaining 0..15 bytes by the scalar loop after it.
   *
   * @param dst  buffer that is read and updated in place, at least w bytes
   * @param src  addend buffer, at least w bytes
   * @param w    number of bytes; values <= 0 leave dst untouched
   *
   * Fixes against the earlier form:
   *  - the asm loop body used to run at least once regardless of w, reading
   *    and writing 16 bytes even when w < 16; for w < 15 the bound w - 15 was
   *    negative and the unsigned "jb" comparison kept the loop running far
   *    past both buffers.  The loop is now entered only when w >= 16.
   *  - the index is pointer-width and the add/cmp take their operand size
   *    from the registers, so the addressing also assembles on x86-64.
   *  - the asm stores to dst, which is not an output operand, so it carries
   *    a "memory" clobber.
   *
   * No emms is issued here.
   */
  static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
      long i = 0;

      if (w >= 16) {
          __asm__ volatile(
              "1: \n\t"
              "movq (%1, %0), %%mm0 \n\t"
              "movq (%2, %0), %%mm1 \n\t"
              "paddb %%mm0, %%mm1 \n\t"
              "movq %%mm1, (%2, %0) \n\t"
              "movq 8(%1, %0), %%mm0 \n\t"
              "movq 8(%2, %0), %%mm1 \n\t"
              "paddb %%mm0, %%mm1 \n\t"
              "movq %%mm1, 8(%2, %0) \n\t"
              "add $16, %0 \n\t"
              "cmp %3, %0 \n\t"               /* another full group if i < w-15 */
              " jl 1b \n\t"
              : "+r" (i)
              : "r"(src), "r"(dst), "r"((long)w - 15)
              : "memory"
          );
      }
      for (; i < w; i++)
          dst[i] += src[i];
  }
  /**
   * Sum of the squared byte values of a 16x16 block.
   *
   * Per row the 16 bytes are zero-extended to words; pmaddwd of a register
   * with itself squares each word and adds neighbouring pairs into 32-bit
   * lanes, which are accumulated in mm7 and folded together at the end.  The
   * largest possible total is 256 * 255 * 255, well inside 32 bits.
   *
   * @param pix        first byte of the top row
   * @param line_size  distance in bytes between rows
   * @return sum over the 256 bytes of value * value
   *
   * Changes against the earlier form:
   *  - the row step is added with an operand-size-neutral "add" on a
   *    pointer-width value instead of a 32-bit "addl" on the pointer, so the
   *    code also assembles on x86-64;
   *  - the asm reads pixel memory that is not an operand, so it carries a
   *    "memory" clobber;
   *  - the lane comments on the two pmaddwd lines were swapped.
   *
   * No emms is issued here.
   */
  static int pix_norm1_mmx(uint8_t *pix, int line_size) {
      int tmp;

      __asm__ volatile (
          "movl $16,%%ecx\n"              /* row counter */
          "pxor %%mm0,%%mm0\n"            /* mm0 = 0 for the unpacks */
          "pxor %%mm7,%%mm7\n"            /* mm7 = two 32-bit running sums */
          "1:\n"
          "movq (%0),%%mm2\n"             /* mm2 = pix[0-7] */
          "movq 8(%0),%%mm3\n"            /* mm3 = pix[8-15] */
          "movq %%mm2,%%mm1\n"            /* mm1 = mm2 = pix[0-7] */
          "punpckhbw %%mm0,%%mm1\n"       /* mm1 = pix[4-7] as words */
          "punpcklbw %%mm0,%%mm2\n"       /* mm2 = pix[0-3] as words */
          "movq %%mm3,%%mm4\n"            /* mm4 = mm3 = pix[8-15] */
          "punpckhbw %%mm0,%%mm3\n"       /* mm3 = pix[12-15] as words */
          "punpcklbw %%mm0,%%mm4\n"       /* mm4 = pix[8-11] as words */
          "pmaddwd %%mm1,%%mm1\n"         /* mm1 = (pix4^2+pix5^2, pix6^2+pix7^2) */
          "pmaddwd %%mm2,%%mm2\n"         /* mm2 = (pix0^2+pix1^2, pix2^2+pix3^2) */
          "pmaddwd %%mm3,%%mm3\n"
          "pmaddwd %%mm4,%%mm4\n"
          "paddd %%mm1,%%mm2\n"           /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                                    pix2^2+pix3^2+pix6^2+pix7^2) */
          "paddd %%mm3,%%mm4\n"
          "paddd %%mm2,%%mm7\n"
          "add %2, %0\n"                  /* next row */
          "paddd %%mm4,%%mm7\n"
          "decl %%ecx\n"
          "jnz 1b\n"
          "movq %%mm7,%%mm1\n"
          "psrlq $32, %%mm7\n"            /* shift hi dword to lo */
          "paddd %%mm7,%%mm1\n"
          "movd %%mm1,%1\n"
          : "+r" (pix), "=r"(tmp)
          : "r" ((long)line_size)
          : "ecx", "memory" );
      return tmp;
  }
  473. static int sse16_mmx(void *v, UINT8 * pix1, UINT8 * pix2, int line_size) {
  474. int tmp;
  475. asm volatile (
  476. "movl $16,%%ecx\n"
  477. "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
  478. "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
  479. "1:\n"
  480. "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
  481. "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
  482. "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
  483. "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
  484. /* todo: mm1-mm2, mm3-mm4 */
  485. /* algo: substract mm1 from mm2 with saturation and vice versa */
  486. /* OR the results to get absolute difference */
  487. "movq %%mm1,%%mm5\n"
  488. "movq %%mm3,%%mm6\n"
  489. "psubusb %%mm2,%%mm1\n"
  490. "psubusb %%mm4,%%mm3\n"
  491. "psubusb %%mm5,%%mm2\n"
  492. "psubusb %%mm6,%%mm4\n"
  493. "por %%mm1,%%mm2\n"
  494. "por %%mm3,%%mm4\n"
  495. /* now convert to 16-bit vectors so we can square them */
  496. "movq %%mm2,%%mm1\n"
  497. "movq %%mm4,%%mm3\n"
  498. "punpckhbw %%mm0,%%mm2\n"
  499. "punpckhbw %%mm0,%%mm4\n"
  500. "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
  501. "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
  502. "pmaddwd %%mm2,%%mm2\n"
  503. "pmaddwd %%mm4,%%mm4\n"
  504. "pmaddwd %%mm1,%%mm1\n"
  505. "pmaddwd %%mm3,%%mm3\n"
  506. "addl %3,%0\n"
  507. "addl %3,%1\n"
  508. "paddd %%mm2,%%mm1\n"
  509. "paddd %%mm4,%%mm3\n"
  510. "paddd %%mm1,%%mm7\n"
  511. "paddd %%mm3,%%mm7\n"
  512. "decl %%ecx\n"
  513. "jnz 1b\n"
  514. "movq %%mm7,%%mm1\n"
  515. "psrlq $32, %%mm7\n" /* shift hi dword to lo */
  516. "paddd %%mm7,%%mm1\n"
  517. "movd %%mm1,%2\n"
  518. : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx");
  519. return tmp;
  520. }
  /**
   * dst[i] = src1[i] - src2[i] for i in [0, w), with byte wrap-around.
   *
   * Full groups of 16 bytes are handled by the MMX loop (psubb), the
   * remaining 0..15 bytes by the scalar loop after it.
   *
   * @param dst   output buffer, at least w bytes
   * @param src1  minuend buffer, at least w bytes
   * @param src2  subtrahend buffer, at least w bytes
   * @param w     number of bytes; values <= 0 leave dst untouched
   *
   * Fixes against the earlier form:
   *  - the asm loop body used to run at least once regardless of w, touching
   *    16 bytes of every buffer even when w < 16; for w < 15 the bound w - 15
   *    was negative and the unsigned "jb" comparison kept the loop running
   *    far past the buffers.  The loop is now entered only when w >= 16.
   *  - the index is pointer-width and the add/cmp take their operand size
   *    from the registers, so the addressing also assembles on x86-64.
   *  - the asm stores to dst, which is not an output operand, so it carries
   *    a "memory" clobber.
   *
   * No emms is issued here.
   */
  static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
      long i = 0;

      if (w >= 16) {
          __asm__ volatile(
              "1: \n\t"
              "movq (%2, %0), %%mm0 \n\t"
              "movq (%1, %0), %%mm1 \n\t"
              "psubb %%mm0, %%mm1 \n\t"
              "movq %%mm1, (%3, %0) \n\t"
              "movq 8(%2, %0), %%mm0 \n\t"
              "movq 8(%1, %0), %%mm1 \n\t"
              "psubb %%mm0, %%mm1 \n\t"
              "movq %%mm1, 8(%3, %0) \n\t"
              "add $16, %0 \n\t"
              "cmp %4, %0 \n\t"               /* another full group if i < w-15 */
              " jl 1b \n\t"
              : "+r" (i)
              : "r"(src1), "r"(src2), "r"(dst), "r"((long)w - 15)
              : "memory"
          );
      }
      for (; i < w; i++)
          dst[i] = src1[i] - src2[i];
  }
  /*
   * Asm string fragments used by the Hadamard-transform code.  Every macro
   * expands to adjacent string literals meant to be pasted into an asm
   * template; none of them is a statement.
   *
   * No definition ends in a line continuation: a trailing backslash on the
   * last line splices whatever line follows into the macro body.
   */

  /* In-place butterfly: (a, b) -> (a + b, b - a). */
  #define LBUTTERFLY(a,b)\
      "paddw " #b ", " #a " \n\t"\
      "paddw " #b ", " #b " \n\t"\
      "psubw " #a ", " #b " \n\t"

  /* Three butterfly stages across mm0..mm7 (pair distance 1, 2, then 4). */
  #define HADAMARD48\
      LBUTTERFLY(%%mm0, %%mm1)\
      LBUTTERFLY(%%mm2, %%mm3)\
      LBUTTERFLY(%%mm4, %%mm5)\
      LBUTTERFLY(%%mm6, %%mm7)\
      \
      LBUTTERFLY(%%mm0, %%mm2)\
      LBUTTERFLY(%%mm1, %%mm3)\
      LBUTTERFLY(%%mm4, %%mm6)\
      LBUTTERFLY(%%mm5, %%mm7)\
      \
      LBUTTERFLY(%%mm0, %%mm4)\
      LBUTTERFLY(%%mm1, %%mm5)\
      LBUTTERFLY(%%mm2, %%mm6)\
      LBUTTERFLY(%%mm3, %%mm7)

  /* a = |a| per signed word; z is scratch (ends up as the sign mask of a). */
  #define MMABS(a,z)\
      "pxor " #z ", " #z " \n\t"\
      "pcmpgtw " #a ", " #z " \n\t"\
      "pxor " #z ", " #a " \n\t"\
      "psubw " #z ", " #a " \n\t"

  /* MMABS followed by sum += |a| with unsigned word saturation. */
  #define MMABS_SUM(a,z, sum)\
      "pxor " #z ", " #z " \n\t"\
      "pcmpgtw " #a ", " #z " \n\t"\
      "pxor " #z ", " #a " \n\t"\
      "psubw " #z ", " #a " \n\t"\
      "paddusw " #a ", " #sum " \n\t"

  /* Interleave a and b with punpckl<n>/punpckh<n>: the low halves end up in
     a, the high halves in t. */
  #define SBUTTERFLY(a,b,t,n)\
      "movq " #a ", " #t " \n\t" /* abcd */\
      "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
      "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */

  /* 4x4 word transpose of rows a, b, c, d using t as an extra register.
     The transposed rows come out in a, d, t, c (in that order); b is
     scratch. */
  #define TRANSPOSE4(a,b,c,d,t)\
      SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
      SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
      SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
      SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */

  /* Load four quadwords from byte offsets o, o+16, o+32, o+48 of operand %1. */
  #define LOAD4(o, a, b, c, d)\
      "movq "#o"(%1), " #a " \n\t"\
      "movq "#o"+16(%1), " #b " \n\t"\
      "movq "#o"+32(%1), " #c " \n\t"\
      "movq "#o"+48(%1), " #d " \n\t"

  /* Store four quadwords to byte offsets o, o+16, o+32, o+48 of operand %1. */
  #define STORE4(o, a, b, c, d)\
      "movq "#a", "#o"(%1) \n\t"\
      "movq "#b", "#o"+16(%1) \n\t"\
      "movq "#c", "#o"+32(%1) \n\t"\
      "movq "#d", "#o"+48(%1) \n\t"
  591. static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
  592. uint64_t temp[16] __align8;
  593. int sum=0;
  594. diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
  595. asm volatile(
  596. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  597. LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
  598. HADAMARD48
  599. "movq %%mm7, 112(%1) \n\t"
  600. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  601. STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
  602. "movq 112(%1), %%mm7 \n\t"
  603. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  604. STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
  605. LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
  606. LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  607. HADAMARD48
  608. "movq %%mm7, 120(%1) \n\t"
  609. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  610. STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
  611. "movq 120(%1), %%mm7 \n\t"
  612. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  613. "movq %%mm7, %%mm5 \n\t"//FIXME remove
  614. "movq %%mm6, %%mm7 \n\t"
  615. "movq %%mm0, %%mm6 \n\t"
  616. // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
  617. LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
  618. // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  619. HADAMARD48
  620. "movq %%mm7, 64(%1) \n\t"
  621. MMABS(%%mm0, %%mm7)
  622. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  623. MMABS_SUM(%%mm2, %%mm7, %%mm0)
  624. MMABS_SUM(%%mm3, %%mm7, %%mm0)
  625. MMABS_SUM(%%mm4, %%mm7, %%mm0)
  626. MMABS_SUM(%%mm5, %%mm7, %%mm0)
  627. MMABS_SUM(%%mm6, %%mm7, %%mm0)
  628. "movq 64(%1), %%mm1 \n\t"
  629. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  630. "movq %%mm0, 64(%1) \n\t"
  631. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  632. LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
  633. HADAMARD48
  634. "movq %%mm7, (%1) \n\t"
  635. MMABS(%%mm0, %%mm7)
  636. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  637. MMABS_SUM(%%mm2, %%mm7, %%mm0)
  638. MMABS_SUM(%%mm3, %%mm7, %%mm0)
  639. MMABS_SUM(%%mm4, %%mm7, %%mm0)
  640. MMABS_SUM(%%mm5, %%mm7, %%mm0)
  641. MMABS_SUM(%%mm6, %%mm7, %%mm0)
  642. "movq (%1), %%mm1 \n\t"
  643. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  644. "movq 64(%1), %%mm1 \n\t"
  645. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  646. "movq %%mm0, %%mm1 \n\t"
  647. "psrlq $32, %%mm0 \n\t"
  648. "paddusw %%mm1, %%mm0 \n\t"
  649. "movq %%mm0, %%mm1 \n\t"
  650. "psrlq $16, %%mm0 \n\t"
  651. "paddusw %%mm1, %%mm0 \n\t"
  652. "movd %%mm0, %0 \n\t"
  653. : "=r" (sum)
  654. : "r"(temp)
  655. );
  656. return sum&0xFFFF;
  657. }
  /* NOTE(review): WARPER88_1616 is defined outside this file view; judging by
     its arguments it presumably generates hadamard8_diff16_mmx as a 16x16
     wrapper around the 8x8 routine -- confirm against the macro. */
  658. WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
  /* The "no_rnd" put names forward to the ordinary put routines with the same
     four arguments. */
  659. #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
  660. #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
  /*
   * Asm string fragment for one output row of a vertical low-pass filter.
   * With x1 = m3 + m4, x2 = in2 + m5, x3 = in1 + m6 and x4 = in0 + in7 it
   * computes (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5 per word, packs the result
   * to unsigned bytes with saturation and hands it to OP together with out.
   *
   * Register use visible in the body: m3 is overwritten (first with x1, then
   * with in7); %%mm4, %%mm5 and %%mm6 are hard-coded scratch registers and
   * %%mm7 is passed to OP.  The pw_20 and pw_3 parameters are not referenced;
   * the multipliers are read through MANGLE(ff_pw_20) / MANGLE(ff_pw_3).
   */
  661. #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
  662. "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
  663. "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
  664. "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
  665. "movq "#in7", " #m3 " \n\t" /* d */\
  666. "movq "#in0", %%mm5 \n\t" /* D */\
  667. "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
  668. "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
  669. "movq "#in1", %%mm5 \n\t" /* C */\
  670. "movq "#in2", %%mm6 \n\t" /* B */\
  671. "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
  672. "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
  673. "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
  674. "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
  675. "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
  676. "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rnd */\
  677. "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
  678. "psraw $5, %%mm5 \n\t"\
  679. "packuswb %%mm5, %%mm5 \n\t"\
  680. OP(%%mm5, out, %%mm7, d)
  681. #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
  682. void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  683. uint64_t temp;\
  684. \
  685. asm volatile(\
  686. "pxor %%mm7, %%mm7 \n\t"\
  687. "1: \n\t"\
  688. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  689. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  690. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  691. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  692. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  693. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  694. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  695. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  696. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  697. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  698. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  699. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  700. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  701. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  702. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  703. "paddw %%mm3, %%mm5 \n\t" /* b */\
  704. "paddw %%mm2, %%mm6 \n\t" /* c */\
  705. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  706. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  707. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  708. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  709. "paddw %%mm4, %%mm0 \n\t" /* a */\
  710. "paddw %%mm1, %%mm5 \n\t" /* d */\
  711. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  712. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  713. "paddw %6, %%mm6 \n\t"\
  714. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  715. "psraw $5, %%mm0 \n\t"\
  716. "movq %%mm0, %5 \n\t"\
  717. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  718. \
  719. "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
  720. "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
  721. "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
  722. "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
  723. "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
  724. "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
  725. "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
  726. "paddw %%mm0, %%mm2 \n\t" /* b */\
  727. "paddw %%mm5, %%mm3 \n\t" /* c */\
  728. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  729. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  730. "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
  731. "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
  732. "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
  733. "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
  734. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  735. "paddw %%mm2, %%mm1 \n\t" /* a */\
  736. "paddw %%mm6, %%mm4 \n\t" /* d */\
  737. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  738. "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
  739. "paddw %6, %%mm1 \n\t"\
  740. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
  741. "psraw $5, %%mm3 \n\t"\
  742. "movq %5, %%mm1 \n\t"\
  743. "packuswb %%mm3, %%mm1 \n\t"\
  744. OP_MMX2(%%mm1, (%1),%%mm4, q)\
  745. /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
  746. \
  747. "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
  748. "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
  749. "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
  750. "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
  751. "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
  752. "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
  753. "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
  754. "paddw %%mm1, %%mm5 \n\t" /* b */\
  755. "paddw %%mm4, %%mm0 \n\t" /* c */\
  756. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  757. "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
  758. "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
  759. "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
  760. "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
  761. "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
  762. "paddw %%mm3, %%mm2 \n\t" /* d */\
  763. "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
  764. "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
  765. "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
  766. "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
  767. "paddw %%mm2, %%mm6 \n\t" /* a */\
  768. "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
  769. "paddw %6, %%mm0 \n\t"\
  770. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  771. "psraw $5, %%mm0 \n\t"\
  772. /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
  773. \
  774. "paddw %%mm5, %%mm3 \n\t" /* a */\
  775. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
  776. "paddw %%mm4, %%mm6 \n\t" /* b */\
  777. "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
  778. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
  779. "paddw %%mm1, %%mm4 \n\t" /* c */\
  780. "paddw %%mm2, %%mm5 \n\t" /* d */\
  781. "paddw %%mm6, %%mm6 \n\t" /* 2b */\
  782. "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
  783. "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
  784. "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
  785. "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
  786. "paddw %6, %%mm4 \n\t"\
  787. "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
  788. "psraw $5, %%mm4 \n\t"\
  789. "packuswb %%mm4, %%mm0 \n\t"\
  790. OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
  791. \
  792. "addl %3, %0 \n\t"\
  793. "addl %4, %1 \n\t"\
  794. "decl %2 \n\t"\
  795. " jnz 1b \n\t"\
  796. : "+a"(src), "+c"(dst), "+m"(h)\
  797. : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  798. : "memory"\
  799. );\
  800. }\
  801. \
  802. static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  803. int i;\
  804. int16_t temp[16];\
  805. /* quick HACK, XXX FIXME MUST be optimized */\
  806. for(i=0; i<h; i++)\
  807. {\
  808. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  809. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  810. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  811. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  812. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  813. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
  814. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
  815. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
  816. temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
  817. temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
  818. temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
  819. temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
  820. temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
  821. temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
  822. temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
  823. temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
  824. asm volatile(\
  825. "movq (%0), %%mm0 \n\t"\
  826. "movq 8(%0), %%mm1 \n\t"\
  827. "paddw %2, %%mm0 \n\t"\
  828. "paddw %2, %%mm1 \n\t"\
  829. "psraw $5, %%mm0 \n\t"\
  830. "psraw $5, %%mm1 \n\t"\
  831. "packuswb %%mm1, %%mm0 \n\t"\
  832. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  833. "movq 16(%0), %%mm0 \n\t"\
  834. "movq 24(%0), %%mm1 \n\t"\
  835. "paddw %2, %%mm0 \n\t"\
  836. "paddw %2, %%mm1 \n\t"\
  837. "psraw $5, %%mm0 \n\t"\
  838. "psraw $5, %%mm1 \n\t"\
  839. "packuswb %%mm1, %%mm0 \n\t"\
  840. OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
  841. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  842. : "memory"\
  843. );\
  844. dst+=dstStride;\
  845. src+=srcStride;\
  846. }\
  847. }\
  848. \
  849. void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  850. uint64_t temp;\
  851. \
  852. asm volatile(\
  853. "pxor %%mm7, %%mm7 \n\t"\
  854. "1: \n\t"\
  855. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  856. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  857. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  858. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  859. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  860. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  861. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  862. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  863. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  864. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  865. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  866. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  867. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  868. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  869. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  870. "paddw %%mm3, %%mm5 \n\t" /* b */\
  871. "paddw %%mm2, %%mm6 \n\t" /* c */\
  872. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  873. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  874. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  875. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  876. "paddw %%mm4, %%mm0 \n\t" /* a */\
  877. "paddw %%mm1, %%mm5 \n\t" /* d */\
  878. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  879. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  880. "paddw %6, %%mm6 \n\t"\
  881. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  882. "psraw $5, %%mm0 \n\t"\
  883. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  884. \
  885. "movd 5(%0), %%mm5 \n\t" /* FGHI */\
  886. "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
  887. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
  888. "paddw %%mm5, %%mm1 \n\t" /* a */\
  889. "paddw %%mm6, %%mm2 \n\t" /* b */\
  890. "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
  891. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
  892. "paddw %%mm6, %%mm3 \n\t" /* c */\
  893. "paddw %%mm5, %%mm4 \n\t" /* d */\
  894. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  895. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  896. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  897. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  898. "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
  899. "paddw %6, %%mm1 \n\t"\
  900. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
  901. "psraw $5, %%mm3 \n\t"\
  902. "packuswb %%mm3, %%mm0 \n\t"\
  903. OP_MMX2(%%mm0, (%1), %%mm4, q)\
  904. \
  905. "addl %3, %0 \n\t"\
  906. "addl %4, %1 \n\t"\
  907. "decl %2 \n\t"\
  908. " jnz 1b \n\t"\
  909. : "+a"(src), "+c"(dst), "+m"(h)\
  910. : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  911. : "memory"\
  912. );\
  913. }\
  914. \
  915. static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  916. int i;\
  917. int16_t temp[8];\
  918. /* quick HACK, XXX FIXME MUST be optimized */\
  919. for(i=0; i<h; i++)\
  920. {\
  921. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  922. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  923. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  924. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  925. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  926. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
  927. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
  928. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
  929. asm volatile(\
  930. "movq (%0), %%mm0 \n\t"\
  931. "movq 8(%0), %%mm1 \n\t"\
  932. "paddw %2, %%mm0 \n\t"\
  933. "paddw %2, %%mm1 \n\t"\
  934. "psraw $5, %%mm0 \n\t"\
  935. "psraw $5, %%mm1 \n\t"\
  936. "packuswb %%mm1, %%mm0 \n\t"\
  937. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  938. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  939. :"memory"\
  940. );\
  941. dst+=dstStride;\
  942. src+=srcStride;\
  943. }\
  944. }
  /*
   * QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)
   *
   * Expands to one family of MPEG-4 quarter-pel helper functions whose names
   * are assembled by token pasting:
   *   OPNAME  - prefix of every generated function name.
   *   ROUNDER - object bound as memory operand %5 of the second asm statement
   *             of the vertical filters and handed on to QPEL_V_LOW.
   *   RND     - infix pasted into the "put ## RND ## ..." helpers that fill the
   *             intermediate buffers of the mcXY wrappers.
   *   OP      - macro name handed to QPEL_V_LOW as its last argument.
   *   MMX     - suffix of the generated names; it also selects which
   *             *_h_lowpass_<MMX> functions the wrappers call.
   *
   * Generated functions: mpeg4_qpel16_v_lowpass, mpeg4_qpel8_v_lowpass and the
   * sixteen qpel8_mcXY plus sixteen qpel16_mcXY wrappers (X, Y in 0..3).
   *
   * The inline asm applies 32-bit "addl" to pointer operands, i.e. it is
   * written for 32-bit x86. QPEL_V_LOW is defined outside this macro.
   */
  945. #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
  946. /* 16-pixel-wide vertical lowpass. Two asm passes: the first widens 17 source
          rows of 16 bytes to 16-bit words in temp[], the second runs QPEL_V_LOW
          over that buffer, once per destination row of each 4-column strip. */\
  947. static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  948. uint64_t temp[17*4];\
  949. uint64_t *temp_ptr= temp;\
  950. int count= 17;\
  951. /* Pass 1 (17 iterations, one per source row): mm7 is zeroed, each 16-byte
          row is unpacked against it into four quadwords of words, and these are
          stored 17*8 bytes apart, so temp[] ends up as four planes of 17 rows,
          each plane covering 4 adjacent pixel columns. */\
  952. /*FIXME unroll */\
  953. asm volatile(\
  954. "pxor %%mm7, %%mm7 \n\t"\
  955. "1: \n\t"\
  956. "movq (%0), %%mm0 \n\t"\
  957. "movq (%0), %%mm1 \n\t"\
  958. "movq 8(%0), %%mm2 \n\t"\
  959. "movq 8(%0), %%mm3 \n\t"\
  960. "punpcklbw %%mm7, %%mm0 \n\t"\
  961. "punpckhbw %%mm7, %%mm1 \n\t"\
  962. "punpcklbw %%mm7, %%mm2 \n\t"\
  963. "punpckhbw %%mm7, %%mm3 \n\t"\
  964. "movq %%mm0, (%1) \n\t"\
  965. "movq %%mm1, 17*8(%1) \n\t"\
  966. "movq %%mm2, 2*17*8(%1) \n\t"\
  967. "movq %%mm3, 3*17*8(%1) \n\t"\
  968. "addl $8, %1 \n\t"\
  969. "addl %3, %0 \n\t"\
  970. "decl %2 \n\t"\
  971. " jnz 1b \n\t"\
  972. : "+r" (src), "+r" (temp_ptr), "+r"(count)\
  973. : "r" (srcStride)\
  974. : "memory"\
  975. );\
  976. \
  977. temp_ptr= temp;\
  978. count=4;\
  979. /* Pass 2 (4 iterations, one per plane). Operands: %0 walks temp[], %1 is
          dst, %2 the loop counter, %3 = dstStride, %4 = 2*dstStride, %5 = ROUNDER,
          %6 = 4-14*dstStride. dst is stepped by 2*dstStride seven times per
          iteration; the final "addl %6" undoes those 14 rows and moves 4 bytes to
          the right, while "addl $136" (17*8) moves %0 to the next plane.
          NOTE(review): the pxor of mm7 is commented out here, so QPEL_V_LOW
          presumably relies on mm7 still being zero from pass 1 - confirm. */\
  980. /*FIXME reorder for speed */\
  981. asm volatile(\
  982. /*"pxor %%mm7, %%mm7 \n\t"*/\
  983. "1: \n\t"\
  984. "movq (%0), %%mm0 \n\t"\
  985. "movq 8(%0), %%mm1 \n\t"\
  986. "movq 16(%0), %%mm2 \n\t"\
  987. "movq 24(%0), %%mm3 \n\t"\
  988. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
  989. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
  990. "addl %4, %1 \n\t"\
  991. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
  992. \
  993. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
  994. "addl %4, %1 \n\t"\
  995. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
  996. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
  997. "addl %4, %1 \n\t"\
  998. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
  999. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
  1000. "addl %4, %1 \n\t"\
  1001. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
  1002. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
  1003. "addl %4, %1 \n\t"\
  1004. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
  1005. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
  1006. "addl %4, %1 \n\t"\
  1007. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
  1008. /* From here on the last input offset no longer advances: it stays at 128
           (the 17th stored row) and then steps back to 120 and 112 instead of
           addressing beyond the plane. */\
  1009. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
  1010. "addl %4, %1 \n\t" \
  1011. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
  1012. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
  1013. \
  1014. "addl $136, %0 \n\t"\
  1015. "addl %6, %1 \n\t"\
  1016. "decl %2 \n\t"\
  1017. " jnz 1b \n\t"\
  1018. \
  1019. : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
  1020. : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
  1021. :"memory"\
  1022. );\
  1023. }\
  1024. /* 8-pixel-wide vertical lowpass: same two-pass scheme with 9 source rows of
           8 bytes, two planes stored 9*8 bytes apart and 8 QPEL_V_LOW invocations
           per plane. Unlike the 16-wide version it is not declared static.
           NOTE(review): pass 1 stores only 2*9 quadwords although temp[] is
           declared with 9*4 - confirm whether the extra space is needed. */\
  1025. void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  1026. uint64_t temp[9*4];\
  1027. uint64_t *temp_ptr= temp;\
  1028. int count= 9;\
  1029. \
  1030. /*FIXME unroll */\
  1031. asm volatile(\
  1032. "pxor %%mm7, %%mm7 \n\t"\
  1033. "1: \n\t"\
  1034. "movq (%0), %%mm0 \n\t"\
  1035. "movq (%0), %%mm1 \n\t"\
  1036. "punpcklbw %%mm7, %%mm0 \n\t"\
  1037. "punpckhbw %%mm7, %%mm1 \n\t"\
  1038. "movq %%mm0, (%1) \n\t"\
  1039. "movq %%mm1, 9*8(%1) \n\t"\
  1040. "addl $8, %1 \n\t"\
  1041. "addl %3, %0 \n\t"\
  1042. "decl %2 \n\t"\
  1043. " jnz 1b \n\t"\
  1044. : "+r" (src), "+r" (temp_ptr), "+r"(count)\
  1045. : "r" (srcStride)\
  1046. : "memory"\
  1047. );\
  1048. /* Pass 2 (2 iterations, one per plane): operands as in the 16-wide version
           except %6 = 4-6*dstStride, matching the three 2*dstStride steps of dst
           per iteration; %0 advances by 72 = 9*8 bytes per plane. */\
  1049. temp_ptr= temp;\
  1050. count=2;\
  1051. \
  1052. /*FIXME reorder for speed */\
  1053. asm volatile(\
  1054. /*"pxor %%mm7, %%mm7 \n\t"*/\
  1055. "1: \n\t"\
  1056. "movq (%0), %%mm0 \n\t"\
  1057. "movq 8(%0), %%mm1 \n\t"\
  1058. "movq 16(%0), %%mm2 \n\t"\
  1059. "movq 24(%0), %%mm3 \n\t"\
  1060. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
  1061. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
  1062. "addl %4, %1 \n\t"\
  1063. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
  1064. \
  1065. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
  1066. "addl %4, %1 \n\t"\
  1067. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
  1068. \
  1069. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
  1070. "addl %4, %1 \n\t"\
  1071. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
  1072. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
  1073. \
  1074. "addl $72, %0 \n\t"\
  1075. "addl %6, %1 \n\t"\
  1076. "decl %2 \n\t"\
  1077. " jnz 1b \n\t"\
  1078. \
  1079. : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
  1080. : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
  1081. : "memory"\
  1082. );\
  1083. }\
  1084. /* qpel8_mcXY wrappers for 8x8 blocks, one per X, Y in 0..3. X selects the
           horizontal and Y the vertical handling: mc00 is a plain OPNAME pixels8
           call; mc20, mc02 and mc22 write the h, v, or h-then-v lowpass result
           straight to dst; every other variant first builds intermediate blocks
           with the put ## RND helpers and combines two blocks through
           pixels8_l2_mmx (defined elsewhere; presumably it averages its two
           sources - confirm). Variants with X = 3 use src+1, and variants with
           Y = 3 use src+stride or halfH+8 (one row down), as the second input.
           Where both halfH and halfHV exist, halfHV occupies the first 64 bytes
           of half[] (8 rows of 8) and halfH the following 72 bytes (9 rows of 8,
           the number of rows the vertical filter reads). */\
  1085. static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
  1086. OPNAME ## pixels8_mmx(dst, src, stride, 8);\
  1087. }\
  1088. \
  1089. static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1090. uint64_t temp[8];\
  1091. uint8_t * const half= (uint8_t*)temp;\
  1092. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
  1093. OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
  1094. }\
  1095. \
  1096. static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1097. OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
  1098. }\
  1099. \
  1100. static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1101. uint64_t temp[8];\
  1102. uint8_t * const half= (uint8_t*)temp;\
  1103. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
  1104. OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
  1105. }\
  1106. \
  1107. static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1108. uint64_t temp[8];\
  1109. uint8_t * const half= (uint8_t*)temp;\
  1110. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
  1111. OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
  1112. }\
  1113. \
  1114. static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1115. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
  1116. }\
  1117. \
  1118. static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1119. uint64_t temp[8];\
  1120. uint8_t * const half= (uint8_t*)temp;\
  1121. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
  1122. OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
  1123. }\
  1124. static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1125. uint64_t half[8 + 9];\
  1126. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1127. uint8_t * const halfHV= ((uint8_t*)half);\
  1128. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1129. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1130. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1131. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1132. }\
  1133. static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1134. uint64_t half[8 + 9];\
  1135. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1136. uint8_t * const halfHV= ((uint8_t*)half);\
  1137. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1138. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1139. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1140. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1141. }\
  1142. static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1143. uint64_t half[8 + 9];\
  1144. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1145. uint8_t * const halfHV= ((uint8_t*)half);\
  1146. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1147. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1148. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1149. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1150. }\
  1151. static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1152. uint64_t half[8 + 9];\
  1153. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1154. uint8_t * const halfHV= ((uint8_t*)half);\
  1155. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1156. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1157. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1158. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1159. }\
  1160. static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1161. uint64_t half[8 + 9];\
  1162. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1163. uint8_t * const halfHV= ((uint8_t*)half);\
  1164. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1165. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1166. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1167. }\
  1168. static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1169. uint64_t half[8 + 9];\
  1170. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1171. uint8_t * const halfHV= ((uint8_t*)half);\
  1172. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1173. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1174. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1175. }\
  1176. static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1177. uint64_t half[8 + 9];\
  1178. uint8_t * const halfH= ((uint8_t*)half);\
  1179. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1180. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1181. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1182. }\
  1183. static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1184. uint64_t half[8 + 9];\
  1185. uint8_t * const halfH= ((uint8_t*)half);\
  1186. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1187. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1188. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1189. }\
  1190. static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1191. uint64_t half[9];\
  1192. uint8_t * const halfH= ((uint8_t*)half);\
  1193. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1194. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1195. }\
  1196. static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
  1197. OPNAME ## pixels16_mmx(dst, src, stride, 16);\
  1198. }\
  1199. /* The qpel16_mcXY wrappers (mc00 just above and the rest below) follow the
           same call pattern as the qpel8 ones for 16x16 blocks: the intermediate
           row stride is 16, the horizontal pass produces 17 rows, one-row shifts
           are halfH+16, and where both buffers exist halfHV takes the first 256
           bytes of half[] and halfH the following 272 bytes (17 rows of 16). */\
  1200. static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1201. uint64_t temp[32];\
  1202. uint8_t * const half= (uint8_t*)temp;\
  1203. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
  1204. OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
  1205. }\
  1206. \
  1207. static void OPNAME ## qpel16_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1208. OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
  1209. }\
  1210. \
  1211. static void OPNAME ## qpel16_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1212. uint64_t temp[32];\
  1213. uint8_t * const half= (uint8_t*)temp;\
  1214. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
  1215. OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
  1216. }\
  1217. \
  1218. static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1219. uint64_t temp[32];\
  1220. uint8_t * const half= (uint8_t*)temp;\
  1221. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
  1222. OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
  1223. }\
  1224. \
  1225. static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1226. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
  1227. }\
  1228. \
  1229. static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1230. uint64_t temp[32];\
  1231. uint8_t * const half= (uint8_t*)temp;\
  1232. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
  1233. OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
  1234. }\
  1235. static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1236. uint64_t half[16*2 + 17*2];\
  1237. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1238. uint8_t * const halfHV= ((uint8_t*)half);\
  1239. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1240. put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
  1241. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1242. OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
  1243. }\
  1244. static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1245. uint64_t half[16*2 + 17*2];\
  1246. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1247. uint8_t * const halfHV= ((uint8_t*)half);\
  1248. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1249. put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
  1250. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1251. OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
  1252. }\
  1253. static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1254. uint64_t half[16*2 + 17*2];\
  1255. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1256. uint8_t * const halfHV= ((uint8_t*)half);\
  1257. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1258. put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
  1259. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1260. OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
  1261. }\
  1262. static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1263. uint64_t half[16*2 + 17*2];\
  1264. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1265. uint8_t * const halfHV= ((uint8_t*)half);\
  1266. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1267. put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
  1268. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1269. OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
  1270. }\
  1271. static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1272. uint64_t half[16*2 + 17*2];\
  1273. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1274. uint8_t * const halfHV= ((uint8_t*)half);\
  1275. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1276. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1277. OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
  1278. }\
  1279. static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1280. uint64_t half[16*2 + 17*2];\
  1281. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1282. uint8_t * const halfHV= ((uint8_t*)half);\
  1283. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1284. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1285. OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
  1286. }\
  1287. static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1288. uint64_t half[17*2];\
  1289. uint8_t * const halfH= ((uint8_t*)half);\
  1290. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1291. put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
  1292. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  1293. }\
  1294. static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1295. uint64_t half[17*2];\
  1296. uint8_t * const halfH= ((uint8_t*)half);\
  1297. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1298. put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
  1299. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  1300. }\
  1301. static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
  1302. uint64_t half[17*2];\
  1303. uint8_t * const halfH= ((uint8_t*)half);\
  1304. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1305. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  1306. }
  /* PUT_OP(a, b, temp, size): asm text for a plain store, "mov<size> a, b"
   * (AT&T operand order, so a is the source and b the destination). The temp
   * argument is accepted so the signature matches the AVG_* variants, but it
   * is not used. */
  1307. #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
  /* AVG_3DNOW_OP(a, b, temp, size): asm text that loads b into temp, combines
   * temp into a with the 3DNow! pavgusb instruction (packed unsigned byte
   * average), then stores a back to b. temp is overwritten. */
  1308. #define AVG_3DNOW_OP(a,b,temp, size) \
  1309. "mov" #size " " #b ", " #temp " \n\t"\
  1310. "pavgusb " #temp ", " #a " \n\t"\
  1311. "mov" #size " " #a ", " #b " \n\t"
  /* AVG_MMX2_OP(a, b, temp, size): same load / average / store sequence as
   * AVG_3DNOW_OP, but using the pavgb instruction for the packed byte
   * average. temp is overwritten. */
  1312. #define AVG_MMX2_OP(a,b,temp, size) \
  1313. "mov" #size " " #b ", " #temp " \n\t"\
  1314. "pavgb " #temp ", " #a " \n\t"\
  1315. "mov" #size " " #a ", " #b " \n\t"
  /*
   * Instantiate the quarter-pel function families.
   *
   * QPEL_OP is expanded for the 3dnow and mmx2 suffixes, each with the put_,
   * avg_ and put_no_rnd_ name prefixes. The avg_ expansions pass the AVG_*_OP
   * macro matching their suffix, the others pass PUT_OP; the put_no_rnd_
   * expansions pass ff_pw_15 instead of ff_pw_16 as ROUNDER (presumably packed
   * words of 15 and 16 - confirm at their definition).
   *
   * QPEL_BASE is defined earlier in this file; its body uses OP_MMX2 and
   * OP_3DNOW, which presumably correspond to the last two arguments given
   * here - confirm against its parameter list.
   */
  1316. QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
  1317. QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
  1318. QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
  1319. QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
  1320. QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
  1321. QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
  1322. QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
  1323. QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
  1324. QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
  /* Disabled by #if 0 and therefore never compiled: an empty function that
   * does nothing but return. */
  1325. #if 0
  1326. static void just_return() { return; }
  1327. #endif
  /* SET_QPEL_FUNC(postfix1, postfix2): assigns three members of the object
   * pointed to by a variable named c, which must be in scope at the point of
   * use: c->put_<postfix1>, c->put_no_rnd_<postfix1> and c->avg_<postfix1>
   * receive put_<postfix2>, put_no_rnd_<postfix2> and avg_<postfix2>.
   * The expansion is three complete statements, each with its own semicolon,
   * and the macro is invoked without a trailing semicolon, so it must not be
   * used as the unbraced body of an if/else or loop. */
  1328. #define SET_QPEL_FUNC(postfix1, postfix2) \
  1329. c->put_ ## postfix1 = put_ ## postfix2;\
  1330. c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
  1331. c->avg_ ## postfix1 = avg_ ## postfix2;
  1332. void dsputil_init_mmx(DSPContext* c, unsigned mask)
  1333. {
  1334. mm_flags = mm_support();
  1335. #if 0
  1336. fprintf(stderr, "libavcodec: CPU flags:");
  1337. if (mm_flags & MM_MMX)
  1338. fprintf(stderr, " mmx");
  1339. if (mm_flags & MM_MMXEXT)
  1340. fprintf(stderr, " mmxext");
  1341. if (mm_flags & MM_3DNOW)
  1342. fprintf(stderr, " 3dnow");
  1343. if (mm_flags & MM_SSE)
  1344. fprintf(stderr, " sse");
  1345. if (mm_flags & MM_SSE2)
  1346. fprintf(stderr, " sse2");
  1347. fprintf(stderr, "\n");
  1348. #endif
  1349. if (mm_flags & MM_MMX) {
  1350. c->get_pixels = get_pixels_mmx;
  1351. c->diff_pixels = diff_pixels_mmx;
  1352. c->put_pixels_clamped = put_pixels_clamped_mmx;
  1353. c->add_pixels_clamped = add_pixels_clamped_mmx;
  1354. c->clear_blocks = clear_blocks_mmx;
  1355. c->pix_sum = pix_sum16_mmx;
  1356. c->pix_abs16x16 = pix_abs16x16_mmx;
  1357. c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
  1358. c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
  1359. c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
  1360. c->pix_abs8x8 = pix_abs8x8_mmx;
  1361. c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
  1362. c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
  1363. c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
  1364. c->put_pixels_tab[0][0] = put_pixels16_mmx;
  1365. c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
  1366. c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
  1367. c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
  1368. c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
  1369. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
  1370. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
  1371. c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
  1372. c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
  1373. c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
  1374. c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
  1375. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
  1376. c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
  1377. c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
  1378. c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
  1379. c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
  1380. c->put_pixels_tab[1][0] = put_pixels8_mmx;
  1381. c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
  1382. c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
  1383. c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
  1384. c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
  1385. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
  1386. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
  1387. c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
  1388. c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
  1389. c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
  1390. c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
  1391. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
  1392. c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
  1393. c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
  1394. c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
  1395. c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
  1396. c->add_bytes= add_bytes_mmx;
  1397. c->diff_bytes= diff_bytes_mmx;
  1398. c->hadamard8_diff[0]= hadamard8_diff16_mmx;
  1399. c->hadamard8_diff[1]= hadamard8_diff_mmx;
  1400. c->sad[0]= sad16x16_mmx;
  1401. c->sad[1]= sad8x8_mmx;
  1402. c->pix_norm1 = pix_norm1_mmx;
  1403. c->sse[0] = sse16_mmx;
  1404. if (mm_flags & MM_MMXEXT) {
  1405. c->pix_abs16x16 = pix_abs16x16_mmx2;
  1406. c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
  1407. c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
  1408. c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;
  1409. c->pix_abs8x8 = pix_abs8x8_mmx2;
  1410. c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
  1411. c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
  1412. c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;
  1413. c->sad[0]= sad16x16_mmx2;
  1414. c->sad[1]= sad8x8_mmx2;
  1415. c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
  1416. c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
  1417. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
  1418. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
  1419. c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
  1420. c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
  1421. c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
  1422. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
  1423. c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
  1424. c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
  1425. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
  1426. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
  1427. c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
  1428. c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
  1429. c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
  1430. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
  1431. #if 1
  1432. SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
  1433. SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
  1434. SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
  1435. SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
  1436. SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
  1437. SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
  1438. SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
  1439. SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
  1440. SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
  1441. SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
  1442. SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
  1443. SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
  1444. SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
  1445. SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
  1446. SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
  1447. SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
  1448. SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
  1449. SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
  1450. SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
  1451. SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
  1452. SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
  1453. SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
  1454. SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
  1455. SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
  1456. SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
  1457. SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
  1458. SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
  1459. SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
  1460. SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
  1461. SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
  1462. SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
  1463. SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
  1464. #endif
  1465. } else if (mm_flags & MM_3DNOW) {
  1466. c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
  1467. c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
  1468. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
  1469. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
  1470. c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
  1471. c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
  1472. c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
  1473. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
  1474. c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
  1475. c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
  1476. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
  1477. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
  1478. c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
  1479. c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
  1480. c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
  1481. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
  1482. SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
  1483. SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
  1484. SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
  1485. SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
  1486. SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
  1487. SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
  1488. SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
  1489. SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
  1490. SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
  1491. SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
  1492. SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
  1493. SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
  1494. SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
  1495. SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
  1496. SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
  1497. SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
  1498. SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
  1499. SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
  1500. SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
  1501. SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
  1502. SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
  1503. SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
  1504. SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
  1505. SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
  1506. SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
  1507. SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
  1508. SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
  1509. SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
  1510. SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
  1511. SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
  1512. SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
  1513. SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
  1514. }
  1515. }
  1516. #if 0
  1517. // for speed testing
  1518. get_pixels = just_return;
  1519. put_pixels_clamped = just_return;
  1520. add_pixels_clamped = just_return;
  1521. pix_abs16x16 = just_return;
  1522. pix_abs16x16_x2 = just_return;
  1523. pix_abs16x16_y2 = just_return;
  1524. pix_abs16x16_xy2 = just_return;
  1525. put_pixels_tab[0] = just_return;
  1526. put_pixels_tab[1] = just_return;
  1527. put_pixels_tab[2] = just_return;
  1528. put_pixels_tab[3] = just_return;
  1529. put_no_rnd_pixels_tab[0] = just_return;
  1530. put_no_rnd_pixels_tab[1] = just_return;
  1531. put_no_rnd_pixels_tab[2] = just_return;
  1532. put_no_rnd_pixels_tab[3] = just_return;
  1533. avg_pixels_tab[0] = just_return;
  1534. avg_pixels_tab[1] = just_return;
  1535. avg_pixels_tab[2] = just_return;
  1536. avg_pixels_tab[3] = just_return;
  1537. avg_no_rnd_pixels_tab[0] = just_return;
  1538. avg_no_rnd_pixels_tab[1] = just_return;
  1539. avg_no_rnd_pixels_tab[2] = just_return;
  1540. avg_no_rnd_pixels_tab[3] = just_return;
  1541. //av_fdct = just_return;
  1542. //ff_idct = just_return;
  1543. #endif
  1544. }
  1545. /* remove any non bit exact operation (testing purpose). NOTE that
  1546. this function should be kept as small as possible because it is
  1547. always difficult to test automatically non bit exact cases. */
  1548. void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
  1549. {
  1550. if (mm_flags & MM_MMX) {
  1551. /* MMX2 & 3DNOW */
  1552. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
  1553. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
  1554. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
  1555. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
  1556. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
  1557. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
  1558. if (mm_flags & MM_MMXEXT) {
  1559. c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
  1560. c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
  1561. c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
  1562. c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
  1563. c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
  1564. c->pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
  1565. }
  1566. }
  1567. }