  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. #include "../dsputil.h"
  22. #include "../simple_idct.h"
  23. int mm_flags; /* multimedia extension flags */
  24. /* pixel operations */
  25. static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
  26. static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
  27. static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
  28. static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
  29. static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
  30. static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
  31. static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
  32. #define JUMPALIGN() __asm __volatile (".balign 8"::)
  33. #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
  34. #define MOVQ_WONE(regd) \
  35. __asm __volatile ( \
  36. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  37. "psrlw $15, %%" #regd ::)
  38. #define MOVQ_BFE(regd) \
  39. __asm __volatile ( \
  40. "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
  41. "paddb %%" #regd ", %%" #regd " \n\t" ::)
  42. #ifndef PIC
  43. #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
  44. #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
  45. #else
  46. // for a shared library it is better to access the constants this way
  47. // pcmpeqd -> -1
  48. #define MOVQ_BONE(regd) \
  49. __asm __volatile ( \
  50. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  51. "psrlw $15, %%" #regd " \n\t" \
  52. "packuswb %%" #regd ", %%" #regd " \n\t" ::)
  53. #define MOVQ_WTWO(regd) \
  54. __asm __volatile ( \
  55. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  56. "psrlw $15, %%" #regd " \n\t" \
  57. "psllw $1, %%" #regd " \n\t"::)
  58. #endif
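// Illustrative note (not in the original source): in the PIC case the
// constants are synthesized in registers instead of being loaded from memory,
// which avoids position-dependent references to the tables above.
// pcmpeqd sets every bit of the register, psrlw $15 leaves 0x0001 in each
// 16-bit word, packuswb narrows that to 0x01 in each byte (mm_bone), and
// psllw $1 doubles the words to 0x0002 (mm_wtwo).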
  59. // using regr as a temporary and for the output result
  60. // the first argument is unmodified and the second is trashed
  61. // regfe is supposed to contain 0xfefefefefefefefe
  62. #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
  63. "movq " #rega ", " #regr " \n\t"\
  64. "pand " #regb ", " #regr " \n\t"\
  65. "pxor " #rega ", " #regb " \n\t"\
  66. "pand " #regfe "," #regb " \n\t"\
  67. "psrlq $1, " #regb " \n\t"\
  68. "paddb " #regb ", " #regr " \n\t"
  69. #define PAVGB_MMX(rega, regb, regr, regfe) \
  70. "movq " #rega ", " #regr " \n\t"\
  71. "por " #regb ", " #regr " \n\t"\
  72. "pxor " #rega ", " #regb " \n\t"\
  73. "pand " #regfe "," #regb " \n\t"\
  74. "psrlq $1, " #regb " \n\t"\
  75. "psubb " #regb ", " #regr " \n\t"
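// Illustrative scalar sketch (not part of the original file): the two macros
// above rely on the identities
//     floor((a + b) / 2) = (a & b) + ((a ^ b) >> 1)
//     ceil((a + b) / 2)  = (a | b) - ((a ^ b) >> 1)
// applied per byte; the 0xfe mask in regfe clears the low bit of every byte so
// that the 64-bit psrlq cannot leak a bit from one byte into the byte below.
#if 0 /* reference only, never compiled */
static inline uint8_t avg_no_rnd_ref(uint8_t a, uint8_t b)
{
    return (a & b) + (((a ^ b) & 0xfe) >> 1);   /* PAVGB_MMX_NO_RND */
}
static inline uint8_t avg_rnd_ref(uint8_t a, uint8_t b)
{
    return (a | b) - (((a ^ b) & 0xfe) >> 1);   /* PAVGB_MMX */
}
#endif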
  76. // mm6 is supposed to contain 0xfefefefefefefefe
  77. #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
  78. "movq " #rega ", " #regr " \n\t"\
  79. "movq " #regc ", " #regp " \n\t"\
  80. "pand " #regb ", " #regr " \n\t"\
  81. "pand " #regd ", " #regp " \n\t"\
  82. "pxor " #rega ", " #regb " \n\t"\
  83. "pxor " #regc ", " #regd " \n\t"\
  84. "pand %%mm6, " #regb " \n\t"\
  85. "pand %%mm6, " #regd " \n\t"\
  86. "psrlq $1, " #regb " \n\t"\
  87. "psrlq $1, " #regd " \n\t"\
  88. "paddb " #regb ", " #regr " \n\t"\
  89. "paddb " #regd ", " #regp " \n\t"
  90. #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
  91. "movq " #rega ", " #regr " \n\t"\
  92. "movq " #regc ", " #regp " \n\t"\
  93. "por " #regb ", " #regr " \n\t"\
  94. "por " #regd ", " #regp " \n\t"\
  95. "pxor " #rega ", " #regb " \n\t"\
  96. "pxor " #regc ", " #regd " \n\t"\
  97. "pand %%mm6, " #regb " \n\t"\
  98. "pand %%mm6, " #regd " \n\t"\
  99. "psrlq $1, " #regd " \n\t"\
  100. "psrlq $1, " #regb " \n\t"\
  101. "psubb " #regb ", " #regr " \n\t"\
  102. "psubb " #regd ", " #regp " \n\t"
  103. /***********************************/
  104. /* MMX no rounding */
  105. #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
  106. #define SET_RND MOVQ_WONE
  107. #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
  108. #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
  109. #include "dsputil_mmx_rnd.h"
  110. #undef DEF
  111. #undef SET_RND
  112. #undef PAVGBP
  113. #undef PAVGB
  114. /***********************************/
  115. /* MMX rounding */
  116. #define DEF(x, y) x ## _ ## y ##_mmx
  117. #define SET_RND MOVQ_WTWO
  118. #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
  119. #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
  120. #include "dsputil_mmx_rnd.h"
  121. #undef DEF
  122. #undef SET_RND
  123. #undef PAVGBP
  124. #undef PAVGB
  125. /***********************************/
  126. /* 3Dnow specific */
  127. #define DEF(x) x ## _3dnow
  128. /* on Athlons, PAVGUSB is preferred */
  129. #define PAVGB "pavgusb"
  130. #include "dsputil_mmx_avg.h"
  131. #undef DEF
  132. #undef PAVGB
  133. /***********************************/
  134. /* MMX2 specific */
  135. #define DEF(x) x ## _mmx2
  136. /* PAVGB was introduced only with the MMX2 instruction set */
  137. #define PAVGB "pavgb"
  138. #include "dsputil_mmx_avg.h"
  139. #undef DEF
  140. #undef PAVGB
  141. /***********************************/
  142. /* standard MMX */
  143. static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
  144. {
  145. asm volatile(
  146. "movl $-128, %%eax \n\t"
  147. "pxor %%mm7, %%mm7 \n\t"
  148. ".balign 16 \n\t"
  149. "1: \n\t"
  150. "movq (%0), %%mm0 \n\t"
  151. "movq (%0, %2), %%mm2 \n\t"
  152. "movq %%mm0, %%mm1 \n\t"
  153. "movq %%mm2, %%mm3 \n\t"
  154. "punpcklbw %%mm7, %%mm0 \n\t"
  155. "punpckhbw %%mm7, %%mm1 \n\t"
  156. "punpcklbw %%mm7, %%mm2 \n\t"
  157. "punpckhbw %%mm7, %%mm3 \n\t"
  158. "movq %%mm0, (%1, %%eax)\n\t"
  159. "movq %%mm1, 8(%1, %%eax)\n\t"
  160. "movq %%mm2, 16(%1, %%eax)\n\t"
  161. "movq %%mm3, 24(%1, %%eax)\n\t"
  162. "addl %3, %0 \n\t"
  163. "addl $32, %%eax \n\t"
  164. "js 1b \n\t"
  165. : "+r" (pixels)
  166. : "r" (block+64), "r" (line_size), "r" (line_size*2)
  167. : "%eax"
  168. );
  169. }
  170. static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
  171. {
  172. asm volatile(
  173. "pxor %%mm7, %%mm7 \n\t"
  174. "movl $-128, %%eax \n\t"
  175. ".balign 16 \n\t"
  176. "1: \n\t"
  177. "movq (%0), %%mm0 \n\t"
  178. "movq (%1), %%mm2 \n\t"
  179. "movq %%mm0, %%mm1 \n\t"
  180. "movq %%mm2, %%mm3 \n\t"
  181. "punpcklbw %%mm7, %%mm0 \n\t"
  182. "punpckhbw %%mm7, %%mm1 \n\t"
  183. "punpcklbw %%mm7, %%mm2 \n\t"
  184. "punpckhbw %%mm7, %%mm3 \n\t"
  185. "psubw %%mm2, %%mm0 \n\t"
  186. "psubw %%mm3, %%mm1 \n\t"
  187. "movq %%mm0, (%2, %%eax)\n\t"
  188. "movq %%mm1, 8(%2, %%eax)\n\t"
  189. "addl %3, %0 \n\t"
  190. "addl %3, %1 \n\t"
  191. "addl $16, %%eax \n\t"
  192. "jnz 1b \n\t"
  193. : "+r" (s1), "+r" (s2)
  194. : "r" (block+64), "r" (stride)
  195. : "%eax"
  196. );
  197. }
  198. void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
  199. {
  200. const DCTELEM *p;
  201. uint8_t *pix;
  202. /* read the pixels */
  203. p = block;
  204. pix = pixels;
  205. /* unrolled loop */
  206. __asm __volatile(
  207. "movq %3, %%mm0\n\t"
  208. "movq 8%3, %%mm1\n\t"
  209. "movq 16%3, %%mm2\n\t"
  210. "movq 24%3, %%mm3\n\t"
  211. "movq 32%3, %%mm4\n\t"
  212. "movq 40%3, %%mm5\n\t"
  213. "movq 48%3, %%mm6\n\t"
  214. "movq 56%3, %%mm7\n\t"
  215. "packuswb %%mm1, %%mm0\n\t"
  216. "packuswb %%mm3, %%mm2\n\t"
  217. "packuswb %%mm5, %%mm4\n\t"
  218. "packuswb %%mm7, %%mm6\n\t"
  219. "movq %%mm0, (%0)\n\t"
  220. "movq %%mm2, (%0, %1)\n\t"
  221. "movq %%mm4, (%0, %1, 2)\n\t"
  222. "movq %%mm6, (%0, %2)\n\t"
  223. ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
  224. :"memory");
  225. pix += line_size*4;
  226. p += 32;
  227. // if this were an exact copy of the code above, the compiler
  228. // would generate some very strange code
  229. // thus we use an "r" constraint instead
  230. __asm __volatile(
  231. "movq (%3), %%mm0\n\t"
  232. "movq 8(%3), %%mm1\n\t"
  233. "movq 16(%3), %%mm2\n\t"
  234. "movq 24(%3), %%mm3\n\t"
  235. "movq 32(%3), %%mm4\n\t"
  236. "movq 40(%3), %%mm5\n\t"
  237. "movq 48(%3), %%mm6\n\t"
  238. "movq 56(%3), %%mm7\n\t"
  239. "packuswb %%mm1, %%mm0\n\t"
  240. "packuswb %%mm3, %%mm2\n\t"
  241. "packuswb %%mm5, %%mm4\n\t"
  242. "packuswb %%mm7, %%mm6\n\t"
  243. "movq %%mm0, (%0)\n\t"
  244. "movq %%mm2, (%0, %1)\n\t"
  245. "movq %%mm4, (%0, %1, 2)\n\t"
  246. "movq %%mm6, (%0, %2)\n\t"
  247. ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
  248. :"memory");
  249. }
  250. void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
  251. {
  252. const DCTELEM *p;
  253. uint8_t *pix;
  254. int i;
  255. /* read the pixels */
  256. p = block;
  257. pix = pixels;
  258. MOVQ_ZERO(mm7);
  259. i = 4;
  260. do {
  261. __asm __volatile(
  262. "movq (%2), %%mm0\n\t"
  263. "movq 8(%2), %%mm1\n\t"
  264. "movq 16(%2), %%mm2\n\t"
  265. "movq 24(%2), %%mm3\n\t"
  266. "movq %0, %%mm4\n\t"
  267. "movq %1, %%mm6\n\t"
  268. "movq %%mm4, %%mm5\n\t"
  269. "punpcklbw %%mm7, %%mm4\n\t"
  270. "punpckhbw %%mm7, %%mm5\n\t"
  271. "paddsw %%mm4, %%mm0\n\t"
  272. "paddsw %%mm5, %%mm1\n\t"
  273. "movq %%mm6, %%mm5\n\t"
  274. "punpcklbw %%mm7, %%mm6\n\t"
  275. "punpckhbw %%mm7, %%mm5\n\t"
  276. "paddsw %%mm6, %%mm2\n\t"
  277. "paddsw %%mm5, %%mm3\n\t"
  278. "packuswb %%mm1, %%mm0\n\t"
  279. "packuswb %%mm3, %%mm2\n\t"
  280. "movq %%mm0, %0\n\t"
  281. "movq %%mm2, %1\n\t"
  282. :"+m"(*pix), "+m"(*(pix+line_size))
  283. :"r"(p)
  284. :"memory");
  285. pix += line_size*2;
  286. p += 16;
  287. } while (--i);
  288. }
  289. static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  290. {
  291. __asm __volatile(
  292. "lea (%3, %3), %%eax \n\t"
  293. ".balign 8 \n\t"
  294. "1: \n\t"
  295. "movq (%1), %%mm0 \n\t"
  296. "movq (%1, %3), %%mm1 \n\t"
  297. "movq %%mm0, (%2) \n\t"
  298. "movq %%mm1, (%2, %3) \n\t"
  299. "addl %%eax, %1 \n\t"
  300. "addl %%eax, %2 \n\t"
  301. "movq (%1), %%mm0 \n\t"
  302. "movq (%1, %3), %%mm1 \n\t"
  303. "movq %%mm0, (%2) \n\t"
  304. "movq %%mm1, (%2, %3) \n\t"
  305. "addl %%eax, %1 \n\t"
  306. "addl %%eax, %2 \n\t"
  307. "subl $4, %0 \n\t"
  308. "jnz 1b \n\t"
  309. : "+g"(h), "+r" (pixels), "+r" (block)
  310. : "r"(line_size)
  311. : "%eax", "memory"
  312. );
  313. }
  314. static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  315. {
  316. __asm __volatile(
  317. "lea (%3, %3), %%eax \n\t"
  318. ".balign 8 \n\t"
  319. "1: \n\t"
  320. "movq (%1), %%mm0 \n\t"
  321. "movq 8(%1), %%mm4 \n\t"
  322. "movq (%1, %3), %%mm1 \n\t"
  323. "movq 8(%1, %3), %%mm5 \n\t"
  324. "movq %%mm0, (%2) \n\t"
  325. "movq %%mm4, 8(%2) \n\t"
  326. "movq %%mm1, (%2, %3) \n\t"
  327. "movq %%mm5, 8(%2, %3) \n\t"
  328. "addl %%eax, %1 \n\t"
  329. "addl %%eax, %2 \n\t"
  330. "movq (%1), %%mm0 \n\t"
  331. "movq 8(%1), %%mm4 \n\t"
  332. "movq (%1, %3), %%mm1 \n\t"
  333. "movq 8(%1, %3), %%mm5 \n\t"
  334. "movq %%mm0, (%2) \n\t"
  335. "movq %%mm4, 8(%2) \n\t"
  336. "movq %%mm1, (%2, %3) \n\t"
  337. "movq %%mm5, 8(%2, %3) \n\t"
  338. "addl %%eax, %1 \n\t"
  339. "addl %%eax, %2 \n\t"
  340. "subl $4, %0 \n\t"
  341. "jnz 1b \n\t"
  342. : "+g"(h), "+r" (pixels), "+r" (block)
  343. : "r"(line_size)
  344. : "%eax", "memory"
  345. );
  346. }
  347. static void clear_blocks_mmx(DCTELEM *blocks)
  348. {
  349. __asm __volatile(
  350. "pxor %%mm7, %%mm7 \n\t"
  351. "movl $-128*6, %%eax \n\t"
  352. "1: \n\t"
  353. "movq %%mm7, (%0, %%eax) \n\t"
  354. "movq %%mm7, 8(%0, %%eax) \n\t"
  355. "movq %%mm7, 16(%0, %%eax) \n\t"
  356. "movq %%mm7, 24(%0, %%eax) \n\t"
  357. "addl $32, %%eax \n\t"
  358. " js 1b \n\t"
  359. : : "r" (((int)blocks)+128*6)
  360. : "%eax"
  361. );
  362. }
  363. static int pix_sum16_mmx(uint8_t * pix, int line_size){
  364. const int h=16;
  365. int sum;
  366. int index= -line_size*h;
  367. __asm __volatile(
  368. "pxor %%mm7, %%mm7 \n\t"
  369. "pxor %%mm6, %%mm6 \n\t"
  370. "1: \n\t"
  371. "movq (%2, %1), %%mm0 \n\t"
  372. "movq (%2, %1), %%mm1 \n\t"
  373. "movq 8(%2, %1), %%mm2 \n\t"
  374. "movq 8(%2, %1), %%mm3 \n\t"
  375. "punpcklbw %%mm7, %%mm0 \n\t"
  376. "punpckhbw %%mm7, %%mm1 \n\t"
  377. "punpcklbw %%mm7, %%mm2 \n\t"
  378. "punpckhbw %%mm7, %%mm3 \n\t"
  379. "paddw %%mm0, %%mm1 \n\t"
  380. "paddw %%mm2, %%mm3 \n\t"
  381. "paddw %%mm1, %%mm3 \n\t"
  382. "paddw %%mm3, %%mm6 \n\t"
  383. "addl %3, %1 \n\t"
  384. " js 1b \n\t"
  385. "movq %%mm6, %%mm5 \n\t"
  386. "psrlq $32, %%mm6 \n\t"
  387. "paddw %%mm5, %%mm6 \n\t"
  388. "movq %%mm6, %%mm5 \n\t"
  389. "psrlq $16, %%mm6 \n\t"
  390. "paddw %%mm5, %%mm6 \n\t"
  391. "movd %%mm6, %0 \n\t"
  392. "andl $0xFFFF, %0 \n\t"
  393. : "=&r" (sum), "+r" (index)
  394. : "r" (pix - index), "r" (line_size)
  395. );
  396. return sum;
  397. }
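// Illustrative sketch (not in the original): after the loop mm6 holds four
// 16-bit partial sums; the two psrlq/paddw steps fold them into the low word,
// which the final andl extracts.  A scalar equivalent, valid because a 16x16
// block of 8-bit pixels sums to at most 255*256 = 65280:
#if 0 /* reference only, never compiled */
static inline int fold_word_lanes_ref(uint64_t v)
{
    return (int)((( v        & 0xFFFF) + ((v >> 16) & 0xFFFF) +
                  ((v >> 32) & 0xFFFF) + ((v >> 48) & 0xFFFF)) & 0xFFFF);
}
#endif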
  398. static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
  399. int i=0;
  400. asm volatile(
  401. "1: \n\t"
  402. "movq (%1, %0), %%mm0 \n\t"
  403. "movq (%2, %0), %%mm1 \n\t"
  404. "paddb %%mm0, %%mm1 \n\t"
  405. "movq %%mm1, (%2, %0) \n\t"
  406. "movq 8(%1, %0), %%mm0 \n\t"
  407. "movq 8(%2, %0), %%mm1 \n\t"
  408. "paddb %%mm0, %%mm1 \n\t"
  409. "movq %%mm1, 8(%2, %0) \n\t"
  410. "addl $16, %0 \n\t"
  411. "cmpl %3, %0 \n\t"
  412. " jb 1b \n\t"
  413. : "+r" (i)
  414. : "r"(src), "r"(dst), "r"(w-15)
  415. );
  416. for(; i<w; i++)
  417. dst[i+0] += src[i+0];
  418. }
  419. static int pix_norm1_mmx(uint8_t *pix, int line_size) {
  420. int tmp;
  421. asm volatile (
  422. "movl $16,%%ecx\n"
  423. "pxor %%mm0,%%mm0\n"
  424. "pxor %%mm7,%%mm7\n"
  425. "1:\n"
  426. "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
  427. "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
  428. "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
  429. "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
  430. "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
  431. "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
  432. "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
  433. "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
  434. "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
  435. "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */
  436. "pmaddwd %%mm3,%%mm3\n"
  437. "pmaddwd %%mm4,%%mm4\n"
  438. "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
  439. pix2^2+pix3^2+pix6^2+pix7^2) */
  440. "paddd %%mm3,%%mm4\n"
  441. "paddd %%mm2,%%mm7\n"
  442. "addl %2, %0\n"
  443. "paddd %%mm4,%%mm7\n"
  444. "dec %%ecx\n"
  445. "jnz 1b\n"
  446. "movq %%mm7,%%mm1\n"
  447. "psrlq $32, %%mm7\n" /* shift hi dword to lo */
  448. "paddd %%mm7,%%mm1\n"
  449. "movd %%mm1,%1\n"
  450. : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
  451. return tmp;
  452. }
  453. static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) {
  454. int tmp;
  455. asm volatile (
  456. "movl $16,%%ecx\n"
  457. "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
  458. "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
  459. "1:\n"
  460. "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
  461. "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
  462. "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
  463. "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
  464. /* todo: mm1-mm2, mm3-mm4 */
  465. /* algo: subtract mm1 from mm2 with saturation and vice versa */
  466. /* OR the results to get absolute difference */
  467. "movq %%mm1,%%mm5\n"
  468. "movq %%mm3,%%mm6\n"
  469. "psubusb %%mm2,%%mm1\n"
  470. "psubusb %%mm4,%%mm3\n"
  471. "psubusb %%mm5,%%mm2\n"
  472. "psubusb %%mm6,%%mm4\n"
  473. "por %%mm1,%%mm2\n"
  474. "por %%mm3,%%mm4\n"
  475. /* now convert to 16-bit vectors so we can square them */
  476. "movq %%mm2,%%mm1\n"
  477. "movq %%mm4,%%mm3\n"
  478. "punpckhbw %%mm0,%%mm2\n"
  479. "punpckhbw %%mm0,%%mm4\n"
  480. "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
  481. "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
  482. "pmaddwd %%mm2,%%mm2\n"
  483. "pmaddwd %%mm4,%%mm4\n"
  484. "pmaddwd %%mm1,%%mm1\n"
  485. "pmaddwd %%mm3,%%mm3\n"
  486. "addl %3,%0\n"
  487. "addl %3,%1\n"
  488. "paddd %%mm2,%%mm1\n"
  489. "paddd %%mm4,%%mm3\n"
  490. "paddd %%mm1,%%mm7\n"
  491. "paddd %%mm3,%%mm7\n"
  492. "decl %%ecx\n"
  493. "jnz 1b\n"
  494. "movq %%mm7,%%mm1\n"
  495. "psrlq $32, %%mm7\n" /* shift hi dword to lo */
  496. "paddd %%mm7,%%mm1\n"
  497. "movd %%mm1,%2\n"
  498. : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx");
  499. return tmp;
  500. }
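// Illustrative scalar reference for sse16_mmx above (not part of the original
// file): the MMX code forms |pix1 - pix2| per byte by ORing the two saturating
// subtractions, widens the result to 16 bits and accumulates the squares with
// pmaddwd.
#if 0 /* reference only, never compiled */
static int sse16_ref(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0, x, y;
    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
#endif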
  501. static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
  502. int i=0;
  503. asm volatile(
  504. "1: \n\t"
  505. "movq (%2, %0), %%mm0 \n\t"
  506. "movq (%1, %0), %%mm1 \n\t"
  507. "psubb %%mm0, %%mm1 \n\t"
  508. "movq %%mm1, (%3, %0) \n\t"
  509. "movq 8(%2, %0), %%mm0 \n\t"
  510. "movq 8(%1, %0), %%mm1 \n\t"
  511. "psubb %%mm0, %%mm1 \n\t"
  512. "movq %%mm1, 8(%3, %0) \n\t"
  513. "addl $16, %0 \n\t"
  514. "cmpl %4, %0 \n\t"
  515. " jb 1b \n\t"
  516. : "+r" (i)
  517. : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
  518. );
  519. for(; i<w; i++)
  520. dst[i+0] = src1[i+0]-src2[i+0];
  521. }
  522. #define LBUTTERFLY2(a1,b1,a2,b2)\
  523. "paddw " #b1 ", " #a1 " \n\t"\
  524. "paddw " #b2 ", " #a2 " \n\t"\
  525. "paddw " #b1 ", " #b1 " \n\t"\
  526. "paddw " #b2 ", " #b2 " \n\t"\
  527. "psubw " #a1 ", " #b1 " \n\t"\
  528. "psubw " #a2 ", " #b2 " \n\t"
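// Illustrative scalar equivalent of one LBUTTERFLY2 pair (not in the original
// source): the add/add/subtract sequence maps (a, b) to (a + b, b - a), one
// radix-2 stage of the Walsh-Hadamard transform used by hadamard8_diff below.
#if 0 /* reference only, never compiled */
static inline void butterfly_ref(int *a, int *b)
{
    int t = *a + *b;
    *b    = *b - *a;
    *a    = t;
}
#endif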
  529. #define HADAMARD48\
  530. LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
  531. LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
  532. LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
  533. LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
  534. LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
  535. LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)
  536. #define MMABS(a,z)\
  537. "pxor " #z ", " #z " \n\t"\
  538. "pcmpgtw " #a ", " #z " \n\t"\
  539. "pxor " #z ", " #a " \n\t"\
  540. "psubw " #z ", " #a " \n\t"
  541. #define MMABS_SUM(a,z, sum)\
  542. "pxor " #z ", " #z " \n\t"\
  543. "pcmpgtw " #a ", " #z " \n\t"\
  544. "pxor " #z ", " #a " \n\t"\
  545. "psubw " #z ", " #a " \n\t"\
  546. "paddusw " #a ", " #sum " \n\t"
  547. #define MMABS_MMX2(a,z)\
  548. "pxor " #z ", " #z " \n\t"\
  549. "psubw " #a ", " #z " \n\t"\
  550. "pmaxsw " #z ", " #a " \n\t"
  551. #define MMABS_SUM_MMX2(a,z, sum)\
  552. "pxor " #z ", " #z " \n\t"\
  553. "psubw " #a ", " #z " \n\t"\
  554. "pmaxsw " #z ", " #a " \n\t"\
  555. "paddusw " #a ", " #sum " \n\t"
  556. #define SBUTTERFLY(a,b,t,n)\
  557. "movq " #a ", " #t " \n\t" /* abcd */\
  558. "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
  559. "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */
  560. #define TRANSPOSE4(a,b,c,d,t)\
  561. SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
  562. SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
  563. SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
  564. SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
  565. #define LOAD4(o, a, b, c, d)\
  566. "movq "#o"(%1), " #a " \n\t"\
  567. "movq "#o"+16(%1), " #b " \n\t"\
  568. "movq "#o"+32(%1), " #c " \n\t"\
  569. "movq "#o"+48(%1), " #d " \n\t"
  570. #define STORE4(o, a, b, c, d)\
  571. "movq "#a", "#o"(%1) \n\t"\
  572. "movq "#b", "#o"+16(%1) \n\t"\
  573. "movq "#c", "#o"+32(%1) \n\t"\
  574. "movq "#d", "#o"+48(%1) \n\t"
  575. static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
  576. uint64_t temp[16] __align8;
  577. int sum=0;
  578. diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
  579. asm volatile(
  580. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  581. LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
  582. HADAMARD48
  583. "movq %%mm7, 112(%1) \n\t"
  584. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  585. STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
  586. "movq 112(%1), %%mm7 \n\t"
  587. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  588. STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
  589. LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
  590. LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  591. HADAMARD48
  592. "movq %%mm7, 120(%1) \n\t"
  593. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  594. STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
  595. "movq 120(%1), %%mm7 \n\t"
  596. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  597. "movq %%mm7, %%mm5 \n\t"//FIXME remove
  598. "movq %%mm6, %%mm7 \n\t"
  599. "movq %%mm0, %%mm6 \n\t"
  600. // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
  601. LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
  602. // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  603. HADAMARD48
  604. "movq %%mm7, 64(%1) \n\t"
  605. MMABS(%%mm0, %%mm7)
  606. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  607. MMABS_SUM(%%mm2, %%mm7, %%mm0)
  608. MMABS_SUM(%%mm3, %%mm7, %%mm0)
  609. MMABS_SUM(%%mm4, %%mm7, %%mm0)
  610. MMABS_SUM(%%mm5, %%mm7, %%mm0)
  611. MMABS_SUM(%%mm6, %%mm7, %%mm0)
  612. "movq 64(%1), %%mm1 \n\t"
  613. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  614. "movq %%mm0, 64(%1) \n\t"
  615. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  616. LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
  617. HADAMARD48
  618. "movq %%mm7, (%1) \n\t"
  619. MMABS(%%mm0, %%mm7)
  620. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  621. MMABS_SUM(%%mm2, %%mm7, %%mm0)
  622. MMABS_SUM(%%mm3, %%mm7, %%mm0)
  623. MMABS_SUM(%%mm4, %%mm7, %%mm0)
  624. MMABS_SUM(%%mm5, %%mm7, %%mm0)
  625. MMABS_SUM(%%mm6, %%mm7, %%mm0)
  626. "movq (%1), %%mm1 \n\t"
  627. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  628. "movq 64(%1), %%mm1 \n\t"
  629. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  630. "movq %%mm0, %%mm1 \n\t"
  631. "psrlq $32, %%mm0 \n\t"
  632. "paddusw %%mm1, %%mm0 \n\t"
  633. "movq %%mm0, %%mm1 \n\t"
  634. "psrlq $16, %%mm0 \n\t"
  635. "paddusw %%mm1, %%mm0 \n\t"
  636. "movd %%mm0, %0 \n\t"
  637. : "=r" (sum)
  638. : "r"(temp)
  639. );
  640. return sum&0xFFFF;
  641. }
  642. static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride){
  643. uint64_t temp[16] __align8;
  644. int sum=0;
  645. diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
  646. asm volatile(
  647. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  648. LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
  649. HADAMARD48
  650. "movq %%mm7, 112(%1) \n\t"
  651. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  652. STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
  653. "movq 112(%1), %%mm7 \n\t"
  654. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  655. STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
  656. LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
  657. LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  658. HADAMARD48
  659. "movq %%mm7, 120(%1) \n\t"
  660. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  661. STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
  662. "movq 120(%1), %%mm7 \n\t"
  663. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  664. "movq %%mm7, %%mm5 \n\t"//FIXME remove
  665. "movq %%mm6, %%mm7 \n\t"
  666. "movq %%mm0, %%mm6 \n\t"
  667. // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
  668. LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
  669. // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  670. HADAMARD48
  671. "movq %%mm7, 64(%1) \n\t"
  672. MMABS_MMX2(%%mm0, %%mm7)
  673. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  674. MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
  675. MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
  676. MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
  677. MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
  678. MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
  679. "movq 64(%1), %%mm1 \n\t"
  680. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  681. "movq %%mm0, 64(%1) \n\t"
  682. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  683. LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
  684. HADAMARD48
  685. "movq %%mm7, (%1) \n\t"
  686. MMABS_MMX2(%%mm0, %%mm7)
  687. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  688. MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
  689. MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
  690. MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
  691. MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
  692. MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
  693. "movq (%1), %%mm1 \n\t"
  694. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  695. "movq 64(%1), %%mm1 \n\t"
  696. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  697. "movq %%mm0, %%mm1 \n\t"
  698. "psrlq $32, %%mm0 \n\t"
  699. "paddusw %%mm1, %%mm0 \n\t"
  700. "movq %%mm0, %%mm1 \n\t"
  701. "psrlq $16, %%mm0 \n\t"
  702. "paddusw %%mm1, %%mm0 \n\t"
  703. "movd %%mm0, %0 \n\t"
  704. : "=r" (sum)
  705. : "r"(temp)
  706. );
  707. return sum&0xFFFF;
  708. }
  709. WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
  710. WARPER88_1616(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
  711. #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
  712. #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
  713. #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
  714. "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
  715. "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
  716. "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
  717. "movq "#in7", " #m3 " \n\t" /* d */\
  718. "movq "#in0", %%mm5 \n\t" /* D */\
  719. "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
  720. "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
  721. "movq "#in1", %%mm5 \n\t" /* C */\
  722. "movq "#in2", %%mm6 \n\t" /* B */\
  723. "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
  724. "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
  725. "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
  726. "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
  727. "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
  728. "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
  729. "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
  730. "psraw $5, %%mm5 \n\t"\
  731. "packuswb %%mm5, %%mm5 \n\t"\
  732. OP(%%mm5, out, %%mm7, d)
  733. #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
  734. static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  735. uint64_t temp;\
  736. \
  737. asm volatile(\
  738. "pxor %%mm7, %%mm7 \n\t"\
  739. "1: \n\t"\
  740. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  741. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  742. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  743. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  744. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  745. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  746. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  747. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  748. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  749. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  750. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  751. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  752. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  753. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  754. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  755. "paddw %%mm3, %%mm5 \n\t" /* b */\
  756. "paddw %%mm2, %%mm6 \n\t" /* c */\
  757. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  758. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  759. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  760. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  761. "paddw %%mm4, %%mm0 \n\t" /* a */\
  762. "paddw %%mm1, %%mm5 \n\t" /* d */\
  763. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  764. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  765. "paddw %6, %%mm6 \n\t"\
  766. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  767. "psraw $5, %%mm0 \n\t"\
  768. "movq %%mm0, %5 \n\t"\
  769. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  770. \
  771. "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
  772. "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
  773. "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
  774. "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
  775. "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
  776. "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
  777. "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
  778. "paddw %%mm0, %%mm2 \n\t" /* b */\
  779. "paddw %%mm5, %%mm3 \n\t" /* c */\
  780. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  781. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  782. "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
  783. "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
  784. "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
  785. "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
  786. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  787. "paddw %%mm2, %%mm1 \n\t" /* a */\
  788. "paddw %%mm6, %%mm4 \n\t" /* d */\
  789. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  790. "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
  791. "paddw %6, %%mm1 \n\t"\
  792. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
  793. "psraw $5, %%mm3 \n\t"\
  794. "movq %5, %%mm1 \n\t"\
  795. "packuswb %%mm3, %%mm1 \n\t"\
  796. OP_MMX2(%%mm1, (%1),%%mm4, q)\
  797. /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
  798. \
  799. "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
  800. "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
  801. "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
  802. "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
  803. "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
  804. "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
  805. "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
  806. "paddw %%mm1, %%mm5 \n\t" /* b */\
  807. "paddw %%mm4, %%mm0 \n\t" /* c */\
  808. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  809. "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
  810. "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
  811. "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
  812. "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
  813. "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
  814. "paddw %%mm3, %%mm2 \n\t" /* d */\
  815. "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
  816. "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
  817. "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
  818. "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
  819. "paddw %%mm2, %%mm6 \n\t" /* a */\
  820. "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
  821. "paddw %6, %%mm0 \n\t"\
  822. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  823. "psraw $5, %%mm0 \n\t"\
  824. /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
  825. \
  826. "paddw %%mm5, %%mm3 \n\t" /* a */\
  827. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
  828. "paddw %%mm4, %%mm6 \n\t" /* b */\
  829. "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
  830. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
  831. "paddw %%mm1, %%mm4 \n\t" /* c */\
  832. "paddw %%mm2, %%mm5 \n\t" /* d */\
  833. "paddw %%mm6, %%mm6 \n\t" /* 2b */\
  834. "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
  835. "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
  836. "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
  837. "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
  838. "paddw %6, %%mm4 \n\t"\
  839. "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
  840. "psraw $5, %%mm4 \n\t"\
  841. "packuswb %%mm4, %%mm0 \n\t"\
  842. OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
  843. \
  844. "addl %3, %0 \n\t"\
  845. "addl %4, %1 \n\t"\
  846. "decl %2 \n\t"\
  847. " jnz 1b \n\t"\
  848. : "+a"(src), "+c"(dst), "+m"(h)\
  849. : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  850. : "memory"\
  851. );\
  852. }\
  853. \
  854. static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  855. int i;\
  856. int16_t temp[16];\
  857. /* quick HACK, XXX FIXME MUST be optimized */\
  858. for(i=0; i<h; i++)\
  859. {\
  860. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  861. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  862. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  863. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  864. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  865. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
  866. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
  867. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
  868. temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
  869. temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
  870. temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
  871. temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
  872. temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
  873. temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
  874. temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
  875. temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
  876. asm volatile(\
  877. "movq (%0), %%mm0 \n\t"\
  878. "movq 8(%0), %%mm1 \n\t"\
  879. "paddw %2, %%mm0 \n\t"\
  880. "paddw %2, %%mm1 \n\t"\
  881. "psraw $5, %%mm0 \n\t"\
  882. "psraw $5, %%mm1 \n\t"\
  883. "packuswb %%mm1, %%mm0 \n\t"\
  884. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  885. "movq 16(%0), %%mm0 \n\t"\
  886. "movq 24(%0), %%mm1 \n\t"\
  887. "paddw %2, %%mm0 \n\t"\
  888. "paddw %2, %%mm1 \n\t"\
  889. "psraw $5, %%mm0 \n\t"\
  890. "psraw $5, %%mm1 \n\t"\
  891. "packuswb %%mm1, %%mm0 \n\t"\
  892. OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
  893. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  894. : "memory"\
  895. );\
  896. dst+=dstStride;\
  897. src+=srcStride;\
  898. }\
  899. }\
  900. \
  901. static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  902. uint64_t temp;\
  903. \
  904. asm volatile(\
  905. "pxor %%mm7, %%mm7 \n\t"\
  906. "1: \n\t"\
  907. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  908. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  909. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  910. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  911. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  912. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  913. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  914. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  915. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  916. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  917. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  918. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  919. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  920. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  921. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  922. "paddw %%mm3, %%mm5 \n\t" /* b */\
  923. "paddw %%mm2, %%mm6 \n\t" /* c */\
  924. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  925. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  926. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  927. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  928. "paddw %%mm4, %%mm0 \n\t" /* a */\
  929. "paddw %%mm1, %%mm5 \n\t" /* d */\
  930. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  931. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  932. "paddw %6, %%mm6 \n\t"\
  933. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  934. "psraw $5, %%mm0 \n\t"\
  935. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  936. \
  937. "movd 5(%0), %%mm5 \n\t" /* FGHI */\
  938. "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
  939. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
  940. "paddw %%mm5, %%mm1 \n\t" /* a */\
  941. "paddw %%mm6, %%mm2 \n\t" /* b */\
  942. "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
  943. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
  944. "paddw %%mm6, %%mm3 \n\t" /* c */\
  945. "paddw %%mm5, %%mm4 \n\t" /* d */\
  946. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  947. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  948. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  949. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  950. "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
  951. "paddw %6, %%mm1 \n\t"\
  952. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
  953. "psraw $5, %%mm3 \n\t"\
  954. "packuswb %%mm3, %%mm0 \n\t"\
  955. OP_MMX2(%%mm0, (%1), %%mm4, q)\
  956. \
  957. "addl %3, %0 \n\t"\
  958. "addl %4, %1 \n\t"\
  959. "decl %2 \n\t"\
  960. " jnz 1b \n\t"\
  961. : "+a"(src), "+c"(dst), "+m"(h)\
  962. : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  963. : "memory"\
  964. );\
  965. }\
  966. \
  967. static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  968. int i;\
  969. int16_t temp[8];\
  970. /* quick HACK, XXX FIXME MUST be optimized */\
  971. for(i=0; i<h; i++)\
  972. {\
  973. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  974. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  975. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  976. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  977. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  978. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
  979. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
  980. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
  981. asm volatile(\
  982. "movq (%0), %%mm0 \n\t"\
  983. "movq 8(%0), %%mm1 \n\t"\
  984. "paddw %2, %%mm0 \n\t"\
  985. "paddw %2, %%mm1 \n\t"\
  986. "psraw $5, %%mm0 \n\t"\
  987. "psraw $5, %%mm1 \n\t"\
  988. "packuswb %%mm1, %%mm0 \n\t"\
  989. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  990. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  991. :"memory"\
  992. );\
  993. dst+=dstStride;\
  994. src+=srcStride;\
  995. }\
  996. }
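// Illustrative scalar form of the quarter-pel lowpass filters in this file
// (not part of the original): away from the block edges each output sample is
//     (20*(b+c) - 6*(a+d) + 3*(e+f) - (g+h) + rounder) >> 5
// clipped to 0..255, where b and c are the two samples nearest the half-pel
// position; samples beyond the edges are mirrored as in the scalar 3dnow
// fallback above.
#if 0 /* reference only, never compiled */
static inline uint8_t qpel_h_lowpass_ref(const uint8_t *src, int rounder)
{
    /* src points at sample "b"; edge mirroring is omitted here */
    int v = 20*(src[0] + src[1]) - 6*(src[-1] + src[2])
          +  3*(src[-2] + src[3]) -   (src[-3] + src[4]) + rounder;
    v >>= 5;
    return v < 0 ? 0 : v > 255 ? 255 : v;
}
#endif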
  997. #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
  998. \
  999. static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  1000. uint64_t temp[17*4];\
  1001. uint64_t *temp_ptr= temp;\
  1002. int count= 17;\
  1003. \
  1004. /*FIXME unroll */\
  1005. asm volatile(\
  1006. "pxor %%mm7, %%mm7 \n\t"\
  1007. "1: \n\t"\
  1008. "movq (%0), %%mm0 \n\t"\
  1009. "movq (%0), %%mm1 \n\t"\
  1010. "movq 8(%0), %%mm2 \n\t"\
  1011. "movq 8(%0), %%mm3 \n\t"\
  1012. "punpcklbw %%mm7, %%mm0 \n\t"\
  1013. "punpckhbw %%mm7, %%mm1 \n\t"\
  1014. "punpcklbw %%mm7, %%mm2 \n\t"\
  1015. "punpckhbw %%mm7, %%mm3 \n\t"\
  1016. "movq %%mm0, (%1) \n\t"\
  1017. "movq %%mm1, 17*8(%1) \n\t"\
  1018. "movq %%mm2, 2*17*8(%1) \n\t"\
  1019. "movq %%mm3, 3*17*8(%1) \n\t"\
  1020. "addl $8, %1 \n\t"\
  1021. "addl %3, %0 \n\t"\
  1022. "decl %2 \n\t"\
  1023. " jnz 1b \n\t"\
  1024. : "+r" (src), "+r" (temp_ptr), "+r"(count)\
  1025. : "r" (srcStride)\
  1026. : "memory"\
  1027. );\
  1028. \
  1029. temp_ptr= temp;\
  1030. count=4;\
  1031. \
  1032. /*FIXME reorder for speed */\
  1033. asm volatile(\
  1034. /*"pxor %%mm7, %%mm7 \n\t"*/\
  1035. "1: \n\t"\
  1036. "movq (%0), %%mm0 \n\t"\
  1037. "movq 8(%0), %%mm1 \n\t"\
  1038. "movq 16(%0), %%mm2 \n\t"\
  1039. "movq 24(%0), %%mm3 \n\t"\
  1040. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
  1041. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
  1042. "addl %4, %1 \n\t"\
  1043. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
  1044. \
  1045. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
  1046. "addl %4, %1 \n\t"\
  1047. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
  1048. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
  1049. "addl %4, %1 \n\t"\
  1050. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
  1051. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
  1052. "addl %4, %1 \n\t"\
  1053. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
  1054. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
  1055. "addl %4, %1 \n\t"\
  1056. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
  1057. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
  1058. "addl %4, %1 \n\t"\
  1059. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
  1060. \
  1061. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
  1062. "addl %4, %1 \n\t" \
  1063. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
  1064. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
  1065. \
  1066. "addl $136, %0 \n\t"\
  1067. "addl %6, %1 \n\t"\
  1068. "decl %2 \n\t"\
  1069. " jnz 1b \n\t"\
  1070. \
  1071. : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
  1072. : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
  1073. :"memory"\
  1074. );\
  1075. }\
  1076. \
  1077. static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  1078. uint64_t temp[9*4];\
  1079. uint64_t *temp_ptr= temp;\
  1080. int count= 9;\
  1081. \
  1082. /*FIXME unroll */\
  1083. asm volatile(\
  1084. "pxor %%mm7, %%mm7 \n\t"\
  1085. "1: \n\t"\
  1086. "movq (%0), %%mm0 \n\t"\
  1087. "movq (%0), %%mm1 \n\t"\
  1088. "punpcklbw %%mm7, %%mm0 \n\t"\
  1089. "punpckhbw %%mm7, %%mm1 \n\t"\
  1090. "movq %%mm0, (%1) \n\t"\
  1091. "movq %%mm1, 9*8(%1) \n\t"\
  1092. "addl $8, %1 \n\t"\
  1093. "addl %3, %0 \n\t"\
  1094. "decl %2 \n\t"\
  1095. " jnz 1b \n\t"\
  1096. : "+r" (src), "+r" (temp_ptr), "+r"(count)\
  1097. : "r" (srcStride)\
  1098. : "memory"\
  1099. );\
  1100. \
  1101. temp_ptr= temp;\
  1102. count=2;\
  1103. \
  1104. /*FIXME reorder for speed */\
  1105. asm volatile(\
  1106. /*"pxor %%mm7, %%mm7 \n\t"*/\
  1107. "1: \n\t"\
  1108. "movq (%0), %%mm0 \n\t"\
  1109. "movq 8(%0), %%mm1 \n\t"\
  1110. "movq 16(%0), %%mm2 \n\t"\
  1111. "movq 24(%0), %%mm3 \n\t"\
  1112. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
  1113. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
  1114. "addl %4, %1 \n\t"\
  1115. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
  1116. \
  1117. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
  1118. "addl %4, %1 \n\t"\
  1119. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
  1120. \
  1121. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
  1122. "addl %4, %1 \n\t"\
  1123. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
  1124. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
  1125. \
  1126. "addl $72, %0 \n\t"\
  1127. "addl %6, %1 \n\t"\
  1128. "decl %2 \n\t"\
  1129. " jnz 1b \n\t"\
  1130. \
  1131. : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
  1132. : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
  1133. : "memory"\
  1134. );\
  1135. }\
  1136. \
  1137. static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
  1138. OPNAME ## pixels8_mmx(dst, src, stride, 8);\
  1139. }\
  1140. \
  1141. static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1142. uint64_t temp[8];\
  1143. uint8_t * const half= (uint8_t*)temp;\
  1144. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
  1145. OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
  1146. }\
  1147. \
  1148. static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1149. OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
  1150. }\
  1151. \
  1152. static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1153. uint64_t temp[8];\
  1154. uint8_t * const half= (uint8_t*)temp;\
  1155. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
  1156. OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
  1157. }\
  1158. \
  1159. static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1160. uint64_t temp[8];\
  1161. uint8_t * const half= (uint8_t*)temp;\
  1162. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
  1163. OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
  1164. }\
  1165. \
  1166. static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1167. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
  1168. }\
  1169. \
  1170. static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1171. uint64_t temp[8];\
  1172. uint8_t * const half= (uint8_t*)temp;\
  1173. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
  1174. OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
  1175. }\
  1176. static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1177. uint64_t half[8 + 9];\
  1178. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1179. uint8_t * const halfHV= ((uint8_t*)half);\
  1180. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1181. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1182. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1183. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1184. }\
  1185. static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1186. uint64_t half[8 + 9];\
  1187. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1188. uint8_t * const halfHV= ((uint8_t*)half);\
  1189. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1190. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1191. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1192. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1193. }\
  1194. static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1195. uint64_t half[8 + 9];\
  1196. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1197. uint8_t * const halfHV= ((uint8_t*)half);\
  1198. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1199. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1200. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1201. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1202. }\
  1203. static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1204. uint64_t half[8 + 9];\
  1205. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1206. uint8_t * const halfHV= ((uint8_t*)half);\
  1207. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1208. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1209. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1210. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1211. }\
  1212. static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1213. uint64_t half[8 + 9];\
  1214. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1215. uint8_t * const halfHV= ((uint8_t*)half);\
  1216. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1217. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1218. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1219. }\
  1220. static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1221. uint64_t half[8 + 9];\
  1222. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1223. uint8_t * const halfHV= ((uint8_t*)half);\
  1224. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1225. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1226. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1227. }\
  1228. static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1229. uint64_t half[8 + 9];\
  1230. uint8_t * const halfH= ((uint8_t*)half);\
  1231. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1232. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1233. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1234. }\
  1235. static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1236. uint64_t half[8 + 9];\
  1237. uint8_t * const halfH= ((uint8_t*)half);\
  1238. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1239. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1240. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1241. }\
  1242. static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1243. uint64_t half[9];\
  1244. uint8_t * const halfH= ((uint8_t*)half);\
  1245. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1246. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1247. }\
  1248. static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
  1249. OPNAME ## pixels16_mmx(dst, src, stride, 16);\
  1250. }\
  1251. \
  1252. static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1253. uint64_t temp[32];\
  1254. uint8_t * const half= (uint8_t*)temp;\
  1255. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
  1256. OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
  1257. }\
  1258. \
  1259. static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1260. OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
  1261. }\
  1262. \
  1263. static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1264. uint64_t temp[32];\
  1265. uint8_t * const half= (uint8_t*)temp;\
  1266. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
  1267. OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
  1268. }\
  1269. \
  1270. static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1271. uint64_t temp[32];\
  1272. uint8_t * const half= (uint8_t*)temp;\
  1273. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
  1274. OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
  1275. }\
  1276. \
  1277. static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1278. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
  1279. }\
  1280. \
  1281. static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1282. uint64_t temp[32];\
  1283. uint8_t * const half= (uint8_t*)temp;\
  1284. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
  1285. OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
  1286. }\
  1287. static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1288. uint64_t half[16*2 + 17*2];\
  1289. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1290. uint8_t * const halfHV= ((uint8_t*)half);\
  1291. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}
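
/* Write-back step used by the qpel functions above: PUT_OP is a plain store,
   while the AVG variants first load the destination and average it with the
   result (pavgusb on 3DNow!, pavgb on MMX2) before storing it back. */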
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
#if 0
static void just_return() { return; }
#endif
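
/* Install the put, put_no_rnd and avg versions of one qpel function into the
   corresponding DSPContext tables with a single statement. */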
#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_ ## postfix1 = put_ ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_ ## postfix1 = avg_ ## postfix2;
/* external functions, from idct_mmx.c */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);

/* XXX: these wrappers should be removed as soon as all IDCTs are converted */
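/* Adapt the in-place libmpeg2-style IDCTs to the idct_put/idct_add interface
   by clamping + storing, or adding, the transformed block. */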
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}

static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
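
/* Query the CPU capabilities and install the MMX/MMX2/3DNow! optimized
   functions into the DSPContext. */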
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    mm_flags = mm_support();
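
    /* dsp_mask lets the application force CPU feature flags on (FF_MM_FORCE)
       or mask them off */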
    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & FF_MM_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }

#if 0
    fprintf(stderr, "libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        fprintf(stderr, " mmx");
    if (mm_flags & MM_MMXEXT)
        fprintf(stderr, " mmxext");
    if (mm_flags & MM_3DNOW)
        fprintf(stderr, " 3dnow");
    if (mm_flags & MM_SSE)
        fprintf(stderr, " sse");
    if (mm_flags & MM_SSE2)
        fprintf(stderr, " sse2");
    fprintf(stderr, "\n");
#endif

    if (mm_flags & MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        const int idct_algo= avctx->idct_algo;
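
        /* select the (I)DCT implementations; the IDCT choice also records the
           coefficient permutation it expects */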
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)
            c->fdct = ff_fdct_mmx;

        if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
            c->idct_put= ff_simple_idct_put_mmx;
            c->idct_add= ff_simple_idct_add_mmx;
            c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
        }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
            if(mm_flags & MM_MMXEXT){
                c->idct_put= ff_libmpeg2mmx2_idct_put;
                c->idct_add= ff_libmpeg2mmx2_idct_add;
            }else{
                c->idct_put= ff_libmpeg2mmx_idct_put;
                c->idct_add= ff_libmpeg2mmx_idct_add;
            }
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->put_pixels_clamped = put_pixels_clamped_mmx;
        c->add_pixels_clamped = add_pixels_clamped_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->pix_sum = pix_sum16_mmx;
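
        /* motion compensation tables: the first index is the block size
           (0 = 16x16, 1 = 8x8), the second the half-pel case
           (0 = copy, 1 = x, 2 = y, 3 = xy) */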
        c->put_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
        c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;

        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;

        c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;

        c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
        c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
        c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
        c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;

        c->put_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
        c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;

        c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;

        c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;

        c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
        c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
        c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
        c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;

        c->add_bytes= add_bytes_mmx;
        c->diff_bytes= diff_bytes_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = sse16_mmx;
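
        /* CPU specific overrides: the MMX2 versions use the extended integer
           instructions (pavgb etc.), the 3DNow! versions use pavgusb */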
        if (mm_flags & MM_MMXEXT) {
            c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
            c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
            c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

            c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
            c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
            c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;

            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
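
            /* these versions are not bit-exact with the C reference, so they
               are only installed when CODEC_FLAG_BITEXACT is not set */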
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;

                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
            }
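
            /* quarter-pel table: entry dy*4 + dx (in quarter-pel units) maps
               to qpel16_mc<dx><dy> / qpel8_mc<dx><dy> */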
#if 1
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
#endif
        } else if (mm_flags & MM_3DNOW) {
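            /* 3DNow! lacks pavgb, so these versions rely on pavgusb instead */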
            c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
            c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
            c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
            c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
            c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

            c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
            c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
            c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
            c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
            c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;

                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
            }

            SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
            SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
        }
    }

    dsputil_init_pix_mmx(c, avctx);

#if 0
    // for speed testing
    get_pixels = just_return;
    put_pixels_clamped = just_return;
    add_pixels_clamped = just_return;

    pix_abs16x16 = just_return;
    pix_abs16x16_x2 = just_return;
    pix_abs16x16_y2 = just_return;
    pix_abs16x16_xy2 = just_return;

    put_pixels_tab[0] = just_return;
    put_pixels_tab[1] = just_return;
    put_pixels_tab[2] = just_return;
    put_pixels_tab[3] = just_return;

    put_no_rnd_pixels_tab[0] = just_return;
    put_no_rnd_pixels_tab[1] = just_return;
    put_no_rnd_pixels_tab[2] = just_return;
    put_no_rnd_pixels_tab[3] = just_return;

    avg_pixels_tab[0] = just_return;
    avg_pixels_tab[1] = just_return;
    avg_pixels_tab[2] = just_return;
    avg_pixels_tab[3] = just_return;

    avg_no_rnd_pixels_tab[0] = just_return;
    avg_no_rnd_pixels_tab[1] = just_return;
    avg_no_rnd_pixels_tab[2] = just_return;
    avg_no_rnd_pixels_tab[3] = just_return;

    //av_fdct = just_return;
    //ff_idct = just_return;
#endif
}