You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1220 lines
30KB

  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Gerard Lantau.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. #include "../dsputil.h"
  22. #include "../simple_idct.h"
  23. int mm_flags; /* multimedia extension flags */
  24. int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  25. int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  26. int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  27. int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  28. int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  29. int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  30. int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  31. int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  32. int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  33. int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  34. int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  35. int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  36. int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  37. int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  38. int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  39. int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  40. /* external functions, from idct_mmx.c */
  41. void ff_mmx_idct(DCTELEM *block);
  42. void ff_mmxext_idct(DCTELEM *block);
  43. /* pixel operations */
  44. static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL;
  45. static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL;
  46. //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
  47. //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
  48. #define JUMPALIGN() __asm __volatile (".balign 8"::)
  49. #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
  50. #ifndef PIC
  51. #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
  52. #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
  53. #else
  54. // for shared library it's better to use this way for accessing constants
  55. // pcmpeqd -> -1
  56. #define MOVQ_WONE(regd) \
  57. __asm __volatile ( \
  58. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  59. "psrlw $15, %%" #regd ::)
  60. #define MOVQ_WTWO(regd) \
  61. __asm __volatile ( \
  62. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  63. "psrlw $15, %%" #regd " \n\t" \
  64. "psllw $1, %%" #regd ::)
  65. #endif
  66. /***********************************/
  67. /* 3Dnow specific */
  68. #define DEF(x) x ## _3dnow
  69. /* for Athlons PAVGUSB is prefered */
  70. #define PAVGB "pavgusb"
  71. #include "dsputil_mmx_avg.h"
  72. #undef DEF
  73. #undef PAVGB
  74. /***********************************/
  75. /* MMX2 specific */
  76. #define DEF(x) x ## _sse
  77. /* Introduced only in MMX2 set */
  78. #define PAVGB "pavgb"
  79. #include "dsputil_mmx_avg.h"
  80. #undef DEF
  81. #undef PAVGB
  82. /***********************************/
  83. /* standard MMX */
  84. static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
  85. {
  86. DCTELEM *p;
  87. const UINT8 *pix;
  88. int i;
  89. /* read the pixels */
  90. p = block;
  91. pix = pixels;
  92. MOVQ_ZERO(mm7);
  93. for(i=0;i<4;i++) {
  94. __asm __volatile(
  95. "movq %1, %%mm0\n\t"
  96. "movq %2, %%mm1\n\t"
  97. "movq %%mm0, %%mm2\n\t"
  98. "movq %%mm1, %%mm3\n\t"
  99. "punpcklbw %%mm7, %%mm0\n\t"
  100. "punpckhbw %%mm7, %%mm2\n\t"
  101. "punpcklbw %%mm7, %%mm1\n\t"
  102. "punpckhbw %%mm7, %%mm3\n\t"
  103. "movq %%mm0, %0\n\t"
  104. "movq %%mm2, 8%0\n\t"
  105. "movq %%mm1, 16%0\n\t"
  106. "movq %%mm3, 24%0\n\t"
  107. :"=m"(*p)
  108. :"m"(*pix), "m"(*(pix+line_size))
  109. :"memory");
  110. pix += line_size*2;
  111. p += 16;
  112. }
  113. }
  114. static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
  115. {
  116. asm volatile(
  117. ".balign 16 \n\t"
  118. "movl $-128, %%eax \n\t"
  119. "1: \n\t"
  120. "movq (%0), %%mm0 \n\t"
  121. "movq (%1), %%mm2 \n\t"
  122. "movq %%mm0, %%mm1 \n\t"
  123. "movq %%mm2, %%mm3 \n\t"
  124. "punpcklbw %%mm7, %%mm0 \n\t"
  125. "punpckhbw %%mm7, %%mm1 \n\t"
  126. "punpcklbw %%mm7, %%mm2 \n\t"
  127. "punpckhbw %%mm7, %%mm3 \n\t"
  128. "psubw %%mm2, %%mm0 \n\t"
  129. "psubw %%mm3, %%mm1 \n\t"
  130. "movq %%mm0, (%2, %%eax)\n\t"
  131. "movq %%mm1, 8(%2, %%eax)\n\t"
  132. "addl %3, %0 \n\t"
  133. "addl %3, %1 \n\t"
  134. "addl $16, %%eax \n\t"
  135. "jnz 1b \n\t"
  136. : "+r" (s1), "+r" (s2)
  137. : "r" (block+64), "r" (stride)
  138. : "%eax"
  139. );
  140. }
  141. static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  142. {
  143. const DCTELEM *p;
  144. UINT8 *pix;
  145. /* read the pixels */
  146. p = block;
  147. pix = pixels;
  148. /* unrolled loop */
  149. __asm __volatile(
  150. "movq %3, %%mm0\n\t"
  151. "movq 8%3, %%mm1\n\t"
  152. "movq 16%3, %%mm2\n\t"
  153. "movq 24%3, %%mm3\n\t"
  154. "movq 32%3, %%mm4\n\t"
  155. "movq 40%3, %%mm5\n\t"
  156. "movq 48%3, %%mm6\n\t"
  157. "movq 56%3, %%mm7\n\t"
  158. "packuswb %%mm1, %%mm0\n\t"
  159. "packuswb %%mm3, %%mm2\n\t"
  160. "packuswb %%mm5, %%mm4\n\t"
  161. "packuswb %%mm7, %%mm6\n\t"
  162. "movq %%mm0, (%0)\n\t"
  163. "movq %%mm2, (%0, %1)\n\t"
  164. "movq %%mm4, (%0, %1, 2)\n\t"
  165. "movq %%mm6, (%0, %2)\n\t"
  166. ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
  167. :"memory");
  168. pix += line_size*4;
  169. p += 32;
  170. // if here would be an exact copy of the code above
  171. // compiler would generate some very strange code
  172. // thus using "r"
  173. __asm __volatile(
  174. "movq (%3), %%mm0\n\t"
  175. "movq 8(%3), %%mm1\n\t"
  176. "movq 16(%3), %%mm2\n\t"
  177. "movq 24(%3), %%mm3\n\t"
  178. "movq 32(%3), %%mm4\n\t"
  179. "movq 40(%3), %%mm5\n\t"
  180. "movq 48(%3), %%mm6\n\t"
  181. "movq 56(%3), %%mm7\n\t"
  182. "packuswb %%mm1, %%mm0\n\t"
  183. "packuswb %%mm3, %%mm2\n\t"
  184. "packuswb %%mm5, %%mm4\n\t"
  185. "packuswb %%mm7, %%mm6\n\t"
  186. "movq %%mm0, (%0)\n\t"
  187. "movq %%mm2, (%0, %1)\n\t"
  188. "movq %%mm4, (%0, %1, 2)\n\t"
  189. "movq %%mm6, (%0, %2)\n\t"
  190. ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
  191. :"memory");
  192. }
  193. static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  194. {
  195. const DCTELEM *p;
  196. UINT8 *pix;
  197. int i;
  198. /* read the pixels */
  199. p = block;
  200. pix = pixels;
  201. MOVQ_ZERO(mm7);
  202. i = 4;
  203. while (i) {
  204. __asm __volatile(
  205. "movq %2, %%mm0\n\t"
  206. "movq 8%2, %%mm1\n\t"
  207. "movq 16%2, %%mm2\n\t"
  208. "movq 24%2, %%mm3\n\t"
  209. "movq %0, %%mm4\n\t"
  210. "movq %1, %%mm6\n\t"
  211. "movq %%mm4, %%mm5\n\t"
  212. "punpcklbw %%mm7, %%mm4\n\t"
  213. "punpckhbw %%mm7, %%mm5\n\t"
  214. "paddsw %%mm4, %%mm0\n\t"
  215. "paddsw %%mm5, %%mm1\n\t"
  216. "movq %%mm6, %%mm5\n\t"
  217. "punpcklbw %%mm7, %%mm6\n\t"
  218. "punpckhbw %%mm7, %%mm5\n\t"
  219. "paddsw %%mm6, %%mm2\n\t"
  220. "paddsw %%mm5, %%mm3\n\t"
  221. "packuswb %%mm1, %%mm0\n\t"
  222. "packuswb %%mm3, %%mm2\n\t"
  223. "movq %%mm0, %0\n\t"
  224. "movq %%mm2, %1\n\t"
  225. :"+m"(*pix), "+m"(*(pix+line_size))
  226. :"m"(*p)
  227. :"memory");
  228. pix += line_size*2;
  229. p += 16;
  230. i--;
  231. };
  232. }
  233. static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  234. {
  235. int hh;
  236. UINT8 *p;
  237. const UINT8 *pix;
  238. p = block;
  239. pix = pixels; // 2s
  240. #if 0
  241. do {
  242. __asm __volatile(
  243. "movq %1, %%mm0\n\t"
  244. "movq %%mm0, %0\n\t"
  245. :"=m"(*p)
  246. :"m"(*pix)
  247. :"memory");
  248. pix += line_size;
  249. p += line_size;
  250. } while (--h);
  251. #else
  252. // this optimized code is not very usefull
  253. // the above loop is definitely faster
  254. // at least on Celeron 500MHz
  255. hh = h & 3;
  256. while (hh) {
  257. __asm __volatile(
  258. "movq %1, %%mm0\n\t"
  259. "movq %%mm0, %0\n\t"
  260. :"=m"(*p)
  261. :"m"(*pix)
  262. :"memory");
  263. pix += line_size;
  264. p += line_size;
  265. hh--;
  266. }
  267. hh=h>>2;
  268. while (hh) {
  269. __asm __volatile(
  270. "movq (%1), %%mm0 \n\t"
  271. "movq (%1, %2), %%mm1 \n\t"
  272. "movq (%1, %2, 2), %%mm2 \n\t"
  273. "movq (%1, %3), %%mm3 \n\t"
  274. "movq %%mm0, (%0) \n\t"
  275. "movq %%mm1, (%0, %2) \n\t"
  276. "movq %%mm2, (%0, %2, 2) \n\t"
  277. "movq %%mm3, (%0, %3) \n\t"
  278. ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3)
  279. :"memory");
  280. pix += line_size*4;
  281. p += line_size*4;
  282. hh--;
  283. }
  284. #endif
  285. }
  286. static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  287. {
  288. UINT8 *p;
  289. const UINT8 *pix;
  290. p = block;
  291. pix = pixels;
  292. MOVQ_ZERO(mm7);
  293. MOVQ_WONE(mm4);
  294. JUMPALIGN();
  295. do {
  296. __asm __volatile(
  297. "movq %1, %%mm0\n\t"
  298. "movq 1%1, %%mm1\n\t"
  299. "movq %%mm0, %%mm2\n\t"
  300. "movq %%mm1, %%mm3\n\t"
  301. "punpcklbw %%mm7, %%mm0\n\t"
  302. "punpcklbw %%mm7, %%mm1\n\t"
  303. "punpckhbw %%mm7, %%mm2\n\t"
  304. "punpckhbw %%mm7, %%mm3\n\t"
  305. "paddusw %%mm1, %%mm0\n\t"
  306. "paddusw %%mm3, %%mm2\n\t"
  307. "paddusw %%mm4, %%mm0\n\t"
  308. "paddusw %%mm4, %%mm2\n\t"
  309. "psrlw $1, %%mm0\n\t"
  310. "psrlw $1, %%mm2\n\t"
  311. "packuswb %%mm2, %%mm0\n\t"
  312. "movq %%mm0, %0\n\t"
  313. :"=m"(*p)
  314. :"m"(*pix)
  315. :"memory");
  316. pix += line_size; p += line_size;
  317. } while (--h);
  318. }
  319. static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  320. {
  321. UINT8 *p;
  322. const UINT8 *pix;
  323. p = block;
  324. pix = pixels;
  325. MOVQ_ZERO(mm7);
  326. MOVQ_WONE(mm4);
  327. JUMPALIGN();
  328. do {
  329. __asm __volatile(
  330. "movq %1, %%mm0\n\t"
  331. "movq %2, %%mm1\n\t"
  332. "movq %%mm0, %%mm2\n\t"
  333. "movq %%mm1, %%mm3\n\t"
  334. "punpcklbw %%mm7, %%mm0\n\t"
  335. "punpcklbw %%mm7, %%mm1\n\t"
  336. "punpckhbw %%mm7, %%mm2\n\t"
  337. "punpckhbw %%mm7, %%mm3\n\t"
  338. "paddusw %%mm1, %%mm0\n\t"
  339. "paddusw %%mm3, %%mm2\n\t"
  340. "paddusw %%mm4, %%mm0\n\t"
  341. "paddusw %%mm4, %%mm2\n\t"
  342. "psrlw $1, %%mm0\n\t"
  343. "psrlw $1, %%mm2\n\t"
  344. "packuswb %%mm2, %%mm0\n\t"
  345. "movq %%mm0, %0\n\t"
  346. :"=m"(*p)
  347. :"m"(*pix),
  348. "m"(*(pix+line_size))
  349. :"memory");
  350. pix += line_size;
  351. p += line_size;
  352. } while (--h);
  353. }
  354. static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  355. {
  356. UINT8 *p;
  357. const UINT8 *pix;
  358. p = block;
  359. pix = pixels; // 1s
  360. MOVQ_ZERO(mm7);
  361. MOVQ_WTWO(mm6);
  362. JUMPALIGN();
  363. do {
  364. __asm __volatile(
  365. "movq %1, %%mm0\n\t"
  366. "movq %2, %%mm1\n\t"
  367. "movq 1%1, %%mm4\n\t"
  368. "movq 1%2, %%mm5\n\t"
  369. "movq %%mm0, %%mm2\n\t"
  370. "movq %%mm1, %%mm3\n\t"
  371. "punpcklbw %%mm7, %%mm0\n\t"
  372. "punpcklbw %%mm7, %%mm1\n\t"
  373. "punpckhbw %%mm7, %%mm2\n\t"
  374. "punpckhbw %%mm7, %%mm3\n\t"
  375. "paddusw %%mm1, %%mm0\n\t"
  376. "paddusw %%mm3, %%mm2\n\t"
  377. "movq %%mm4, %%mm1\n\t"
  378. "movq %%mm5, %%mm3\n\t"
  379. "punpcklbw %%mm7, %%mm4\n\t"
  380. "punpcklbw %%mm7, %%mm5\n\t"
  381. "punpckhbw %%mm7, %%mm1\n\t"
  382. "punpckhbw %%mm7, %%mm3\n\t"
  383. "paddusw %%mm5, %%mm4\n\t"
  384. "paddusw %%mm3, %%mm1\n\t"
  385. "paddusw %%mm6, %%mm4\n\t"
  386. "paddusw %%mm6, %%mm1\n\t"
  387. "paddusw %%mm4, %%mm0\n\t"
  388. "paddusw %%mm1, %%mm2\n\t"
  389. "psrlw $2, %%mm0\n\t"
  390. "psrlw $2, %%mm2\n\t"
  391. "packuswb %%mm2, %%mm0\n\t"
  392. "movq %%mm0, %0\n\t"
  393. :"=m"(*p)
  394. :"m"(*pix),
  395. "m"(*(pix+line_size))
  396. :"memory");
  397. pix += line_size;
  398. p += line_size;
  399. } while(--h);
  400. }
  401. static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  402. {
  403. UINT8 *p;
  404. const UINT8 *pix;
  405. p = block;
  406. pix = pixels;
  407. MOVQ_ZERO(mm7);
  408. do {
  409. __asm __volatile(
  410. "movq %1, %%mm0\n\t"
  411. "movq 1%1, %%mm1\n\t"
  412. "movq %%mm0, %%mm2\n\t"
  413. "movq %%mm1, %%mm3\n\t"
  414. "punpcklbw %%mm7, %%mm0\n\t"
  415. "punpcklbw %%mm7, %%mm1\n\t"
  416. "punpckhbw %%mm7, %%mm2\n\t"
  417. "punpckhbw %%mm7, %%mm3\n\t"
  418. "paddusw %%mm1, %%mm0\n\t"
  419. "paddusw %%mm3, %%mm2\n\t"
  420. "psrlw $1, %%mm0\n\t"
  421. "psrlw $1, %%mm2\n\t"
  422. "packuswb %%mm2, %%mm0\n\t"
  423. "movq %%mm0, %0\n\t"
  424. :"=m"(*p)
  425. :"m"(*pix)
  426. :"memory");
  427. pix += line_size;
  428. p += line_size;
  429. } while (--h);
  430. }
  431. static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  432. {
  433. UINT8 *p;
  434. const UINT8 *pix;
  435. p = block;
  436. pix = pixels;
  437. MOVQ_ZERO(mm7);
  438. JUMPALIGN();
  439. do {
  440. __asm __volatile(
  441. "movq %1, %%mm0\n\t"
  442. "movq %2, %%mm1\n\t"
  443. "movq %%mm0, %%mm2\n\t"
  444. "movq %%mm1, %%mm3\n\t"
  445. "punpcklbw %%mm7, %%mm0\n\t"
  446. "punpcklbw %%mm7, %%mm1\n\t"
  447. "punpckhbw %%mm7, %%mm2\n\t"
  448. "punpckhbw %%mm7, %%mm3\n\t"
  449. "paddusw %%mm1, %%mm0\n\t"
  450. "paddusw %%mm3, %%mm2\n\t"
  451. "psrlw $1, %%mm0\n\t"
  452. "psrlw $1, %%mm2\n\t"
  453. "packuswb %%mm2, %%mm0\n\t"
  454. "movq %%mm0, %0\n\t"
  455. :"=m"(*p)
  456. :"m"(*pix),
  457. "m"(*(pix+line_size))
  458. :"memory");
  459. pix += line_size;
  460. p += line_size;
  461. } while(--h);
  462. }
  463. static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  464. {
  465. UINT8 *p;
  466. const UINT8 *pix;
  467. p = block;
  468. pix = pixels;
  469. MOVQ_ZERO(mm7);
  470. MOVQ_WONE(mm6);
  471. JUMPALIGN();
  472. do {
  473. __asm __volatile(
  474. "movq %1, %%mm0\n\t"
  475. "movq %2, %%mm1\n\t"
  476. "movq 1%1, %%mm4\n\t"
  477. "movq 1%2, %%mm5\n\t"
  478. "movq %%mm0, %%mm2\n\t"
  479. "movq %%mm1, %%mm3\n\t"
  480. "punpcklbw %%mm7, %%mm0\n\t"
  481. "punpcklbw %%mm7, %%mm1\n\t"
  482. "punpckhbw %%mm7, %%mm2\n\t"
  483. "punpckhbw %%mm7, %%mm3\n\t"
  484. "paddusw %%mm1, %%mm0\n\t"
  485. "paddusw %%mm3, %%mm2\n\t"
  486. "movq %%mm4, %%mm1\n\t"
  487. "movq %%mm5, %%mm3\n\t"
  488. "punpcklbw %%mm7, %%mm4\n\t"
  489. "punpcklbw %%mm7, %%mm5\n\t"
  490. "punpckhbw %%mm7, %%mm1\n\t"
  491. "punpckhbw %%mm7, %%mm3\n\t"
  492. "paddusw %%mm5, %%mm4\n\t"
  493. "paddusw %%mm3, %%mm1\n\t"
  494. "paddusw %%mm6, %%mm4\n\t"
  495. "paddusw %%mm6, %%mm1\n\t"
  496. "paddusw %%mm4, %%mm0\n\t"
  497. "paddusw %%mm1, %%mm2\n\t"
  498. "psrlw $2, %%mm0\n\t"
  499. "psrlw $2, %%mm2\n\t"
  500. "packuswb %%mm2, %%mm0\n\t"
  501. "movq %%mm0, %0\n\t"
  502. :"=m"(*p)
  503. :"m"(*pix),
  504. "m"(*(pix+line_size))
  505. :"memory");
  506. pix += line_size;
  507. p += line_size;
  508. } while(--h);
  509. }
  510. static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  511. {
  512. UINT8 *p;
  513. const UINT8 *pix;
  514. p = block;
  515. pix = pixels;
  516. MOVQ_ZERO(mm7);
  517. MOVQ_WONE(mm6);
  518. JUMPALIGN();
  519. do {
  520. __asm __volatile(
  521. "movq %0, %%mm0\n\t"
  522. "movq %1, %%mm1\n\t"
  523. "movq %%mm0, %%mm2\n\t"
  524. "movq %%mm1, %%mm3\n\t"
  525. "punpcklbw %%mm7, %%mm0\n\t"
  526. "punpcklbw %%mm7, %%mm1\n\t"
  527. "punpckhbw %%mm7, %%mm2\n\t"
  528. "punpckhbw %%mm7, %%mm3\n\t"
  529. "paddusw %%mm1, %%mm0\n\t"
  530. "paddusw %%mm3, %%mm2\n\t"
  531. "paddusw %%mm6, %%mm0\n\t"
  532. "paddusw %%mm6, %%mm2\n\t"
  533. "psrlw $1, %%mm0\n\t"
  534. "psrlw $1, %%mm2\n\t"
  535. "packuswb %%mm2, %%mm0\n\t"
  536. "movq %%mm0, %0\n\t"
  537. :"+m"(*p)
  538. :"m"(*pix)
  539. :"memory");
  540. pix += line_size;
  541. p += line_size;
  542. }
  543. while (--h);
  544. }
  545. static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  546. {
  547. UINT8 *p;
  548. const UINT8 *pix;
  549. p = block;
  550. pix = pixels;
  551. MOVQ_ZERO(mm7);
  552. MOVQ_WONE(mm6);
  553. JUMPALIGN();
  554. do {
  555. __asm __volatile(
  556. "movq %1, %%mm1\n\t"
  557. "movq %0, %%mm0\n\t"
  558. "movq 1%1, %%mm4\n\t"
  559. "movq %%mm0, %%mm2\n\t"
  560. "movq %%mm1, %%mm3\n\t"
  561. "movq %%mm4, %%mm5\n\t"
  562. "punpcklbw %%mm7, %%mm1\n\t"
  563. "punpckhbw %%mm7, %%mm3\n\t"
  564. "punpcklbw %%mm7, %%mm4\n\t"
  565. "punpckhbw %%mm7, %%mm5\n\t"
  566. "punpcklbw %%mm7, %%mm0\n\t"
  567. "punpckhbw %%mm7, %%mm2\n\t"
  568. "paddusw %%mm4, %%mm1\n\t"
  569. "paddusw %%mm5, %%mm3\n\t"
  570. "paddusw %%mm6, %%mm1\n\t"
  571. "paddusw %%mm6, %%mm3\n\t"
  572. "psrlw $1, %%mm1\n\t"
  573. "psrlw $1, %%mm3\n\t"
  574. "paddusw %%mm6, %%mm0\n\t"
  575. "paddusw %%mm6, %%mm2\n\t"
  576. "paddusw %%mm1, %%mm0\n\t"
  577. "paddusw %%mm3, %%mm2\n\t"
  578. "psrlw $1, %%mm0\n\t"
  579. "psrlw $1, %%mm2\n\t"
  580. "packuswb %%mm2, %%mm0\n\t"
  581. "movq %%mm0, %0\n\t"
  582. :"+m"(*p)
  583. :"m"(*pix)
  584. :"memory");
  585. pix += line_size;
  586. p += line_size;
  587. } while (--h);
  588. }
  589. static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  590. {
  591. UINT8 *p;
  592. const UINT8 *pix;
  593. p = block;
  594. pix = pixels;
  595. MOVQ_ZERO(mm7);
  596. MOVQ_WONE(mm6);
  597. JUMPALIGN();
  598. do {
  599. __asm __volatile(
  600. "movq %1, %%mm1\n\t"
  601. "movq %0, %%mm0\n\t"
  602. "movq %2, %%mm4\n\t"
  603. "movq %%mm0, %%mm2\n\t"
  604. "movq %%mm1, %%mm3\n\t"
  605. "movq %%mm4, %%mm5\n\t"
  606. "punpcklbw %%mm7, %%mm1\n\t"
  607. "punpckhbw %%mm7, %%mm3\n\t"
  608. "punpcklbw %%mm7, %%mm4\n\t"
  609. "punpckhbw %%mm7, %%mm5\n\t"
  610. "punpcklbw %%mm7, %%mm0\n\t"
  611. "punpckhbw %%mm7, %%mm2\n\t"
  612. "paddusw %%mm4, %%mm1\n\t"
  613. "paddusw %%mm5, %%mm3\n\t"
  614. "paddusw %%mm6, %%mm1\n\t"
  615. "paddusw %%mm6, %%mm3\n\t"
  616. "psrlw $1, %%mm1\n\t"
  617. "psrlw $1, %%mm3\n\t"
  618. "paddusw %%mm6, %%mm0\n\t"
  619. "paddusw %%mm6, %%mm2\n\t"
  620. "paddusw %%mm1, %%mm0\n\t"
  621. "paddusw %%mm3, %%mm2\n\t"
  622. "psrlw $1, %%mm0\n\t"
  623. "psrlw $1, %%mm2\n\t"
  624. "packuswb %%mm2, %%mm0\n\t"
  625. "movq %%mm0, %0\n\t"
  626. :"+m"(*p)
  627. :"m"(*pix), "m"(*(pix+line_size))
  628. :"memory");
  629. pix += line_size;
  630. p += line_size ;
  631. } while(--h);
  632. }
  633. static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  634. {
  635. UINT8 *p;
  636. const UINT8 *pix;
  637. p = block;
  638. pix = pixels;
  639. MOVQ_ZERO(mm7);
  640. // this doesn't seem to be used offten - so
  641. // the inside usage of mm_wone is not optimized
  642. MOVQ_WTWO(mm6);
  643. do {
  644. __asm __volatile(
  645. "movq %1, %%mm0\n\t"
  646. "movq %2, %%mm1\n\t"
  647. "movq 1%1, %%mm4\n\t"
  648. "movq 1%2, %%mm5\n\t"
  649. "movq %%mm0, %%mm2\n\t"
  650. "movq %%mm1, %%mm3\n\t"
  651. "punpcklbw %%mm7, %%mm0\n\t"
  652. "punpcklbw %%mm7, %%mm1\n\t"
  653. "punpckhbw %%mm7, %%mm2\n\t"
  654. "punpckhbw %%mm7, %%mm3\n\t"
  655. "paddusw %%mm1, %%mm0\n\t"
  656. "paddusw %%mm3, %%mm2\n\t"
  657. "movq %%mm4, %%mm1\n\t"
  658. "movq %%mm5, %%mm3\n\t"
  659. "punpcklbw %%mm7, %%mm4\n\t"
  660. "punpcklbw %%mm7, %%mm5\n\t"
  661. "punpckhbw %%mm7, %%mm1\n\t"
  662. "punpckhbw %%mm7, %%mm3\n\t"
  663. "paddusw %%mm5, %%mm4\n\t"
  664. "paddusw %%mm3, %%mm1\n\t"
  665. "paddusw %%mm6, %%mm4\n\t"
  666. "paddusw %%mm6, %%mm1\n\t"
  667. "paddusw %%mm4, %%mm0\n\t"
  668. "paddusw %%mm1, %%mm2\n\t"
  669. "movq %3, %%mm5\n\t"
  670. "psrlw $2, %%mm0\n\t"
  671. "movq %0, %%mm1\n\t"
  672. "psrlw $2, %%mm2\n\t"
  673. "movq %%mm1, %%mm3\n\t"
  674. "punpcklbw %%mm7, %%mm1\n\t"
  675. "punpckhbw %%mm7, %%mm3\n\t"
  676. "paddusw %%mm1, %%mm0\n\t"
  677. "paddusw %%mm3, %%mm2\n\t"
  678. "paddusw %%mm5, %%mm0\n\t"
  679. "paddusw %%mm5, %%mm2\n\t"
  680. "psrlw $1, %%mm0\n\t"
  681. "psrlw $1, %%mm2\n\t"
  682. "packuswb %%mm2, %%mm0\n\t"
  683. "movq %%mm0, %0\n\t"
  684. :"+m"(*p)
  685. :"m"(*pix),
  686. "m"(*(pix+line_size)), "m"(mm_wone)
  687. :"memory");
  688. pix += line_size;
  689. p += line_size ;
  690. } while(--h);
  691. }
  692. static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  693. {
  694. UINT8 *p;
  695. const UINT8 *pix;
  696. p = block;
  697. pix = pixels;
  698. MOVQ_ZERO(mm7);
  699. do {
  700. __asm __volatile(
  701. "movq %1, %%mm0\n\t"
  702. "movq %0, %%mm1\n\t"
  703. "movq %%mm0, %%mm2\n\t"
  704. "movq %%mm1, %%mm3\n\t"
  705. "punpcklbw %%mm7, %%mm0\n\t"
  706. "punpcklbw %%mm7, %%mm1\n\t"
  707. "punpckhbw %%mm7, %%mm2\n\t"
  708. "punpckhbw %%mm7, %%mm3\n\t"
  709. "paddusw %%mm1, %%mm0\n\t"
  710. "paddusw %%mm3, %%mm2\n\t"
  711. "psrlw $1, %%mm0\n\t"
  712. "psrlw $1, %%mm2\n\t"
  713. "packuswb %%mm2, %%mm0\n\t"
  714. "movq %%mm0, %0\n\t"
  715. :"+m"(*p)
  716. :"m"(*pix)
  717. :"memory");
  718. pix += line_size;
  719. p += line_size ;
  720. } while (--h);
  721. }
  722. static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  723. {
  724. UINT8 *p;
  725. const UINT8 *pix;
  726. p = block;
  727. pix = pixels;
  728. MOVQ_ZERO(mm7);
  729. do {
  730. __asm __volatile(
  731. "movq %1, %%mm0\n\t"
  732. "movq 1%1, %%mm1\n\t"
  733. "movq %0, %%mm4\n\t"
  734. "movq %%mm0, %%mm2\n\t"
  735. "movq %%mm1, %%mm3\n\t"
  736. "movq %%mm4, %%mm5\n\t"
  737. "punpcklbw %%mm7, %%mm0\n\t"
  738. "punpcklbw %%mm7, %%mm1\n\t"
  739. "punpckhbw %%mm7, %%mm2\n\t"
  740. "punpckhbw %%mm7, %%mm3\n\t"
  741. "punpcklbw %%mm7, %%mm4\n\t"
  742. "punpckhbw %%mm7, %%mm5\n\t"
  743. "paddusw %%mm1, %%mm0\n\t"
  744. "paddusw %%mm3, %%mm2\n\t"
  745. "psrlw $1, %%mm0\n\t"
  746. "psrlw $1, %%mm2\n\t"
  747. "paddusw %%mm4, %%mm0\n\t"
  748. "paddusw %%mm5, %%mm2\n\t"
  749. "psrlw $1, %%mm0\n\t"
  750. "psrlw $1, %%mm2\n\t"
  751. "packuswb %%mm2, %%mm0\n\t"
  752. "movq %%mm0, %0\n\t"
  753. :"+m"(*p)
  754. :"m"(*pix)
  755. :"memory");
  756. pix += line_size;
  757. p += line_size;
  758. } while (--h);
  759. }
  760. static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  761. {
  762. UINT8 *p;
  763. const UINT8 *pix;
  764. p = block;
  765. pix = pixels;
  766. MOVQ_ZERO(mm7);
  767. do {
  768. __asm __volatile(
  769. "movq %1, %%mm0\n\t"
  770. "movq %2, %%mm1\n\t"
  771. "movq %0, %%mm4\n\t"
  772. "movq %%mm0, %%mm2\n\t"
  773. "movq %%mm1, %%mm3\n\t"
  774. "movq %%mm4, %%mm5\n\t"
  775. "punpcklbw %%mm7, %%mm0\n\t"
  776. "punpcklbw %%mm7, %%mm1\n\t"
  777. "punpckhbw %%mm7, %%mm2\n\t"
  778. "punpckhbw %%mm7, %%mm3\n\t"
  779. "punpcklbw %%mm7, %%mm4\n\t"
  780. "punpckhbw %%mm7, %%mm5\n\t"
  781. "paddusw %%mm1, %%mm0\n\t"
  782. "paddusw %%mm3, %%mm2\n\t"
  783. "psrlw $1, %%mm0\n\t"
  784. "psrlw $1, %%mm2\n\t"
  785. "paddusw %%mm4, %%mm0\n\t"
  786. "paddusw %%mm5, %%mm2\n\t"
  787. "psrlw $1, %%mm0\n\t"
  788. "psrlw $1, %%mm2\n\t"
  789. "packuswb %%mm2, %%mm0\n\t"
  790. "movq %%mm0, %0\n\t"
  791. :"+m"(*p)
  792. :"m"(*pix), "m"(*(pix+line_size))
  793. :"memory");
  794. pix += line_size;
  795. p += line_size ;
  796. } while(--h);
  797. }
  798. static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  799. {
  800. UINT8 *p;
  801. const UINT8 *pix;
  802. p = block;
  803. pix = pixels;
  804. MOVQ_ZERO(mm7);
  805. MOVQ_WONE(mm6);
  806. JUMPALIGN();
  807. do {
  808. __asm __volatile(
  809. "movq %1, %%mm0\n\t"
  810. "movq %2, %%mm1\n\t"
  811. "movq 1%1, %%mm4\n\t"
  812. "movq 1%2, %%mm5\n\t"
  813. "movq %%mm0, %%mm2\n\t"
  814. "movq %%mm1, %%mm3\n\t"
  815. "punpcklbw %%mm7, %%mm0\n\t"
  816. "punpcklbw %%mm7, %%mm1\n\t"
  817. "punpckhbw %%mm7, %%mm2\n\t"
  818. "punpckhbw %%mm7, %%mm3\n\t"
  819. "paddusw %%mm1, %%mm0\n\t"
  820. "paddusw %%mm3, %%mm2\n\t"
  821. "movq %%mm4, %%mm1\n\t"
  822. "movq %%mm5, %%mm3\n\t"
  823. "punpcklbw %%mm7, %%mm4\n\t"
  824. "punpcklbw %%mm7, %%mm5\n\t"
  825. "punpckhbw %%mm7, %%mm1\n\t"
  826. "punpckhbw %%mm7, %%mm3\n\t"
  827. "paddusw %%mm5, %%mm4\n\t"
  828. "paddusw %%mm3, %%mm1\n\t"
  829. "paddusw %%mm6, %%mm4\n\t"
  830. "paddusw %%mm6, %%mm1\n\t"
  831. "paddusw %%mm4, %%mm0\n\t"
  832. "paddusw %%mm1, %%mm2\n\t"
  833. "movq %0, %%mm1\n\t"
  834. "psrlw $2, %%mm0\n\t"
  835. "movq %%mm1, %%mm3\n\t"
  836. "psrlw $2, %%mm2\n\t"
  837. "punpcklbw %%mm7, %%mm1\n\t"
  838. "punpckhbw %%mm7, %%mm3\n\t"
  839. "paddusw %%mm1, %%mm0\n\t"
  840. "paddusw %%mm3, %%mm2\n\t"
  841. "psrlw $1, %%mm0\n\t"
  842. "psrlw $1, %%mm2\n\t"
  843. "packuswb %%mm2, %%mm0\n\t"
  844. "movq %%mm0, %0\n\t"
  845. :"+m"(*p)
  846. :"m"(*pix),
  847. "m"(*(pix+line_size))
  848. :"memory");
  849. pix += line_size;
  850. p += line_size;
  851. } while(--h);
  852. }
  853. static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  854. {
  855. DCTELEM *p;
  856. const UINT8 *pix;
  857. p = block;
  858. pix = pixels;
  859. MOVQ_ZERO(mm7);
  860. do {
  861. __asm __volatile(
  862. "movq %0, %%mm0\n\t"
  863. "movq %1, %%mm2\n\t"
  864. "movq 8%0, %%mm1\n\t"
  865. "movq %%mm2, %%mm3\n\t"
  866. "punpcklbw %%mm7, %%mm2\n\t"
  867. "punpckhbw %%mm7, %%mm3\n\t"
  868. "psubsw %%mm2, %%mm0\n\t"
  869. "psubsw %%mm3, %%mm1\n\t"
  870. "movq %%mm0, %0\n\t"
  871. "movq %%mm1, 8%0\n\t"
  872. :"+m"(*p)
  873. :"m"(*pix)
  874. :"memory");
  875. pix += line_size;
  876. p += 8;
  877. } while (--h);
  878. }
  879. static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  880. {
  881. DCTELEM *p;
  882. const UINT8 *pix;
  883. p = block;
  884. pix = pixels;
  885. MOVQ_ZERO(mm7);
  886. MOVQ_WONE(mm6);
  887. JUMPALIGN();
  888. do {
  889. __asm __volatile(
  890. "movq %0, %%mm0\n\t"
  891. "movq %1, %%mm2\n\t"
  892. "movq 8%0, %%mm1\n\t"
  893. "movq 1%1, %%mm4\n\t"
  894. "movq %%mm2, %%mm3\n\t"
  895. "movq %%mm4, %%mm5\n\t"
  896. "punpcklbw %%mm7, %%mm2\n\t"
  897. "punpckhbw %%mm7, %%mm3\n\t"
  898. "punpcklbw %%mm7, %%mm4\n\t"
  899. "punpckhbw %%mm7, %%mm5\n\t"
  900. "paddusw %%mm4, %%mm2\n\t"
  901. "paddusw %%mm5, %%mm3\n\t"
  902. "paddusw %%mm6, %%mm2\n\t"
  903. "paddusw %%mm6, %%mm3\n\t"
  904. "psrlw $1, %%mm2\n\t"
  905. "psrlw $1, %%mm3\n\t"
  906. "psubsw %%mm2, %%mm0\n\t"
  907. "psubsw %%mm3, %%mm1\n\t"
  908. "movq %%mm0, %0\n\t"
  909. "movq %%mm1, 8%0\n\t"
  910. :"+m"(*p)
  911. :"m"(*pix)
  912. :"memory");
  913. pix += line_size;
  914. p += 8;
  915. } while (--h);
  916. }
  917. static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  918. {
  919. DCTELEM *p;
  920. const UINT8 *pix;
  921. p = block;
  922. pix = pixels;
  923. MOVQ_ZERO(mm7);
  924. MOVQ_WONE(mm6);
  925. do {
  926. __asm __volatile(
  927. "movq %0, %%mm0\n\t"
  928. "movq %1, %%mm2\n\t"
  929. "movq 8%0, %%mm1\n\t"
  930. "movq %2, %%mm4\n\t"
  931. "movq %%mm2, %%mm3\n\t"
  932. "movq %%mm4, %%mm5\n\t"
  933. "punpcklbw %%mm7, %%mm2\n\t"
  934. "punpckhbw %%mm7, %%mm3\n\t"
  935. "punpcklbw %%mm7, %%mm4\n\t"
  936. "punpckhbw %%mm7, %%mm5\n\t"
  937. "paddusw %%mm4, %%mm2\n\t"
  938. "paddusw %%mm5, %%mm3\n\t"
  939. "paddusw %%mm6, %%mm2\n\t"
  940. "paddusw %%mm6, %%mm3\n\t"
  941. "psrlw $1, %%mm2\n\t"
  942. "psrlw $1, %%mm3\n\t"
  943. "psubsw %%mm2, %%mm0\n\t"
  944. "psubsw %%mm3, %%mm1\n\t"
  945. "movq %%mm0, %0\n\t"
  946. "movq %%mm1, 8%0\n\t"
  947. :"+m"(*p)
  948. :"m"(*pix), "m"(*(pix+line_size))
  949. :"memory");
  950. pix += line_size;
  951. p += 8;
  952. } while (--h);
  953. }
  954. static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  955. {
  956. DCTELEM *p;
  957. const UINT8 *pix;
  958. p = block;
  959. pix = pixels;
  960. MOVQ_ZERO(mm7);
  961. MOVQ_WTWO(mm6);
  962. JUMPALIGN();
  963. do {
  964. __asm __volatile(
  965. "movq %1, %%mm0\n\t"
  966. "movq %2, %%mm1\n\t"
  967. "movq 1%1, %%mm4\n\t"
  968. "movq 1%2, %%mm5\n\t"
  969. "movq %%mm0, %%mm2\n\t"
  970. "movq %%mm1, %%mm3\n\t"
  971. "punpcklbw %%mm7, %%mm0\n\t"
  972. "punpcklbw %%mm7, %%mm1\n\t"
  973. "punpckhbw %%mm7, %%mm2\n\t"
  974. "punpckhbw %%mm7, %%mm3\n\t"
  975. "paddusw %%mm1, %%mm0\n\t"
  976. "paddusw %%mm3, %%mm2\n\t"
  977. "movq %%mm4, %%mm1\n\t"
  978. "movq %%mm5, %%mm3\n\t"
  979. "punpcklbw %%mm7, %%mm4\n\t"
  980. "punpcklbw %%mm7, %%mm5\n\t"
  981. "punpckhbw %%mm7, %%mm1\n\t"
  982. "punpckhbw %%mm7, %%mm3\n\t"
  983. "paddusw %%mm5, %%mm4\n\t"
  984. "paddusw %%mm3, %%mm1\n\t"
  985. "paddusw %%mm6, %%mm4\n\t"
  986. "paddusw %%mm6, %%mm1\n\t"
  987. "paddusw %%mm4, %%mm0\n\t"
  988. "paddusw %%mm1, %%mm2\n\t"
  989. "movq %0, %%mm1\n\t"
  990. "movq 8%0, %%mm3\n\t"
  991. "psrlw $2, %%mm0\n\t"
  992. "psrlw $2, %%mm2\n\t"
  993. "psubsw %%mm0, %%mm1\n\t"
  994. "psubsw %%mm2, %%mm3\n\t"
  995. "movq %%mm1, %0\n\t"
  996. "movq %%mm3, 8%0\n\t"
  997. :"+m"(*p)
  998. :"m"(*pix),
  999. "m"(*(pix+line_size))
  1000. :"memory");
  1001. pix += line_size;
  1002. p += 8 ;
  1003. } while(--h);
  1004. }
  1005. static void clear_blocks_mmx(DCTELEM *blocks)
  1006. {
  1007. asm volatile(
  1008. "pxor %%mm7, %%mm7 \n\t"
  1009. "movl $-128*6, %%eax \n\t"
  1010. "1: \n\t"
  1011. "movq %%mm7, (%0, %%eax) \n\t"
  1012. "movq %%mm7, 8(%0, %%eax) \n\t"
  1013. "movq %%mm7, 16(%0, %%eax) \n\t"
  1014. "movq %%mm7, 24(%0, %%eax) \n\t"
  1015. "addl $32, %%eax \n\t"
  1016. " js 1b \n\t"
  1017. : : "r" (((int)blocks)+128*6)
  1018. : "%eax"
  1019. );
  1020. }
  1021. static void just_return() { return; }
  1022. void dsputil_init_mmx(void)
  1023. {
  1024. mm_flags = mm_support();
  1025. #if 1
  1026. printf("libavcodec: CPU flags:");
  1027. if (mm_flags & MM_MMX)
  1028. printf(" mmx");
  1029. if (mm_flags & MM_MMXEXT)
  1030. printf(" mmxext");
  1031. if (mm_flags & MM_3DNOW)
  1032. printf(" 3dnow");
  1033. if (mm_flags & MM_SSE)
  1034. printf(" sse");
  1035. if (mm_flags & MM_SSE2)
  1036. printf(" sse2");
  1037. printf("\n");
  1038. #endif
  1039. if (mm_flags & MM_MMX) {
  1040. get_pixels = get_pixels_mmx;
  1041. diff_pixels = diff_pixels_mmx;
  1042. put_pixels_clamped = put_pixels_clamped_mmx;
  1043. add_pixels_clamped = add_pixels_clamped_mmx;
  1044. clear_blocks= clear_blocks_mmx;
  1045. pix_abs16x16 = pix_abs16x16_mmx;
  1046. pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
  1047. pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
  1048. pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
  1049. pix_abs8x8 = pix_abs8x8_mmx;
  1050. pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
  1051. pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
  1052. pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
  1053. av_fdct = fdct_mmx;
  1054. put_pixels_tab[0] = put_pixels_mmx;
  1055. put_pixels_tab[1] = put_pixels_x2_mmx;
  1056. put_pixels_tab[2] = put_pixels_y2_mmx;
  1057. put_pixels_tab[3] = put_pixels_xy2_mmx;
  1058. put_no_rnd_pixels_tab[0] = put_pixels_mmx;
  1059. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
  1060. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
  1061. put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
  1062. avg_pixels_tab[0] = avg_pixels_mmx;
  1063. avg_pixels_tab[1] = avg_pixels_x2_mmx;
  1064. avg_pixels_tab[2] = avg_pixels_y2_mmx;
  1065. avg_pixels_tab[3] = avg_pixels_xy2_mmx;
  1066. avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
  1067. avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
  1068. avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
  1069. avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
  1070. sub_pixels_tab[0] = sub_pixels_mmx;
  1071. sub_pixels_tab[1] = sub_pixels_x2_mmx;
  1072. sub_pixels_tab[2] = sub_pixels_y2_mmx;
  1073. sub_pixels_tab[3] = sub_pixels_xy2_mmx;
  1074. if (mm_flags & MM_MMXEXT) {
  1075. pix_abs16x16 = pix_abs16x16_mmx2;
  1076. pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
  1077. pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
  1078. pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
  1079. pix_abs8x8 = pix_abs8x8_mmx2;
  1080. pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
  1081. pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
  1082. pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
  1083. put_pixels_tab[1] = put_pixels_x2_sse;
  1084. put_pixels_tab[2] = put_pixels_y2_sse;
  1085. avg_pixels_tab[0] = avg_pixels_sse;
  1086. avg_pixels_tab[1] = avg_pixels_x2_sse;
  1087. avg_pixels_tab[2] = avg_pixels_y2_sse;
  1088. avg_pixels_tab[3] = avg_pixels_xy2_sse;
  1089. sub_pixels_tab[1] = sub_pixels_x2_sse;
  1090. sub_pixels_tab[2] = sub_pixels_y2_sse;
  1091. } else if (mm_flags & MM_3DNOW) {
  1092. put_pixels_tab[1] = put_pixels_x2_3dnow;
  1093. put_pixels_tab[2] = put_pixels_y2_3dnow;
  1094. avg_pixels_tab[0] = avg_pixels_3dnow;
  1095. avg_pixels_tab[1] = avg_pixels_x2_3dnow;
  1096. avg_pixels_tab[2] = avg_pixels_y2_3dnow;
  1097. avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
  1098. sub_pixels_tab[1] = sub_pixels_x2_3dnow;
  1099. sub_pixels_tab[2] = sub_pixels_y2_3dnow;
  1100. }
  1101. /* idct */
  1102. if (mm_flags & MM_MMXEXT) {
  1103. ff_idct = ff_mmxext_idct;
  1104. } else {
  1105. ff_idct = ff_mmx_idct;
  1106. }
  1107. #ifdef SIMPLE_IDCT
  1108. // ff_idct = simple_idct;
  1109. ff_idct = simple_idct_mmx;
  1110. #endif
  1111. }
  1112. #if 0
  1113. // for speed testing
  1114. get_pixels = just_return;
  1115. put_pixels_clamped = just_return;
  1116. add_pixels_clamped = just_return;
  1117. pix_abs16x16 = just_return;
  1118. pix_abs16x16_x2 = just_return;
  1119. pix_abs16x16_y2 = just_return;
  1120. pix_abs16x16_xy2 = just_return;
  1121. put_pixels_tab[0] = just_return;
  1122. put_pixels_tab[1] = just_return;
  1123. put_pixels_tab[2] = just_return;
  1124. put_pixels_tab[3] = just_return;
  1125. put_no_rnd_pixels_tab[0] = just_return;
  1126. put_no_rnd_pixels_tab[1] = just_return;
  1127. put_no_rnd_pixels_tab[2] = just_return;
  1128. put_no_rnd_pixels_tab[3] = just_return;
  1129. avg_pixels_tab[0] = just_return;
  1130. avg_pixels_tab[1] = just_return;
  1131. avg_pixels_tab[2] = just_return;
  1132. avg_pixels_tab[3] = just_return;
  1133. avg_no_rnd_pixels_tab[0] = just_return;
  1134. avg_no_rnd_pixels_tab[1] = just_return;
  1135. avg_no_rnd_pixels_tab[2] = just_return;
  1136. avg_no_rnd_pixels_tab[3] = just_return;
  1137. sub_pixels_tab[0] = just_return;
  1138. sub_pixels_tab[1] = just_return;
  1139. sub_pixels_tab[2] = just_return;
  1140. sub_pixels_tab[3] = just_return;
  1141. //av_fdct = just_return;
  1142. //ff_idct = just_return;
  1143. #endif
  1144. }