You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1192 lines
29KB

  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Gerard Lantau.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. #include "../dsputil.h"
  22. #include "../simple_idct.h"
/* CPU capability bits; filled in from mm_support() by dsputil_init_mmx(). */
int mm_flags; /* multimedia extension flags */

/*
 * Prototypes only: none of the pix_abs* routines below is defined in this
 * file. dsputil_init_mmx() assigns them to the pix_abs16x16* / pix_abs8x8*
 * function pointers, the _mmx2 variants taking precedence when MM_MMXEXT is
 * reported.
 */
int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
/* external functions, from idct_mmx.c; one of them is installed as ff_idct */
void ff_mmx_idct(DCTELEM *block);
void ff_mmxext_idct(DCTELEM *block);
  43. /* pixel operations */
  44. static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001LL;
  45. static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002LL;
  46. //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
  47. //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
  48. #define JUMPALIGN() __asm __volatile (".balign 8"::)
  49. #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
  50. #ifndef PIC
  51. #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
  52. #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
  53. #else
  54. // for shared library it's better to use this way for accessing constants
  55. // pcmpeqd -> -1
  56. #define MOVQ_WONE(regd) \
  57. __asm __volatile ( \
  58. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  59. "psrlw $15, %%" #regd ::)
  60. #define MOVQ_WTWO(regd) \
  61. __asm __volatile ( \
  62. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  63. "psrlw $15, %%" #regd " \n\t" \
  64. "psllw $1, %%" #regd ::)
  65. #endif
/***********************************/
/* 3Dnow specific */
/*
 * dsputil_mmx_avg.h is included twice, each time with a different DEF()
 * suffix and PAVGB mnemonic. The resulting *_3dnow and *_sse functions
 * (e.g. put_pixels_x2_3dnow, avg_pixels_sse) are the ones installed by
 * dsputil_init_mmx(). DEF and PAVGB are undefined again after each pass.
 */
#define DEF(x) x ## _3dnow
/* for Athlons PAVGUSB is preferred */
#define PAVGB "pavgusb"
#include "dsputil_mmx_avg.h"
#undef DEF
#undef PAVGB
/***********************************/
/* MMX2 specific */
#define DEF(x) x ## _sse
/* Introduced only in MMX2 set */
#define PAVGB "pavgb"
#include "dsputil_mmx_avg.h"
#undef DEF
#undef PAVGB
/***********************************/
/* standard MMX */
  84. static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
  85. {
  86. DCTELEM *p;
  87. const UINT8 *pix;
  88. int i;
  89. /* read the pixels */
  90. p = block;
  91. pix = pixels;
  92. MOVQ_ZERO(mm7);
  93. for(i=0;i<4;i++) {
  94. __asm __volatile(
  95. "movq %1, %%mm0\n\t"
  96. "movq %2, %%mm1\n\t"
  97. "movq %%mm0, %%mm2\n\t"
  98. "movq %%mm1, %%mm3\n\t"
  99. "punpcklbw %%mm7, %%mm0\n\t"
  100. "punpckhbw %%mm7, %%mm2\n\t"
  101. "punpcklbw %%mm7, %%mm1\n\t"
  102. "punpckhbw %%mm7, %%mm3\n\t"
  103. "movq %%mm0, %0\n\t"
  104. "movq %%mm2, 8%0\n\t"
  105. "movq %%mm1, 16%0\n\t"
  106. "movq %%mm3, 24%0\n\t"
  107. :"=m"(*p)
  108. :"m"(*pix), "m"(*(pix+line_size))
  109. :"memory");
  110. pix += line_size*2;
  111. p += 16;
  112. }
  113. }
  114. static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  115. {
  116. const DCTELEM *p;
  117. UINT8 *pix;
  118. /* read the pixels */
  119. p = block;
  120. pix = pixels;
  121. /* unrolled loop */
  122. __asm __volatile(
  123. "movq %3, %%mm0\n\t"
  124. "movq 8%3, %%mm1\n\t"
  125. "movq 16%3, %%mm2\n\t"
  126. "movq 24%3, %%mm3\n\t"
  127. "movq 32%3, %%mm4\n\t"
  128. "movq 40%3, %%mm5\n\t"
  129. "movq 48%3, %%mm6\n\t"
  130. "movq 56%3, %%mm7\n\t"
  131. "packuswb %%mm1, %%mm0\n\t"
  132. "packuswb %%mm3, %%mm2\n\t"
  133. "packuswb %%mm5, %%mm4\n\t"
  134. "packuswb %%mm7, %%mm6\n\t"
  135. "movq %%mm0, (%0)\n\t"
  136. "movq %%mm2, (%0, %1)\n\t"
  137. "movq %%mm4, (%0, %1, 2)\n\t"
  138. "movq %%mm6, (%0, %2)\n\t"
  139. ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
  140. :"memory");
  141. pix += line_size*4;
  142. p += 32;
  143. // if here would be an exact copy of the code above
  144. // compiler would generate some very strange code
  145. // thus using "r"
  146. __asm __volatile(
  147. "movq (%3), %%mm0\n\t"
  148. "movq 8(%3), %%mm1\n\t"
  149. "movq 16(%3), %%mm2\n\t"
  150. "movq 24(%3), %%mm3\n\t"
  151. "movq 32(%3), %%mm4\n\t"
  152. "movq 40(%3), %%mm5\n\t"
  153. "movq 48(%3), %%mm6\n\t"
  154. "movq 56(%3), %%mm7\n\t"
  155. "packuswb %%mm1, %%mm0\n\t"
  156. "packuswb %%mm3, %%mm2\n\t"
  157. "packuswb %%mm5, %%mm4\n\t"
  158. "packuswb %%mm7, %%mm6\n\t"
  159. "movq %%mm0, (%0)\n\t"
  160. "movq %%mm2, (%0, %1)\n\t"
  161. "movq %%mm4, (%0, %1, 2)\n\t"
  162. "movq %%mm6, (%0, %2)\n\t"
  163. ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
  164. :"memory");
  165. }
  166. static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  167. {
  168. const DCTELEM *p;
  169. UINT8 *pix;
  170. int i;
  171. /* read the pixels */
  172. p = block;
  173. pix = pixels;
  174. MOVQ_ZERO(mm7);
  175. i = 4;
  176. while (i) {
  177. __asm __volatile(
  178. "movq %2, %%mm0\n\t"
  179. "movq 8%2, %%mm1\n\t"
  180. "movq 16%2, %%mm2\n\t"
  181. "movq 24%2, %%mm3\n\t"
  182. "movq %0, %%mm4\n\t"
  183. "movq %1, %%mm6\n\t"
  184. "movq %%mm4, %%mm5\n\t"
  185. "punpcklbw %%mm7, %%mm4\n\t"
  186. "punpckhbw %%mm7, %%mm5\n\t"
  187. "paddsw %%mm4, %%mm0\n\t"
  188. "paddsw %%mm5, %%mm1\n\t"
  189. "movq %%mm6, %%mm5\n\t"
  190. "punpcklbw %%mm7, %%mm6\n\t"
  191. "punpckhbw %%mm7, %%mm5\n\t"
  192. "paddsw %%mm6, %%mm2\n\t"
  193. "paddsw %%mm5, %%mm3\n\t"
  194. "packuswb %%mm1, %%mm0\n\t"
  195. "packuswb %%mm3, %%mm2\n\t"
  196. "movq %%mm0, %0\n\t"
  197. "movq %%mm2, %1\n\t"
  198. :"+m"(*pix), "+m"(*(pix+line_size))
  199. :"m"(*p)
  200. :"memory");
  201. pix += line_size*2;
  202. p += 16;
  203. i--;
  204. };
  205. }
  206. static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  207. {
  208. int hh;
  209. UINT8 *p;
  210. const UINT8 *pix;
  211. p = block;
  212. pix = pixels; // 2s
  213. #if 0
  214. do {
  215. __asm __volatile(
  216. "movq %1, %%mm0\n\t"
  217. "movq %%mm0, %0\n\t"
  218. :"=m"(*p)
  219. :"m"(*pix)
  220. :"memory");
  221. pix += line_size;
  222. p += line_size;
  223. } while (--h);
  224. #else
  225. // this optimized code is not very usefull
  226. // the above loop is definitely faster
  227. // at least on Celeron 500MHz
  228. hh = h & 3;
  229. while (hh) {
  230. __asm __volatile(
  231. "movq %1, %%mm0\n\t"
  232. "movq %%mm0, %0\n\t"
  233. :"=m"(*p)
  234. :"m"(*pix)
  235. :"memory");
  236. pix += line_size;
  237. p += line_size;
  238. hh--;
  239. }
  240. hh=h>>2;
  241. while (hh) {
  242. __asm __volatile(
  243. "movq (%1), %%mm0 \n\t"
  244. "movq (%1, %2), %%mm1 \n\t"
  245. "movq (%1, %2, 2), %%mm2 \n\t"
  246. "movq (%1, %3), %%mm3 \n\t"
  247. "movq %%mm0, (%0) \n\t"
  248. "movq %%mm1, (%0, %2) \n\t"
  249. "movq %%mm2, (%0, %2, 2) \n\t"
  250. "movq %%mm3, (%0, %3) \n\t"
  251. ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3)
  252. :"memory");
  253. pix += line_size*4;
  254. p += line_size*4;
  255. hh--;
  256. }
  257. #endif
  258. }
  259. static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  260. {
  261. UINT8 *p;
  262. const UINT8 *pix;
  263. p = block;
  264. pix = pixels;
  265. MOVQ_ZERO(mm7);
  266. MOVQ_WONE(mm4);
  267. JUMPALIGN();
  268. do {
  269. __asm __volatile(
  270. "movq %1, %%mm0\n\t"
  271. "movq 1%1, %%mm1\n\t"
  272. "movq %%mm0, %%mm2\n\t"
  273. "movq %%mm1, %%mm3\n\t"
  274. "punpcklbw %%mm7, %%mm0\n\t"
  275. "punpcklbw %%mm7, %%mm1\n\t"
  276. "punpckhbw %%mm7, %%mm2\n\t"
  277. "punpckhbw %%mm7, %%mm3\n\t"
  278. "paddusw %%mm1, %%mm0\n\t"
  279. "paddusw %%mm3, %%mm2\n\t"
  280. "paddusw %%mm4, %%mm0\n\t"
  281. "paddusw %%mm4, %%mm2\n\t"
  282. "psrlw $1, %%mm0\n\t"
  283. "psrlw $1, %%mm2\n\t"
  284. "packuswb %%mm2, %%mm0\n\t"
  285. "movq %%mm0, %0\n\t"
  286. :"=m"(*p)
  287. :"m"(*pix)
  288. :"memory");
  289. pix += line_size; p += line_size;
  290. } while (--h);
  291. }
  292. static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  293. {
  294. UINT8 *p;
  295. const UINT8 *pix;
  296. p = block;
  297. pix = pixels;
  298. MOVQ_ZERO(mm7);
  299. MOVQ_WONE(mm4);
  300. JUMPALIGN();
  301. do {
  302. __asm __volatile(
  303. "movq %1, %%mm0\n\t"
  304. "movq %2, %%mm1\n\t"
  305. "movq %%mm0, %%mm2\n\t"
  306. "movq %%mm1, %%mm3\n\t"
  307. "punpcklbw %%mm7, %%mm0\n\t"
  308. "punpcklbw %%mm7, %%mm1\n\t"
  309. "punpckhbw %%mm7, %%mm2\n\t"
  310. "punpckhbw %%mm7, %%mm3\n\t"
  311. "paddusw %%mm1, %%mm0\n\t"
  312. "paddusw %%mm3, %%mm2\n\t"
  313. "paddusw %%mm4, %%mm0\n\t"
  314. "paddusw %%mm4, %%mm2\n\t"
  315. "psrlw $1, %%mm0\n\t"
  316. "psrlw $1, %%mm2\n\t"
  317. "packuswb %%mm2, %%mm0\n\t"
  318. "movq %%mm0, %0\n\t"
  319. :"=m"(*p)
  320. :"m"(*pix),
  321. "m"(*(pix+line_size))
  322. :"memory");
  323. pix += line_size;
  324. p += line_size;
  325. } while (--h);
  326. }
  327. static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  328. {
  329. UINT8 *p;
  330. const UINT8 *pix;
  331. p = block;
  332. pix = pixels; // 1s
  333. MOVQ_ZERO(mm7);
  334. MOVQ_WTWO(mm6);
  335. JUMPALIGN();
  336. do {
  337. __asm __volatile(
  338. "movq %1, %%mm0\n\t"
  339. "movq %2, %%mm1\n\t"
  340. "movq 1%1, %%mm4\n\t"
  341. "movq 1%2, %%mm5\n\t"
  342. "movq %%mm0, %%mm2\n\t"
  343. "movq %%mm1, %%mm3\n\t"
  344. "punpcklbw %%mm7, %%mm0\n\t"
  345. "punpcklbw %%mm7, %%mm1\n\t"
  346. "punpckhbw %%mm7, %%mm2\n\t"
  347. "punpckhbw %%mm7, %%mm3\n\t"
  348. "paddusw %%mm1, %%mm0\n\t"
  349. "paddusw %%mm3, %%mm2\n\t"
  350. "movq %%mm4, %%mm1\n\t"
  351. "movq %%mm5, %%mm3\n\t"
  352. "punpcklbw %%mm7, %%mm4\n\t"
  353. "punpcklbw %%mm7, %%mm5\n\t"
  354. "punpckhbw %%mm7, %%mm1\n\t"
  355. "punpckhbw %%mm7, %%mm3\n\t"
  356. "paddusw %%mm5, %%mm4\n\t"
  357. "paddusw %%mm3, %%mm1\n\t"
  358. "paddusw %%mm6, %%mm4\n\t"
  359. "paddusw %%mm6, %%mm1\n\t"
  360. "paddusw %%mm4, %%mm0\n\t"
  361. "paddusw %%mm1, %%mm2\n\t"
  362. "psrlw $2, %%mm0\n\t"
  363. "psrlw $2, %%mm2\n\t"
  364. "packuswb %%mm2, %%mm0\n\t"
  365. "movq %%mm0, %0\n\t"
  366. :"=m"(*p)
  367. :"m"(*pix),
  368. "m"(*(pix+line_size))
  369. :"memory");
  370. pix += line_size;
  371. p += line_size;
  372. } while(--h);
  373. }
  374. static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  375. {
  376. UINT8 *p;
  377. const UINT8 *pix;
  378. p = block;
  379. pix = pixels;
  380. MOVQ_ZERO(mm7);
  381. do {
  382. __asm __volatile(
  383. "movq %1, %%mm0\n\t"
  384. "movq 1%1, %%mm1\n\t"
  385. "movq %%mm0, %%mm2\n\t"
  386. "movq %%mm1, %%mm3\n\t"
  387. "punpcklbw %%mm7, %%mm0\n\t"
  388. "punpcklbw %%mm7, %%mm1\n\t"
  389. "punpckhbw %%mm7, %%mm2\n\t"
  390. "punpckhbw %%mm7, %%mm3\n\t"
  391. "paddusw %%mm1, %%mm0\n\t"
  392. "paddusw %%mm3, %%mm2\n\t"
  393. "psrlw $1, %%mm0\n\t"
  394. "psrlw $1, %%mm2\n\t"
  395. "packuswb %%mm2, %%mm0\n\t"
  396. "movq %%mm0, %0\n\t"
  397. :"=m"(*p)
  398. :"m"(*pix)
  399. :"memory");
  400. pix += line_size;
  401. p += line_size;
  402. } while (--h);
  403. }
  404. static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  405. {
  406. UINT8 *p;
  407. const UINT8 *pix;
  408. p = block;
  409. pix = pixels;
  410. MOVQ_ZERO(mm7);
  411. JUMPALIGN();
  412. do {
  413. __asm __volatile(
  414. "movq %1, %%mm0\n\t"
  415. "movq %2, %%mm1\n\t"
  416. "movq %%mm0, %%mm2\n\t"
  417. "movq %%mm1, %%mm3\n\t"
  418. "punpcklbw %%mm7, %%mm0\n\t"
  419. "punpcklbw %%mm7, %%mm1\n\t"
  420. "punpckhbw %%mm7, %%mm2\n\t"
  421. "punpckhbw %%mm7, %%mm3\n\t"
  422. "paddusw %%mm1, %%mm0\n\t"
  423. "paddusw %%mm3, %%mm2\n\t"
  424. "psrlw $1, %%mm0\n\t"
  425. "psrlw $1, %%mm2\n\t"
  426. "packuswb %%mm2, %%mm0\n\t"
  427. "movq %%mm0, %0\n\t"
  428. :"=m"(*p)
  429. :"m"(*pix),
  430. "m"(*(pix+line_size))
  431. :"memory");
  432. pix += line_size;
  433. p += line_size;
  434. } while(--h);
  435. }
  436. static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  437. {
  438. UINT8 *p;
  439. const UINT8 *pix;
  440. p = block;
  441. pix = pixels;
  442. MOVQ_ZERO(mm7);
  443. MOVQ_WONE(mm6);
  444. JUMPALIGN();
  445. do {
  446. __asm __volatile(
  447. "movq %1, %%mm0\n\t"
  448. "movq %2, %%mm1\n\t"
  449. "movq 1%1, %%mm4\n\t"
  450. "movq 1%2, %%mm5\n\t"
  451. "movq %%mm0, %%mm2\n\t"
  452. "movq %%mm1, %%mm3\n\t"
  453. "punpcklbw %%mm7, %%mm0\n\t"
  454. "punpcklbw %%mm7, %%mm1\n\t"
  455. "punpckhbw %%mm7, %%mm2\n\t"
  456. "punpckhbw %%mm7, %%mm3\n\t"
  457. "paddusw %%mm1, %%mm0\n\t"
  458. "paddusw %%mm3, %%mm2\n\t"
  459. "movq %%mm4, %%mm1\n\t"
  460. "movq %%mm5, %%mm3\n\t"
  461. "punpcklbw %%mm7, %%mm4\n\t"
  462. "punpcklbw %%mm7, %%mm5\n\t"
  463. "punpckhbw %%mm7, %%mm1\n\t"
  464. "punpckhbw %%mm7, %%mm3\n\t"
  465. "paddusw %%mm5, %%mm4\n\t"
  466. "paddusw %%mm3, %%mm1\n\t"
  467. "paddusw %%mm6, %%mm4\n\t"
  468. "paddusw %%mm6, %%mm1\n\t"
  469. "paddusw %%mm4, %%mm0\n\t"
  470. "paddusw %%mm1, %%mm2\n\t"
  471. "psrlw $2, %%mm0\n\t"
  472. "psrlw $2, %%mm2\n\t"
  473. "packuswb %%mm2, %%mm0\n\t"
  474. "movq %%mm0, %0\n\t"
  475. :"=m"(*p)
  476. :"m"(*pix),
  477. "m"(*(pix+line_size))
  478. :"memory");
  479. pix += line_size;
  480. p += line_size;
  481. } while(--h);
  482. }
  483. static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  484. {
  485. UINT8 *p;
  486. const UINT8 *pix;
  487. p = block;
  488. pix = pixels;
  489. MOVQ_ZERO(mm7);
  490. MOVQ_WONE(mm6);
  491. JUMPALIGN();
  492. do {
  493. __asm __volatile(
  494. "movq %0, %%mm0\n\t"
  495. "movq %1, %%mm1\n\t"
  496. "movq %%mm0, %%mm2\n\t"
  497. "movq %%mm1, %%mm3\n\t"
  498. "punpcklbw %%mm7, %%mm0\n\t"
  499. "punpcklbw %%mm7, %%mm1\n\t"
  500. "punpckhbw %%mm7, %%mm2\n\t"
  501. "punpckhbw %%mm7, %%mm3\n\t"
  502. "paddusw %%mm1, %%mm0\n\t"
  503. "paddusw %%mm3, %%mm2\n\t"
  504. "paddusw %%mm6, %%mm0\n\t"
  505. "paddusw %%mm6, %%mm2\n\t"
  506. "psrlw $1, %%mm0\n\t"
  507. "psrlw $1, %%mm2\n\t"
  508. "packuswb %%mm2, %%mm0\n\t"
  509. "movq %%mm0, %0\n\t"
  510. :"+m"(*p)
  511. :"m"(*pix)
  512. :"memory");
  513. pix += line_size;
  514. p += line_size;
  515. }
  516. while (--h);
  517. }
  518. static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  519. {
  520. UINT8 *p;
  521. const UINT8 *pix;
  522. p = block;
  523. pix = pixels;
  524. MOVQ_ZERO(mm7);
  525. MOVQ_WONE(mm6);
  526. JUMPALIGN();
  527. do {
  528. __asm __volatile(
  529. "movq %1, %%mm1\n\t"
  530. "movq %0, %%mm0\n\t"
  531. "movq 1%1, %%mm4\n\t"
  532. "movq %%mm0, %%mm2\n\t"
  533. "movq %%mm1, %%mm3\n\t"
  534. "movq %%mm4, %%mm5\n\t"
  535. "punpcklbw %%mm7, %%mm1\n\t"
  536. "punpckhbw %%mm7, %%mm3\n\t"
  537. "punpcklbw %%mm7, %%mm4\n\t"
  538. "punpckhbw %%mm7, %%mm5\n\t"
  539. "punpcklbw %%mm7, %%mm0\n\t"
  540. "punpckhbw %%mm7, %%mm2\n\t"
  541. "paddusw %%mm4, %%mm1\n\t"
  542. "paddusw %%mm5, %%mm3\n\t"
  543. "paddusw %%mm6, %%mm1\n\t"
  544. "paddusw %%mm6, %%mm3\n\t"
  545. "psrlw $1, %%mm1\n\t"
  546. "psrlw $1, %%mm3\n\t"
  547. "paddusw %%mm6, %%mm0\n\t"
  548. "paddusw %%mm6, %%mm2\n\t"
  549. "paddusw %%mm1, %%mm0\n\t"
  550. "paddusw %%mm3, %%mm2\n\t"
  551. "psrlw $1, %%mm0\n\t"
  552. "psrlw $1, %%mm2\n\t"
  553. "packuswb %%mm2, %%mm0\n\t"
  554. "movq %%mm0, %0\n\t"
  555. :"+m"(*p)
  556. :"m"(*pix)
  557. :"memory");
  558. pix += line_size;
  559. p += line_size;
  560. } while (--h);
  561. }
  562. static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  563. {
  564. UINT8 *p;
  565. const UINT8 *pix;
  566. p = block;
  567. pix = pixels;
  568. MOVQ_ZERO(mm7);
  569. MOVQ_WONE(mm6);
  570. JUMPALIGN();
  571. do {
  572. __asm __volatile(
  573. "movq %1, %%mm1\n\t"
  574. "movq %0, %%mm0\n\t"
  575. "movq %2, %%mm4\n\t"
  576. "movq %%mm0, %%mm2\n\t"
  577. "movq %%mm1, %%mm3\n\t"
  578. "movq %%mm4, %%mm5\n\t"
  579. "punpcklbw %%mm7, %%mm1\n\t"
  580. "punpckhbw %%mm7, %%mm3\n\t"
  581. "punpcklbw %%mm7, %%mm4\n\t"
  582. "punpckhbw %%mm7, %%mm5\n\t"
  583. "punpcklbw %%mm7, %%mm0\n\t"
  584. "punpckhbw %%mm7, %%mm2\n\t"
  585. "paddusw %%mm4, %%mm1\n\t"
  586. "paddusw %%mm5, %%mm3\n\t"
  587. "paddusw %%mm6, %%mm1\n\t"
  588. "paddusw %%mm6, %%mm3\n\t"
  589. "psrlw $1, %%mm1\n\t"
  590. "psrlw $1, %%mm3\n\t"
  591. "paddusw %%mm6, %%mm0\n\t"
  592. "paddusw %%mm6, %%mm2\n\t"
  593. "paddusw %%mm1, %%mm0\n\t"
  594. "paddusw %%mm3, %%mm2\n\t"
  595. "psrlw $1, %%mm0\n\t"
  596. "psrlw $1, %%mm2\n\t"
  597. "packuswb %%mm2, %%mm0\n\t"
  598. "movq %%mm0, %0\n\t"
  599. :"+m"(*p)
  600. :"m"(*pix), "m"(*(pix+line_size))
  601. :"memory");
  602. pix += line_size;
  603. p += line_size ;
  604. } while(--h);
  605. }
  606. static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  607. {
  608. UINT8 *p;
  609. const UINT8 *pix;
  610. p = block;
  611. pix = pixels;
  612. MOVQ_ZERO(mm7);
  613. // this doesn't seem to be used offten - so
  614. // the inside usage of mm_wone is not optimized
  615. MOVQ_WTWO(mm6);
  616. do {
  617. __asm __volatile(
  618. "movq %1, %%mm0\n\t"
  619. "movq %2, %%mm1\n\t"
  620. "movq 1%1, %%mm4\n\t"
  621. "movq 1%2, %%mm5\n\t"
  622. "movq %%mm0, %%mm2\n\t"
  623. "movq %%mm1, %%mm3\n\t"
  624. "punpcklbw %%mm7, %%mm0\n\t"
  625. "punpcklbw %%mm7, %%mm1\n\t"
  626. "punpckhbw %%mm7, %%mm2\n\t"
  627. "punpckhbw %%mm7, %%mm3\n\t"
  628. "paddusw %%mm1, %%mm0\n\t"
  629. "paddusw %%mm3, %%mm2\n\t"
  630. "movq %%mm4, %%mm1\n\t"
  631. "movq %%mm5, %%mm3\n\t"
  632. "punpcklbw %%mm7, %%mm4\n\t"
  633. "punpcklbw %%mm7, %%mm5\n\t"
  634. "punpckhbw %%mm7, %%mm1\n\t"
  635. "punpckhbw %%mm7, %%mm3\n\t"
  636. "paddusw %%mm5, %%mm4\n\t"
  637. "paddusw %%mm3, %%mm1\n\t"
  638. "paddusw %%mm6, %%mm4\n\t"
  639. "paddusw %%mm6, %%mm1\n\t"
  640. "paddusw %%mm4, %%mm0\n\t"
  641. "paddusw %%mm1, %%mm2\n\t"
  642. "movq %3, %%mm5\n\t"
  643. "psrlw $2, %%mm0\n\t"
  644. "movq %0, %%mm1\n\t"
  645. "psrlw $2, %%mm2\n\t"
  646. "movq %%mm1, %%mm3\n\t"
  647. "punpcklbw %%mm7, %%mm1\n\t"
  648. "punpckhbw %%mm7, %%mm3\n\t"
  649. "paddusw %%mm1, %%mm0\n\t"
  650. "paddusw %%mm3, %%mm2\n\t"
  651. "paddusw %%mm5, %%mm0\n\t"
  652. "paddusw %%mm5, %%mm2\n\t"
  653. "psrlw $1, %%mm0\n\t"
  654. "psrlw $1, %%mm2\n\t"
  655. "packuswb %%mm2, %%mm0\n\t"
  656. "movq %%mm0, %0\n\t"
  657. :"+m"(*p)
  658. :"m"(*pix),
  659. "m"(*(pix+line_size)), "m"(mm_wone)
  660. :"memory");
  661. pix += line_size;
  662. p += line_size ;
  663. } while(--h);
  664. }
  665. static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  666. {
  667. UINT8 *p;
  668. const UINT8 *pix;
  669. p = block;
  670. pix = pixels;
  671. MOVQ_ZERO(mm7);
  672. do {
  673. __asm __volatile(
  674. "movq %1, %%mm0\n\t"
  675. "movq %0, %%mm1\n\t"
  676. "movq %%mm0, %%mm2\n\t"
  677. "movq %%mm1, %%mm3\n\t"
  678. "punpcklbw %%mm7, %%mm0\n\t"
  679. "punpcklbw %%mm7, %%mm1\n\t"
  680. "punpckhbw %%mm7, %%mm2\n\t"
  681. "punpckhbw %%mm7, %%mm3\n\t"
  682. "paddusw %%mm1, %%mm0\n\t"
  683. "paddusw %%mm3, %%mm2\n\t"
  684. "psrlw $1, %%mm0\n\t"
  685. "psrlw $1, %%mm2\n\t"
  686. "packuswb %%mm2, %%mm0\n\t"
  687. "movq %%mm0, %0\n\t"
  688. :"+m"(*p)
  689. :"m"(*pix)
  690. :"memory");
  691. pix += line_size;
  692. p += line_size ;
  693. } while (--h);
  694. }
  695. static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  696. {
  697. UINT8 *p;
  698. const UINT8 *pix;
  699. p = block;
  700. pix = pixels;
  701. MOVQ_ZERO(mm7);
  702. do {
  703. __asm __volatile(
  704. "movq %1, %%mm0\n\t"
  705. "movq 1%1, %%mm1\n\t"
  706. "movq %0, %%mm4\n\t"
  707. "movq %%mm0, %%mm2\n\t"
  708. "movq %%mm1, %%mm3\n\t"
  709. "movq %%mm4, %%mm5\n\t"
  710. "punpcklbw %%mm7, %%mm0\n\t"
  711. "punpcklbw %%mm7, %%mm1\n\t"
  712. "punpckhbw %%mm7, %%mm2\n\t"
  713. "punpckhbw %%mm7, %%mm3\n\t"
  714. "punpcklbw %%mm7, %%mm4\n\t"
  715. "punpckhbw %%mm7, %%mm5\n\t"
  716. "paddusw %%mm1, %%mm0\n\t"
  717. "paddusw %%mm3, %%mm2\n\t"
  718. "psrlw $1, %%mm0\n\t"
  719. "psrlw $1, %%mm2\n\t"
  720. "paddusw %%mm4, %%mm0\n\t"
  721. "paddusw %%mm5, %%mm2\n\t"
  722. "psrlw $1, %%mm0\n\t"
  723. "psrlw $1, %%mm2\n\t"
  724. "packuswb %%mm2, %%mm0\n\t"
  725. "movq %%mm0, %0\n\t"
  726. :"+m"(*p)
  727. :"m"(*pix)
  728. :"memory");
  729. pix += line_size;
  730. p += line_size;
  731. } while (--h);
  732. }
  733. static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  734. {
  735. UINT8 *p;
  736. const UINT8 *pix;
  737. p = block;
  738. pix = pixels;
  739. MOVQ_ZERO(mm7);
  740. do {
  741. __asm __volatile(
  742. "movq %1, %%mm0\n\t"
  743. "movq %2, %%mm1\n\t"
  744. "movq %0, %%mm4\n\t"
  745. "movq %%mm0, %%mm2\n\t"
  746. "movq %%mm1, %%mm3\n\t"
  747. "movq %%mm4, %%mm5\n\t"
  748. "punpcklbw %%mm7, %%mm0\n\t"
  749. "punpcklbw %%mm7, %%mm1\n\t"
  750. "punpckhbw %%mm7, %%mm2\n\t"
  751. "punpckhbw %%mm7, %%mm3\n\t"
  752. "punpcklbw %%mm7, %%mm4\n\t"
  753. "punpckhbw %%mm7, %%mm5\n\t"
  754. "paddusw %%mm1, %%mm0\n\t"
  755. "paddusw %%mm3, %%mm2\n\t"
  756. "psrlw $1, %%mm0\n\t"
  757. "psrlw $1, %%mm2\n\t"
  758. "paddusw %%mm4, %%mm0\n\t"
  759. "paddusw %%mm5, %%mm2\n\t"
  760. "psrlw $1, %%mm0\n\t"
  761. "psrlw $1, %%mm2\n\t"
  762. "packuswb %%mm2, %%mm0\n\t"
  763. "movq %%mm0, %0\n\t"
  764. :"+m"(*p)
  765. :"m"(*pix), "m"(*(pix+line_size))
  766. :"memory");
  767. pix += line_size;
  768. p += line_size ;
  769. } while(--h);
  770. }
  771. static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  772. {
  773. UINT8 *p;
  774. const UINT8 *pix;
  775. p = block;
  776. pix = pixels;
  777. MOVQ_ZERO(mm7);
  778. MOVQ_WONE(mm6);
  779. JUMPALIGN();
  780. do {
  781. __asm __volatile(
  782. "movq %1, %%mm0\n\t"
  783. "movq %2, %%mm1\n\t"
  784. "movq 1%1, %%mm4\n\t"
  785. "movq 1%2, %%mm5\n\t"
  786. "movq %%mm0, %%mm2\n\t"
  787. "movq %%mm1, %%mm3\n\t"
  788. "punpcklbw %%mm7, %%mm0\n\t"
  789. "punpcklbw %%mm7, %%mm1\n\t"
  790. "punpckhbw %%mm7, %%mm2\n\t"
  791. "punpckhbw %%mm7, %%mm3\n\t"
  792. "paddusw %%mm1, %%mm0\n\t"
  793. "paddusw %%mm3, %%mm2\n\t"
  794. "movq %%mm4, %%mm1\n\t"
  795. "movq %%mm5, %%mm3\n\t"
  796. "punpcklbw %%mm7, %%mm4\n\t"
  797. "punpcklbw %%mm7, %%mm5\n\t"
  798. "punpckhbw %%mm7, %%mm1\n\t"
  799. "punpckhbw %%mm7, %%mm3\n\t"
  800. "paddusw %%mm5, %%mm4\n\t"
  801. "paddusw %%mm3, %%mm1\n\t"
  802. "paddusw %%mm6, %%mm4\n\t"
  803. "paddusw %%mm6, %%mm1\n\t"
  804. "paddusw %%mm4, %%mm0\n\t"
  805. "paddusw %%mm1, %%mm2\n\t"
  806. "movq %0, %%mm1\n\t"
  807. "psrlw $2, %%mm0\n\t"
  808. "movq %%mm1, %%mm3\n\t"
  809. "psrlw $2, %%mm2\n\t"
  810. "punpcklbw %%mm7, %%mm1\n\t"
  811. "punpckhbw %%mm7, %%mm3\n\t"
  812. "paddusw %%mm1, %%mm0\n\t"
  813. "paddusw %%mm3, %%mm2\n\t"
  814. "psrlw $1, %%mm0\n\t"
  815. "psrlw $1, %%mm2\n\t"
  816. "packuswb %%mm2, %%mm0\n\t"
  817. "movq %%mm0, %0\n\t"
  818. :"+m"(*p)
  819. :"m"(*pix),
  820. "m"(*(pix+line_size))
  821. :"memory");
  822. pix += line_size;
  823. p += line_size;
  824. } while(--h);
  825. }
  826. static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  827. {
  828. DCTELEM *p;
  829. const UINT8 *pix;
  830. p = block;
  831. pix = pixels;
  832. MOVQ_ZERO(mm7);
  833. do {
  834. __asm __volatile(
  835. "movq %0, %%mm0\n\t"
  836. "movq %1, %%mm2\n\t"
  837. "movq 8%0, %%mm1\n\t"
  838. "movq %%mm2, %%mm3\n\t"
  839. "punpcklbw %%mm7, %%mm2\n\t"
  840. "punpckhbw %%mm7, %%mm3\n\t"
  841. "psubsw %%mm2, %%mm0\n\t"
  842. "psubsw %%mm3, %%mm1\n\t"
  843. "movq %%mm0, %0\n\t"
  844. "movq %%mm1, 8%0\n\t"
  845. :"+m"(*p)
  846. :"m"(*pix)
  847. :"memory");
  848. pix += line_size;
  849. p += 8;
  850. } while (--h);
  851. }
  852. static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  853. {
  854. DCTELEM *p;
  855. const UINT8 *pix;
  856. p = block;
  857. pix = pixels;
  858. MOVQ_ZERO(mm7);
  859. MOVQ_WONE(mm6);
  860. JUMPALIGN();
  861. do {
  862. __asm __volatile(
  863. "movq %0, %%mm0\n\t"
  864. "movq %1, %%mm2\n\t"
  865. "movq 8%0, %%mm1\n\t"
  866. "movq 1%1, %%mm4\n\t"
  867. "movq %%mm2, %%mm3\n\t"
  868. "movq %%mm4, %%mm5\n\t"
  869. "punpcklbw %%mm7, %%mm2\n\t"
  870. "punpckhbw %%mm7, %%mm3\n\t"
  871. "punpcklbw %%mm7, %%mm4\n\t"
  872. "punpckhbw %%mm7, %%mm5\n\t"
  873. "paddusw %%mm4, %%mm2\n\t"
  874. "paddusw %%mm5, %%mm3\n\t"
  875. "paddusw %%mm6, %%mm2\n\t"
  876. "paddusw %%mm6, %%mm3\n\t"
  877. "psrlw $1, %%mm2\n\t"
  878. "psrlw $1, %%mm3\n\t"
  879. "psubsw %%mm2, %%mm0\n\t"
  880. "psubsw %%mm3, %%mm1\n\t"
  881. "movq %%mm0, %0\n\t"
  882. "movq %%mm1, 8%0\n\t"
  883. :"+m"(*p)
  884. :"m"(*pix)
  885. :"memory");
  886. pix += line_size;
  887. p += 8;
  888. } while (--h);
  889. }
  890. static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  891. {
  892. DCTELEM *p;
  893. const UINT8 *pix;
  894. p = block;
  895. pix = pixels;
  896. MOVQ_ZERO(mm7);
  897. MOVQ_WONE(mm6);
  898. do {
  899. __asm __volatile(
  900. "movq %0, %%mm0\n\t"
  901. "movq %1, %%mm2\n\t"
  902. "movq 8%0, %%mm1\n\t"
  903. "movq %2, %%mm4\n\t"
  904. "movq %%mm2, %%mm3\n\t"
  905. "movq %%mm4, %%mm5\n\t"
  906. "punpcklbw %%mm7, %%mm2\n\t"
  907. "punpckhbw %%mm7, %%mm3\n\t"
  908. "punpcklbw %%mm7, %%mm4\n\t"
  909. "punpckhbw %%mm7, %%mm5\n\t"
  910. "paddusw %%mm4, %%mm2\n\t"
  911. "paddusw %%mm5, %%mm3\n\t"
  912. "paddusw %%mm6, %%mm2\n\t"
  913. "paddusw %%mm6, %%mm3\n\t"
  914. "psrlw $1, %%mm2\n\t"
  915. "psrlw $1, %%mm3\n\t"
  916. "psubsw %%mm2, %%mm0\n\t"
  917. "psubsw %%mm3, %%mm1\n\t"
  918. "movq %%mm0, %0\n\t"
  919. "movq %%mm1, 8%0\n\t"
  920. :"+m"(*p)
  921. :"m"(*pix), "m"(*(pix+line_size))
  922. :"memory");
  923. pix += line_size;
  924. p += 8;
  925. } while (--h);
  926. }
  927. static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  928. {
  929. DCTELEM *p;
  930. const UINT8 *pix;
  931. p = block;
  932. pix = pixels;
  933. MOVQ_ZERO(mm7);
  934. MOVQ_WTWO(mm6);
  935. JUMPALIGN();
  936. do {
  937. __asm __volatile(
  938. "movq %1, %%mm0\n\t"
  939. "movq %2, %%mm1\n\t"
  940. "movq 1%1, %%mm4\n\t"
  941. "movq 1%2, %%mm5\n\t"
  942. "movq %%mm0, %%mm2\n\t"
  943. "movq %%mm1, %%mm3\n\t"
  944. "punpcklbw %%mm7, %%mm0\n\t"
  945. "punpcklbw %%mm7, %%mm1\n\t"
  946. "punpckhbw %%mm7, %%mm2\n\t"
  947. "punpckhbw %%mm7, %%mm3\n\t"
  948. "paddusw %%mm1, %%mm0\n\t"
  949. "paddusw %%mm3, %%mm2\n\t"
  950. "movq %%mm4, %%mm1\n\t"
  951. "movq %%mm5, %%mm3\n\t"
  952. "punpcklbw %%mm7, %%mm4\n\t"
  953. "punpcklbw %%mm7, %%mm5\n\t"
  954. "punpckhbw %%mm7, %%mm1\n\t"
  955. "punpckhbw %%mm7, %%mm3\n\t"
  956. "paddusw %%mm5, %%mm4\n\t"
  957. "paddusw %%mm3, %%mm1\n\t"
  958. "paddusw %%mm6, %%mm4\n\t"
  959. "paddusw %%mm6, %%mm1\n\t"
  960. "paddusw %%mm4, %%mm0\n\t"
  961. "paddusw %%mm1, %%mm2\n\t"
  962. "movq %0, %%mm1\n\t"
  963. "movq 8%0, %%mm3\n\t"
  964. "psrlw $2, %%mm0\n\t"
  965. "psrlw $2, %%mm2\n\t"
  966. "psubsw %%mm0, %%mm1\n\t"
  967. "psubsw %%mm2, %%mm3\n\t"
  968. "movq %%mm1, %0\n\t"
  969. "movq %%mm3, 8%0\n\t"
  970. :"+m"(*p)
  971. :"m"(*pix),
  972. "m"(*(pix+line_size))
  973. :"memory");
  974. pix += line_size;
  975. p += 8 ;
  976. } while(--h);
  977. }
  978. static void clear_blocks_mmx(DCTELEM *blocks)
  979. {
  980. asm volatile(
  981. "pxor %%mm7, %%mm7 \n\t"
  982. "movl $-128*6, %%eax \n\t"
  983. "1: \n\t"
  984. "movq %%mm7, (%0, %%eax) \n\t"
  985. "movq %%mm7, 8(%0, %%eax) \n\t"
  986. "movq %%mm7, 16(%0, %%eax) \n\t"
  987. "movq %%mm7, 24(%0, %%eax) \n\t"
  988. "addl $32, %%eax \n\t"
  989. " js 1b \n\t"
  990. : : "r" (((int)blocks)+128*6)
  991. : "%eax"
  992. );
  993. }
  994. static void just_return() { return; }
  995. void dsputil_init_mmx(void)
  996. {
  997. mm_flags = mm_support();
  998. #if 1
  999. printf("libavcodec: CPU flags:");
  1000. if (mm_flags & MM_MMX)
  1001. printf(" mmx");
  1002. if (mm_flags & MM_MMXEXT)
  1003. printf(" mmxext");
  1004. if (mm_flags & MM_3DNOW)
  1005. printf(" 3dnow");
  1006. if (mm_flags & MM_SSE)
  1007. printf(" sse");
  1008. if (mm_flags & MM_SSE2)
  1009. printf(" sse2");
  1010. printf("\n");
  1011. #endif
  1012. if (mm_flags & MM_MMX) {
  1013. get_pixels = get_pixels_mmx;
  1014. put_pixels_clamped = put_pixels_clamped_mmx;
  1015. add_pixels_clamped = add_pixels_clamped_mmx;
  1016. clear_blocks= clear_blocks_mmx;
  1017. pix_abs16x16 = pix_abs16x16_mmx;
  1018. pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
  1019. pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
  1020. pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
  1021. pix_abs8x8 = pix_abs8x8_mmx;
  1022. pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
  1023. pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
  1024. pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
  1025. av_fdct = fdct_mmx;
  1026. put_pixels_tab[0] = put_pixels_mmx;
  1027. put_pixels_tab[1] = put_pixels_x2_mmx;
  1028. put_pixels_tab[2] = put_pixels_y2_mmx;
  1029. put_pixels_tab[3] = put_pixels_xy2_mmx;
  1030. put_no_rnd_pixels_tab[0] = put_pixels_mmx;
  1031. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
  1032. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
  1033. put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
  1034. avg_pixels_tab[0] = avg_pixels_mmx;
  1035. avg_pixels_tab[1] = avg_pixels_x2_mmx;
  1036. avg_pixels_tab[2] = avg_pixels_y2_mmx;
  1037. avg_pixels_tab[3] = avg_pixels_xy2_mmx;
  1038. avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
  1039. avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
  1040. avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
  1041. avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
  1042. sub_pixels_tab[0] = sub_pixels_mmx;
  1043. sub_pixels_tab[1] = sub_pixels_x2_mmx;
  1044. sub_pixels_tab[2] = sub_pixels_y2_mmx;
  1045. sub_pixels_tab[3] = sub_pixels_xy2_mmx;
  1046. if (mm_flags & MM_MMXEXT) {
  1047. pix_abs16x16 = pix_abs16x16_mmx2;
  1048. pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
  1049. pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
  1050. pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
  1051. pix_abs8x8 = pix_abs8x8_mmx2;
  1052. pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
  1053. pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
  1054. pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
  1055. put_pixels_tab[1] = put_pixels_x2_sse;
  1056. put_pixels_tab[2] = put_pixels_y2_sse;
  1057. avg_pixels_tab[0] = avg_pixels_sse;
  1058. avg_pixels_tab[1] = avg_pixels_x2_sse;
  1059. avg_pixels_tab[2] = avg_pixels_y2_sse;
  1060. avg_pixels_tab[3] = avg_pixels_xy2_sse;
  1061. sub_pixels_tab[1] = sub_pixels_x2_sse;
  1062. sub_pixels_tab[2] = sub_pixels_y2_sse;
  1063. } else if (mm_flags & MM_3DNOW) {
  1064. put_pixels_tab[1] = put_pixels_x2_3dnow;
  1065. put_pixels_tab[2] = put_pixels_y2_3dnow;
  1066. avg_pixels_tab[0] = avg_pixels_3dnow;
  1067. avg_pixels_tab[1] = avg_pixels_x2_3dnow;
  1068. avg_pixels_tab[2] = avg_pixels_y2_3dnow;
  1069. avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
  1070. sub_pixels_tab[1] = sub_pixels_x2_3dnow;
  1071. sub_pixels_tab[2] = sub_pixels_y2_3dnow;
  1072. }
  1073. /* idct */
  1074. if (mm_flags & MM_MMXEXT) {
  1075. ff_idct = ff_mmxext_idct;
  1076. } else {
  1077. ff_idct = ff_mmx_idct;
  1078. }
  1079. #ifdef SIMPLE_IDCT
  1080. // ff_idct = simple_idct;
  1081. ff_idct = simple_idct_mmx;
  1082. #endif
  1083. }
  1084. #if 0
  1085. // for speed testing
  1086. get_pixels = just_return;
  1087. put_pixels_clamped = just_return;
  1088. add_pixels_clamped = just_return;
  1089. pix_abs16x16 = just_return;
  1090. pix_abs16x16_x2 = just_return;
  1091. pix_abs16x16_y2 = just_return;
  1092. pix_abs16x16_xy2 = just_return;
  1093. put_pixels_tab[0] = just_return;
  1094. put_pixels_tab[1] = just_return;
  1095. put_pixels_tab[2] = just_return;
  1096. put_pixels_tab[3] = just_return;
  1097. put_no_rnd_pixels_tab[0] = just_return;
  1098. put_no_rnd_pixels_tab[1] = just_return;
  1099. put_no_rnd_pixels_tab[2] = just_return;
  1100. put_no_rnd_pixels_tab[3] = just_return;
  1101. avg_pixels_tab[0] = just_return;
  1102. avg_pixels_tab[1] = just_return;
  1103. avg_pixels_tab[2] = just_return;
  1104. avg_pixels_tab[3] = just_return;
  1105. avg_no_rnd_pixels_tab[0] = just_return;
  1106. avg_no_rnd_pixels_tab[1] = just_return;
  1107. avg_no_rnd_pixels_tab[2] = just_return;
  1108. avg_no_rnd_pixels_tab[3] = just_return;
  1109. sub_pixels_tab[0] = just_return;
  1110. sub_pixels_tab[1] = just_return;
  1111. sub_pixels_tab[2] = just_return;
  1112. sub_pixels_tab[3] = just_return;
  1113. //av_fdct = just_return;
  1114. //ff_idct = just_return;
  1115. #endif
  1116. }