You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1228 lines
31KB

  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Gerard Lantau.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. #include "../dsputil.h"
  22. #include "../simple_idct.h"
  23. #include "../mangle.h"
  24. int mm_flags; /* multimedia extension flags */
  25. int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  26. int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  27. int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  28. int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  29. int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  30. int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  31. int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  32. int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  33. int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  34. int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  35. int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  36. int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  37. int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  38. int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  39. int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  40. int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  41. /* external functions, from idct_mmx.c */
  42. void ff_mmx_idct(DCTELEM *block);
  43. void ff_mmxext_idct(DCTELEM *block);
  /* pixel operations */

  /* 8-byte aligned constants: 1 in every byte, 1 in every 16-bit word and
   * 2 in every 16-bit word. */
  static const unsigned long long mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
  static const unsigned long long mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
  static const unsigned long long mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;

  /* Emit a ".balign 8" directive at the point of use. */
  #define JUMPALIGN() __asm __volatile (".balign 8"::)

  /* Clear MMX register regd by xor-ing it with itself. */
  #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

  /*
   * MOVQ_WONE(regd) / MOVQ_WTWO(regd): put 1 (resp. 2) into each 16-bit word
   * of MMX register regd.
   */
  #ifdef PIC
  /* For a shared library it is better to build the constants in registers:
   * pcmpeqd sets every bit (-1), psrlw $15 leaves 1 in each word and
   * psllw $1 doubles that to 2. */
  #define MOVQ_WONE(regd) \
      __asm __volatile ( \
          "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
          "psrlw $15, %%" #regd ::)
  #define MOVQ_WTWO(regd) \
      __asm __volatile ( \
          "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
          "psrlw $15, %%" #regd " \n\t" \
          "psllw $1, %%" #regd ::)
  #else
  /* Otherwise load them from the constants above. */
  #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
  #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
  #endif
  68. /***********************************/
  69. /* 3Dnow specific */
  70. #define DEF(x) x ## _3dnow
  71. /* for Athlons PAVGUSB is prefered */
  72. #define PAVGB "pavgusb"
  73. #include "dsputil_mmx_avg.h"
  74. #undef DEF
  75. #undef PAVGB
  76. /***********************************/
  77. /* MMX2 specific */
  78. #define DEF(x) x ## _mmx2
  79. /* Introduced only in MMX2 set */
  80. #define PAVGB "pavgb"
  81. #include "dsputil_mmx_avg.h"
  82. #undef DEF
  83. #undef PAVGB
  84. /***********************************/
  85. /* standard MMX */
  86. static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
  87. {
  88. asm volatile(
  89. "movl $-128, %%eax \n\t"
  90. "pxor %%mm7, %%mm7 \n\t"
  91. ".balign 16 \n\t"
  92. "1: \n\t"
  93. "movq (%0), %%mm0 \n\t"
  94. "movq (%0, %2), %%mm2 \n\t"
  95. "movq %%mm0, %%mm1 \n\t"
  96. "movq %%mm2, %%mm3 \n\t"
  97. "punpcklbw %%mm7, %%mm0 \n\t"
  98. "punpckhbw %%mm7, %%mm1 \n\t"
  99. "punpcklbw %%mm7, %%mm2 \n\t"
  100. "punpckhbw %%mm7, %%mm3 \n\t"
  101. "movq %%mm0, (%1, %%eax)\n\t"
  102. "movq %%mm1, 8(%1, %%eax)\n\t"
  103. "movq %%mm2, 16(%1, %%eax)\n\t"
  104. "movq %%mm3, 24(%1, %%eax)\n\t"
  105. "addl %3, %0 \n\t"
  106. "addl $32, %%eax \n\t"
  107. "js 1b \n\t"
  108. : "+r" (pixels)
  109. : "r" (block+64), "r" (line_size), "r" (line_size*2)
  110. : "%eax"
  111. );
  112. }
  113. static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
  114. {
  115. asm volatile(
  116. "pxor %%mm7, %%mm7 \n\t"
  117. "movl $-128, %%eax \n\t"
  118. ".balign 16 \n\t"
  119. "1: \n\t"
  120. "movq (%0), %%mm0 \n\t"
  121. "movq (%1), %%mm2 \n\t"
  122. "movq %%mm0, %%mm1 \n\t"
  123. "movq %%mm2, %%mm3 \n\t"
  124. "punpcklbw %%mm7, %%mm0 \n\t"
  125. "punpckhbw %%mm7, %%mm1 \n\t"
  126. "punpcklbw %%mm7, %%mm2 \n\t"
  127. "punpckhbw %%mm7, %%mm3 \n\t"
  128. "psubw %%mm2, %%mm0 \n\t"
  129. "psubw %%mm3, %%mm1 \n\t"
  130. "movq %%mm0, (%2, %%eax)\n\t"
  131. "movq %%mm1, 8(%2, %%eax)\n\t"
  132. "addl %3, %0 \n\t"
  133. "addl %3, %1 \n\t"
  134. "addl $16, %%eax \n\t"
  135. "jnz 1b \n\t"
  136. : "+r" (s1), "+r" (s2)
  137. : "r" (block+64), "r" (stride)
  138. : "%eax"
  139. );
  140. }
  141. static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  142. {
  143. const DCTELEM *p;
  144. UINT8 *pix;
  145. /* read the pixels */
  146. p = block;
  147. pix = pixels;
  148. /* unrolled loop */
  149. __asm __volatile(
  150. "movq %3, %%mm0\n\t"
  151. "movq 8%3, %%mm1\n\t"
  152. "movq 16%3, %%mm2\n\t"
  153. "movq 24%3, %%mm3\n\t"
  154. "movq 32%3, %%mm4\n\t"
  155. "movq 40%3, %%mm5\n\t"
  156. "movq 48%3, %%mm6\n\t"
  157. "movq 56%3, %%mm7\n\t"
  158. "packuswb %%mm1, %%mm0\n\t"
  159. "packuswb %%mm3, %%mm2\n\t"
  160. "packuswb %%mm5, %%mm4\n\t"
  161. "packuswb %%mm7, %%mm6\n\t"
  162. "movq %%mm0, (%0)\n\t"
  163. "movq %%mm2, (%0, %1)\n\t"
  164. "movq %%mm4, (%0, %1, 2)\n\t"
  165. "movq %%mm6, (%0, %2)\n\t"
  166. ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
  167. :"memory");
  168. pix += line_size*4;
  169. p += 32;
  170. // if here would be an exact copy of the code above
  171. // compiler would generate some very strange code
  172. // thus using "r"
  173. __asm __volatile(
  174. "movq (%3), %%mm0\n\t"
  175. "movq 8(%3), %%mm1\n\t"
  176. "movq 16(%3), %%mm2\n\t"
  177. "movq 24(%3), %%mm3\n\t"
  178. "movq 32(%3), %%mm4\n\t"
  179. "movq 40(%3), %%mm5\n\t"
  180. "movq 48(%3), %%mm6\n\t"
  181. "movq 56(%3), %%mm7\n\t"
  182. "packuswb %%mm1, %%mm0\n\t"
  183. "packuswb %%mm3, %%mm2\n\t"
  184. "packuswb %%mm5, %%mm4\n\t"
  185. "packuswb %%mm7, %%mm6\n\t"
  186. "movq %%mm0, (%0)\n\t"
  187. "movq %%mm2, (%0, %1)\n\t"
  188. "movq %%mm4, (%0, %1, 2)\n\t"
  189. "movq %%mm6, (%0, %2)\n\t"
  190. ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
  191. :"memory");
  192. }
  193. static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  194. {
  195. const DCTELEM *p;
  196. UINT8 *pix;
  197. int i;
  198. /* read the pixels */
  199. p = block;
  200. pix = pixels;
  201. MOVQ_ZERO(mm7);
  202. i = 4;
  203. do {
  204. __asm __volatile(
  205. "movq (%2), %%mm0\n\t"
  206. "movq 8(%2), %%mm1\n\t"
  207. "movq 16(%2), %%mm2\n\t"
  208. "movq 24(%2), %%mm3\n\t"
  209. "movq %0, %%mm4\n\t"
  210. "movq %1, %%mm6\n\t"
  211. "movq %%mm4, %%mm5\n\t"
  212. "punpcklbw %%mm7, %%mm4\n\t"
  213. "punpckhbw %%mm7, %%mm5\n\t"
  214. "paddsw %%mm4, %%mm0\n\t"
  215. "paddsw %%mm5, %%mm1\n\t"
  216. "movq %%mm6, %%mm5\n\t"
  217. "punpcklbw %%mm7, %%mm6\n\t"
  218. "punpckhbw %%mm7, %%mm5\n\t"
  219. "paddsw %%mm6, %%mm2\n\t"
  220. "paddsw %%mm5, %%mm3\n\t"
  221. "packuswb %%mm1, %%mm0\n\t"
  222. "packuswb %%mm3, %%mm2\n\t"
  223. "movq %%mm0, %0\n\t"
  224. "movq %%mm2, %1\n\t"
  225. :"+m"(*pix), "+m"(*(pix+line_size))
  226. :"r"(p)
  227. :"memory");
  228. pix += line_size*2;
  229. p += 16;
  230. } while (--i);
  231. }
  232. static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  233. {
  234. #if 0 //FIXME h==4 case
  235. asm volatile(
  236. "xorl %%eax, %%eax \n\t"
  237. "movl %3, %%esi \n\t"
  238. "1: \n\t"
  239. "movq (%1, %%eax), %%mm0 \n\t"
  240. "movq %%mm0, (%0, %%eax) \n\t"
  241. "addl %2, %%eax \n\t"
  242. "movq (%1, %%eax), %%mm0 \n\t"
  243. "movq %%mm0, (%0, %%eax) \n\t"
  244. "addl %2, %%eax \n\t"
  245. "movq (%1, %%eax), %%mm0 \n\t"
  246. "movq %%mm0, (%0, %%eax) \n\t"
  247. "addl %2, %%eax \n\t"
  248. "movq (%1, %%eax), %%mm0 \n\t"
  249. "movq %%mm0, (%0, %%eax) \n\t"
  250. "addl %2, %%eax \n\t"
  251. "movq (%1, %%eax), %%mm0 \n\t"
  252. "movq %%mm0, (%0, %%eax) \n\t"
  253. "addl %2, %%eax \n\t"
  254. "movq (%1, %%eax), %%mm0 \n\t"
  255. "movq %%mm0, (%0, %%eax) \n\t"
  256. "addl %2, %%eax \n\t"
  257. "movq (%1, %%eax), %%mm0 \n\t"
  258. "movq %%mm0, (%0, %%eax) \n\t"
  259. "addl %2, %%eax \n\t"
  260. "movq (%1, %%eax), %%mm0 \n\t"
  261. "movq %%mm0, (%0, %%eax) \n\t"
  262. "addl %2, %%eax \n\t"
  263. "subl $8, %%esi \n\t"
  264. " jnz 1b \n\t"
  265. :: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
  266. : "%eax", "%esi", "memory"
  267. );
  268. #else
  269. asm volatile(
  270. "xorl %%eax, %%eax \n\t"
  271. "movl %3, %%esi \n\t"
  272. "1: \n\t"
  273. "movq (%1, %%eax), %%mm0 \n\t"
  274. "movq %%mm0, (%0, %%eax) \n\t"
  275. "addl %2, %%eax \n\t"
  276. "movq (%1, %%eax), %%mm0 \n\t"
  277. "movq %%mm0, (%0, %%eax) \n\t"
  278. "addl %2, %%eax \n\t"
  279. "movq (%1, %%eax), %%mm0 \n\t"
  280. "movq %%mm0, (%0, %%eax) \n\t"
  281. "addl %2, %%eax \n\t"
  282. "movq (%1, %%eax), %%mm0 \n\t"
  283. "movq %%mm0, (%0, %%eax) \n\t"
  284. "addl %2, %%eax \n\t"
  285. "subl $4, %%esi \n\t"
  286. " jnz 1b \n\t"
  287. :: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
  288. : "%eax", "%esi", "memory"
  289. );
  290. #endif
  291. }
  292. static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  293. {
  294. UINT8 *p;
  295. const UINT8 *pix;
  296. p = block;
  297. pix = pixels;
  298. MOVQ_ZERO(mm7);
  299. MOVQ_WONE(mm4);
  300. JUMPALIGN();
  301. do {
  302. __asm __volatile(
  303. "movq %1, %%mm0\n\t"
  304. "movq 1%1, %%mm1\n\t"
  305. "movq %%mm0, %%mm2\n\t"
  306. "movq %%mm1, %%mm3\n\t"
  307. "punpcklbw %%mm7, %%mm0\n\t"
  308. "punpcklbw %%mm7, %%mm1\n\t"
  309. "punpckhbw %%mm7, %%mm2\n\t"
  310. "punpckhbw %%mm7, %%mm3\n\t"
  311. "paddusw %%mm1, %%mm0\n\t"
  312. "paddusw %%mm3, %%mm2\n\t"
  313. "paddusw %%mm4, %%mm0\n\t"
  314. "paddusw %%mm4, %%mm2\n\t"
  315. "psrlw $1, %%mm0\n\t"
  316. "psrlw $1, %%mm2\n\t"
  317. "packuswb %%mm2, %%mm0\n\t"
  318. "movq %%mm0, %0\n\t"
  319. :"=m"(*p)
  320. :"m"(*pix)
  321. :"memory");
  322. pix += line_size; p += line_size;
  323. } while (--h);
  324. }
  325. static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  326. {
  327. UINT8 *p;
  328. const UINT8 *pix;
  329. p = block;
  330. pix = pixels;
  331. MOVQ_ZERO(mm7);
  332. MOVQ_WONE(mm4);
  333. JUMPALIGN();
  334. do {
  335. __asm __volatile(
  336. "movq %1, %%mm0\n\t"
  337. "movq %2, %%mm1\n\t"
  338. "movq %%mm0, %%mm2\n\t"
  339. "movq %%mm1, %%mm3\n\t"
  340. "punpcklbw %%mm7, %%mm0\n\t"
  341. "punpcklbw %%mm7, %%mm1\n\t"
  342. "punpckhbw %%mm7, %%mm2\n\t"
  343. "punpckhbw %%mm7, %%mm3\n\t"
  344. "paddusw %%mm1, %%mm0\n\t"
  345. "paddusw %%mm3, %%mm2\n\t"
  346. "paddusw %%mm4, %%mm0\n\t"
  347. "paddusw %%mm4, %%mm2\n\t"
  348. "psrlw $1, %%mm0\n\t"
  349. "psrlw $1, %%mm2\n\t"
  350. "packuswb %%mm2, %%mm0\n\t"
  351. "movq %%mm0, %0\n\t"
  352. :"=m"(*p)
  353. :"m"(*pix),
  354. "m"(*(pix+line_size))
  355. :"memory");
  356. pix += line_size;
  357. p += line_size;
  358. } while (--h);
  359. }
  360. static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  361. {
  362. UINT8 *p;
  363. const UINT8 *pix;
  364. p = block;
  365. pix = pixels; // 1s
  366. MOVQ_ZERO(mm7);
  367. MOVQ_WTWO(mm6);
  368. JUMPALIGN();
  369. do {
  370. __asm __volatile(
  371. "movq %1, %%mm0\n\t"
  372. "movq %2, %%mm1\n\t"
  373. "movq 1%1, %%mm4\n\t"
  374. "movq 1%2, %%mm5\n\t"
  375. "movq %%mm0, %%mm2\n\t"
  376. "movq %%mm1, %%mm3\n\t"
  377. "punpcklbw %%mm7, %%mm0\n\t"
  378. "punpcklbw %%mm7, %%mm1\n\t"
  379. "punpckhbw %%mm7, %%mm2\n\t"
  380. "punpckhbw %%mm7, %%mm3\n\t"
  381. "paddusw %%mm1, %%mm0\n\t"
  382. "paddusw %%mm3, %%mm2\n\t"
  383. "movq %%mm4, %%mm1\n\t"
  384. "movq %%mm5, %%mm3\n\t"
  385. "punpcklbw %%mm7, %%mm4\n\t"
  386. "punpcklbw %%mm7, %%mm5\n\t"
  387. "punpckhbw %%mm7, %%mm1\n\t"
  388. "punpckhbw %%mm7, %%mm3\n\t"
  389. "paddusw %%mm5, %%mm4\n\t"
  390. "paddusw %%mm3, %%mm1\n\t"
  391. "paddusw %%mm6, %%mm4\n\t"
  392. "paddusw %%mm6, %%mm1\n\t"
  393. "paddusw %%mm4, %%mm0\n\t"
  394. "paddusw %%mm1, %%mm2\n\t"
  395. "psrlw $2, %%mm0\n\t"
  396. "psrlw $2, %%mm2\n\t"
  397. "packuswb %%mm2, %%mm0\n\t"
  398. "movq %%mm0, %0\n\t"
  399. :"=m"(*p)
  400. :"m"(*pix),
  401. "m"(*(pix+line_size))
  402. :"memory");
  403. pix += line_size;
  404. p += line_size;
  405. } while(--h);
  406. }
  407. static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  408. {
  409. UINT8 *p;
  410. const UINT8 *pix;
  411. p = block;
  412. pix = pixels;
  413. MOVQ_ZERO(mm7);
  414. do {
  415. __asm __volatile(
  416. "movq %1, %%mm0\n\t"
  417. "movq 1%1, %%mm1\n\t"
  418. "movq %%mm0, %%mm2\n\t"
  419. "movq %%mm1, %%mm3\n\t"
  420. "punpcklbw %%mm7, %%mm0\n\t"
  421. "punpcklbw %%mm7, %%mm1\n\t"
  422. "punpckhbw %%mm7, %%mm2\n\t"
  423. "punpckhbw %%mm7, %%mm3\n\t"
  424. "paddusw %%mm1, %%mm0\n\t"
  425. "paddusw %%mm3, %%mm2\n\t"
  426. "psrlw $1, %%mm0\n\t"
  427. "psrlw $1, %%mm2\n\t"
  428. "packuswb %%mm2, %%mm0\n\t"
  429. "movq %%mm0, %0\n\t"
  430. :"=m"(*p)
  431. :"m"(*pix)
  432. :"memory");
  433. pix += line_size;
  434. p += line_size;
  435. } while (--h);
  436. }
  437. static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  438. {
  439. UINT8 *p;
  440. const UINT8 *pix;
  441. p = block;
  442. pix = pixels;
  443. MOVQ_ZERO(mm7);
  444. JUMPALIGN();
  445. do {
  446. __asm __volatile(
  447. "movq %1, %%mm0\n\t"
  448. "movq %2, %%mm1\n\t"
  449. "movq %%mm0, %%mm2\n\t"
  450. "movq %%mm1, %%mm3\n\t"
  451. "punpcklbw %%mm7, %%mm0\n\t"
  452. "punpcklbw %%mm7, %%mm1\n\t"
  453. "punpckhbw %%mm7, %%mm2\n\t"
  454. "punpckhbw %%mm7, %%mm3\n\t"
  455. "paddusw %%mm1, %%mm0\n\t"
  456. "paddusw %%mm3, %%mm2\n\t"
  457. "psrlw $1, %%mm0\n\t"
  458. "psrlw $1, %%mm2\n\t"
  459. "packuswb %%mm2, %%mm0\n\t"
  460. "movq %%mm0, %0\n\t"
  461. :"=m"(*p)
  462. :"m"(*pix),
  463. "m"(*(pix+line_size))
  464. :"memory");
  465. pix += line_size;
  466. p += line_size;
  467. } while(--h);
  468. }
  469. static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  470. {
  471. UINT8 *p;
  472. const UINT8 *pix;
  473. p = block;
  474. pix = pixels;
  475. MOVQ_ZERO(mm7);
  476. MOVQ_WONE(mm6);
  477. JUMPALIGN();
  478. do {
  479. __asm __volatile(
  480. "movq %1, %%mm0\n\t"
  481. "movq %2, %%mm1\n\t"
  482. "movq 1%1, %%mm4\n\t"
  483. "movq 1%2, %%mm5\n\t"
  484. "movq %%mm0, %%mm2\n\t"
  485. "movq %%mm1, %%mm3\n\t"
  486. "punpcklbw %%mm7, %%mm0\n\t"
  487. "punpcklbw %%mm7, %%mm1\n\t"
  488. "punpckhbw %%mm7, %%mm2\n\t"
  489. "punpckhbw %%mm7, %%mm3\n\t"
  490. "paddusw %%mm1, %%mm0\n\t"
  491. "paddusw %%mm3, %%mm2\n\t"
  492. "movq %%mm4, %%mm1\n\t"
  493. "movq %%mm5, %%mm3\n\t"
  494. "punpcklbw %%mm7, %%mm4\n\t"
  495. "punpcklbw %%mm7, %%mm5\n\t"
  496. "punpckhbw %%mm7, %%mm1\n\t"
  497. "punpckhbw %%mm7, %%mm3\n\t"
  498. "paddusw %%mm5, %%mm4\n\t"
  499. "paddusw %%mm3, %%mm1\n\t"
  500. "paddusw %%mm6, %%mm4\n\t"
  501. "paddusw %%mm6, %%mm1\n\t"
  502. "paddusw %%mm4, %%mm0\n\t"
  503. "paddusw %%mm1, %%mm2\n\t"
  504. "psrlw $2, %%mm0\n\t"
  505. "psrlw $2, %%mm2\n\t"
  506. "packuswb %%mm2, %%mm0\n\t"
  507. "movq %%mm0, %0\n\t"
  508. :"=m"(*p)
  509. :"m"(*pix),
  510. "m"(*(pix+line_size))
  511. :"memory");
  512. pix += line_size;
  513. p += line_size;
  514. } while(--h);
  515. }
  516. static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  517. {
  518. UINT8 *p;
  519. const UINT8 *pix;
  520. p = block;
  521. pix = pixels;
  522. MOVQ_ZERO(mm7);
  523. MOVQ_WONE(mm6);
  524. JUMPALIGN();
  525. do {
  526. __asm __volatile(
  527. "movq %0, %%mm0\n\t"
  528. "movq %1, %%mm1\n\t"
  529. "movq %%mm0, %%mm2\n\t"
  530. "movq %%mm1, %%mm3\n\t"
  531. "punpcklbw %%mm7, %%mm0\n\t"
  532. "punpcklbw %%mm7, %%mm1\n\t"
  533. "punpckhbw %%mm7, %%mm2\n\t"
  534. "punpckhbw %%mm7, %%mm3\n\t"
  535. "paddusw %%mm1, %%mm0\n\t"
  536. "paddusw %%mm3, %%mm2\n\t"
  537. "paddusw %%mm6, %%mm0\n\t"
  538. "paddusw %%mm6, %%mm2\n\t"
  539. "psrlw $1, %%mm0\n\t"
  540. "psrlw $1, %%mm2\n\t"
  541. "packuswb %%mm2, %%mm0\n\t"
  542. "movq %%mm0, %0\n\t"
  543. :"+m"(*p)
  544. :"m"(*pix)
  545. :"memory");
  546. pix += line_size;
  547. p += line_size;
  548. }
  549. while (--h);
  550. }
  551. static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  552. {
  553. UINT8 *p;
  554. const UINT8 *pix;
  555. p = block;
  556. pix = pixels;
  557. MOVQ_ZERO(mm7);
  558. MOVQ_WONE(mm6);
  559. JUMPALIGN();
  560. do {
  561. __asm __volatile(
  562. "movq %1, %%mm1\n\t"
  563. "movq %0, %%mm0\n\t"
  564. "movq 1%1, %%mm4\n\t"
  565. "movq %%mm0, %%mm2\n\t"
  566. "movq %%mm1, %%mm3\n\t"
  567. "movq %%mm4, %%mm5\n\t"
  568. "punpcklbw %%mm7, %%mm1\n\t"
  569. "punpckhbw %%mm7, %%mm3\n\t"
  570. "punpcklbw %%mm7, %%mm4\n\t"
  571. "punpckhbw %%mm7, %%mm5\n\t"
  572. "punpcklbw %%mm7, %%mm0\n\t"
  573. "punpckhbw %%mm7, %%mm2\n\t"
  574. "paddusw %%mm4, %%mm1\n\t"
  575. "paddusw %%mm5, %%mm3\n\t"
  576. "paddusw %%mm6, %%mm1\n\t"
  577. "paddusw %%mm6, %%mm3\n\t"
  578. "psrlw $1, %%mm1\n\t"
  579. "psrlw $1, %%mm3\n\t"
  580. "paddusw %%mm6, %%mm0\n\t"
  581. "paddusw %%mm6, %%mm2\n\t"
  582. "paddusw %%mm1, %%mm0\n\t"
  583. "paddusw %%mm3, %%mm2\n\t"
  584. "psrlw $1, %%mm0\n\t"
  585. "psrlw $1, %%mm2\n\t"
  586. "packuswb %%mm2, %%mm0\n\t"
  587. "movq %%mm0, %0\n\t"
  588. :"+m"(*p)
  589. :"m"(*pix)
  590. :"memory");
  591. pix += line_size;
  592. p += line_size;
  593. } while (--h);
  594. }
  595. static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  596. {
  597. UINT8 *p;
  598. const UINT8 *pix;
  599. p = block;
  600. pix = pixels;
  601. MOVQ_ZERO(mm7);
  602. MOVQ_WONE(mm6);
  603. JUMPALIGN();
  604. do {
  605. __asm __volatile(
  606. "movq %1, %%mm1\n\t"
  607. "movq %0, %%mm0\n\t"
  608. "movq %2, %%mm4\n\t"
  609. "movq %%mm0, %%mm2\n\t"
  610. "movq %%mm1, %%mm3\n\t"
  611. "movq %%mm4, %%mm5\n\t"
  612. "punpcklbw %%mm7, %%mm1\n\t"
  613. "punpckhbw %%mm7, %%mm3\n\t"
  614. "punpcklbw %%mm7, %%mm4\n\t"
  615. "punpckhbw %%mm7, %%mm5\n\t"
  616. "punpcklbw %%mm7, %%mm0\n\t"
  617. "punpckhbw %%mm7, %%mm2\n\t"
  618. "paddusw %%mm4, %%mm1\n\t"
  619. "paddusw %%mm5, %%mm3\n\t"
  620. "paddusw %%mm6, %%mm1\n\t"
  621. "paddusw %%mm6, %%mm3\n\t"
  622. "psrlw $1, %%mm1\n\t"
  623. "psrlw $1, %%mm3\n\t"
  624. "paddusw %%mm6, %%mm0\n\t"
  625. "paddusw %%mm6, %%mm2\n\t"
  626. "paddusw %%mm1, %%mm0\n\t"
  627. "paddusw %%mm3, %%mm2\n\t"
  628. "psrlw $1, %%mm0\n\t"
  629. "psrlw $1, %%mm2\n\t"
  630. "packuswb %%mm2, %%mm0\n\t"
  631. "movq %%mm0, %0\n\t"
  632. :"+m"(*p)
  633. :"m"(*pix), "m"(*(pix+line_size))
  634. :"memory");
  635. pix += line_size;
  636. p += line_size ;
  637. } while(--h);
  638. }
  639. static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  640. {
  641. UINT8 *p;
  642. const UINT8 *pix;
  643. p = block;
  644. pix = pixels;
  645. MOVQ_ZERO(mm7);
  646. // this doesn't seem to be used offten - so
  647. // the inside usage of mm_wone is not optimized
  648. MOVQ_WTWO(mm6);
  649. do {
  650. __asm __volatile(
  651. "movq %1, %%mm0\n\t"
  652. "movq %2, %%mm1\n\t"
  653. "movq 1%1, %%mm4\n\t"
  654. "movq 1%2, %%mm5\n\t"
  655. "movq %%mm0, %%mm2\n\t"
  656. "movq %%mm1, %%mm3\n\t"
  657. "punpcklbw %%mm7, %%mm0\n\t"
  658. "punpcklbw %%mm7, %%mm1\n\t"
  659. "punpckhbw %%mm7, %%mm2\n\t"
  660. "punpckhbw %%mm7, %%mm3\n\t"
  661. "paddusw %%mm1, %%mm0\n\t"
  662. "paddusw %%mm3, %%mm2\n\t"
  663. "movq %%mm4, %%mm1\n\t"
  664. "movq %%mm5, %%mm3\n\t"
  665. "punpcklbw %%mm7, %%mm4\n\t"
  666. "punpcklbw %%mm7, %%mm5\n\t"
  667. "punpckhbw %%mm7, %%mm1\n\t"
  668. "punpckhbw %%mm7, %%mm3\n\t"
  669. "paddusw %%mm5, %%mm4\n\t"
  670. "paddusw %%mm3, %%mm1\n\t"
  671. "paddusw %%mm6, %%mm4\n\t"
  672. "paddusw %%mm6, %%mm1\n\t"
  673. "paddusw %%mm4, %%mm0\n\t"
  674. "paddusw %%mm1, %%mm2\n\t"
  675. "movq %3, %%mm5\n\t"
  676. "psrlw $2, %%mm0\n\t"
  677. "movq %0, %%mm1\n\t"
  678. "psrlw $2, %%mm2\n\t"
  679. "movq %%mm1, %%mm3\n\t"
  680. "punpcklbw %%mm7, %%mm1\n\t"
  681. "punpckhbw %%mm7, %%mm3\n\t"
  682. "paddusw %%mm1, %%mm0\n\t"
  683. "paddusw %%mm3, %%mm2\n\t"
  684. "paddusw %%mm5, %%mm0\n\t"
  685. "paddusw %%mm5, %%mm2\n\t"
  686. "psrlw $1, %%mm0\n\t"
  687. "psrlw $1, %%mm2\n\t"
  688. "packuswb %%mm2, %%mm0\n\t"
  689. "movq %%mm0, %0\n\t"
  690. :"+m"(*p)
  691. :"m"(*pix),
  692. "m"(*(pix+line_size)), "m"(mm_wone)
  693. :"memory");
  694. pix += line_size;
  695. p += line_size ;
  696. } while(--h);
  697. }
  698. static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  699. {
  700. UINT8 *p;
  701. const UINT8 *pix;
  702. p = block;
  703. pix = pixels;
  704. MOVQ_ZERO(mm7);
  705. do {
  706. __asm __volatile(
  707. "movq %1, %%mm0\n\t"
  708. "movq %0, %%mm1\n\t"
  709. "movq %%mm0, %%mm2\n\t"
  710. "movq %%mm1, %%mm3\n\t"
  711. "punpcklbw %%mm7, %%mm0\n\t"
  712. "punpcklbw %%mm7, %%mm1\n\t"
  713. "punpckhbw %%mm7, %%mm2\n\t"
  714. "punpckhbw %%mm7, %%mm3\n\t"
  715. "paddusw %%mm1, %%mm0\n\t"
  716. "paddusw %%mm3, %%mm2\n\t"
  717. "psrlw $1, %%mm0\n\t"
  718. "psrlw $1, %%mm2\n\t"
  719. "packuswb %%mm2, %%mm0\n\t"
  720. "movq %%mm0, %0\n\t"
  721. :"+m"(*p)
  722. :"m"(*pix)
  723. :"memory");
  724. pix += line_size;
  725. p += line_size ;
  726. } while (--h);
  727. }
  728. static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  729. {
  730. UINT8 *p;
  731. const UINT8 *pix;
  732. p = block;
  733. pix = pixels;
  734. MOVQ_ZERO(mm7);
  735. do {
  736. __asm __volatile(
  737. "movq %1, %%mm0\n\t"
  738. "movq 1%1, %%mm1\n\t"
  739. "movq %0, %%mm4\n\t"
  740. "movq %%mm0, %%mm2\n\t"
  741. "movq %%mm1, %%mm3\n\t"
  742. "movq %%mm4, %%mm5\n\t"
  743. "punpcklbw %%mm7, %%mm0\n\t"
  744. "punpcklbw %%mm7, %%mm1\n\t"
  745. "punpckhbw %%mm7, %%mm2\n\t"
  746. "punpckhbw %%mm7, %%mm3\n\t"
  747. "punpcklbw %%mm7, %%mm4\n\t"
  748. "punpckhbw %%mm7, %%mm5\n\t"
  749. "paddusw %%mm1, %%mm0\n\t"
  750. "paddusw %%mm3, %%mm2\n\t"
  751. "psrlw $1, %%mm0\n\t"
  752. "psrlw $1, %%mm2\n\t"
  753. "paddusw %%mm4, %%mm0\n\t"
  754. "paddusw %%mm5, %%mm2\n\t"
  755. "psrlw $1, %%mm0\n\t"
  756. "psrlw $1, %%mm2\n\t"
  757. "packuswb %%mm2, %%mm0\n\t"
  758. "movq %%mm0, %0\n\t"
  759. :"+m"(*p)
  760. :"m"(*pix)
  761. :"memory");
  762. pix += line_size;
  763. p += line_size;
  764. } while (--h);
  765. }
  766. static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  767. {
  768. UINT8 *p;
  769. const UINT8 *pix;
  770. p = block;
  771. pix = pixels;
  772. MOVQ_ZERO(mm7);
  773. do {
  774. __asm __volatile(
  775. "movq %1, %%mm0\n\t"
  776. "movq %2, %%mm1\n\t"
  777. "movq %0, %%mm4\n\t"
  778. "movq %%mm0, %%mm2\n\t"
  779. "movq %%mm1, %%mm3\n\t"
  780. "movq %%mm4, %%mm5\n\t"
  781. "punpcklbw %%mm7, %%mm0\n\t"
  782. "punpcklbw %%mm7, %%mm1\n\t"
  783. "punpckhbw %%mm7, %%mm2\n\t"
  784. "punpckhbw %%mm7, %%mm3\n\t"
  785. "punpcklbw %%mm7, %%mm4\n\t"
  786. "punpckhbw %%mm7, %%mm5\n\t"
  787. "paddusw %%mm1, %%mm0\n\t"
  788. "paddusw %%mm3, %%mm2\n\t"
  789. "psrlw $1, %%mm0\n\t"
  790. "psrlw $1, %%mm2\n\t"
  791. "paddusw %%mm4, %%mm0\n\t"
  792. "paddusw %%mm5, %%mm2\n\t"
  793. "psrlw $1, %%mm0\n\t"
  794. "psrlw $1, %%mm2\n\t"
  795. "packuswb %%mm2, %%mm0\n\t"
  796. "movq %%mm0, %0\n\t"
  797. :"+m"(*p)
  798. :"m"(*pix), "m"(*(pix+line_size))
  799. :"memory");
  800. pix += line_size;
  801. p += line_size ;
  802. } while(--h);
  803. }
  804. static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  805. {
  806. UINT8 *p;
  807. const UINT8 *pix;
  808. p = block;
  809. pix = pixels;
  810. MOVQ_ZERO(mm7);
  811. MOVQ_WONE(mm6);
  812. JUMPALIGN();
  813. do {
  814. __asm __volatile(
  815. "movq %1, %%mm0\n\t"
  816. "movq %2, %%mm1\n\t"
  817. "movq 1%1, %%mm4\n\t"
  818. "movq 1%2, %%mm5\n\t"
  819. "movq %%mm0, %%mm2\n\t"
  820. "movq %%mm1, %%mm3\n\t"
  821. "punpcklbw %%mm7, %%mm0\n\t"
  822. "punpcklbw %%mm7, %%mm1\n\t"
  823. "punpckhbw %%mm7, %%mm2\n\t"
  824. "punpckhbw %%mm7, %%mm3\n\t"
  825. "paddusw %%mm1, %%mm0\n\t"
  826. "paddusw %%mm3, %%mm2\n\t"
  827. "movq %%mm4, %%mm1\n\t"
  828. "movq %%mm5, %%mm3\n\t"
  829. "punpcklbw %%mm7, %%mm4\n\t"
  830. "punpcklbw %%mm7, %%mm5\n\t"
  831. "punpckhbw %%mm7, %%mm1\n\t"
  832. "punpckhbw %%mm7, %%mm3\n\t"
  833. "paddusw %%mm5, %%mm4\n\t"
  834. "paddusw %%mm3, %%mm1\n\t"
  835. "paddusw %%mm6, %%mm4\n\t"
  836. "paddusw %%mm6, %%mm1\n\t"
  837. "paddusw %%mm4, %%mm0\n\t"
  838. "paddusw %%mm1, %%mm2\n\t"
  839. "movq %0, %%mm1\n\t"
  840. "psrlw $2, %%mm0\n\t"
  841. "movq %%mm1, %%mm3\n\t"
  842. "psrlw $2, %%mm2\n\t"
  843. "punpcklbw %%mm7, %%mm1\n\t"
  844. "punpckhbw %%mm7, %%mm3\n\t"
  845. "paddusw %%mm1, %%mm0\n\t"
  846. "paddusw %%mm3, %%mm2\n\t"
  847. "psrlw $1, %%mm0\n\t"
  848. "psrlw $1, %%mm2\n\t"
  849. "packuswb %%mm2, %%mm0\n\t"
  850. "movq %%mm0, %0\n\t"
  851. :"+m"(*p)
  852. :"m"(*pix),
  853. "m"(*(pix+line_size))
  854. :"memory");
  855. pix += line_size;
  856. p += line_size;
  857. } while(--h);
  858. }
  859. static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  860. {
  861. DCTELEM *p;
  862. const UINT8 *pix;
  863. p = block;
  864. pix = pixels;
  865. MOVQ_ZERO(mm7);
  866. do {
  867. __asm __volatile(
  868. "movq %0, %%mm0\n\t"
  869. "movq %1, %%mm2\n\t"
  870. "movq 8%0, %%mm1\n\t"
  871. "movq %%mm2, %%mm3\n\t"
  872. "punpcklbw %%mm7, %%mm2\n\t"
  873. "punpckhbw %%mm7, %%mm3\n\t"
  874. "psubsw %%mm2, %%mm0\n\t"
  875. "psubsw %%mm3, %%mm1\n\t"
  876. "movq %%mm0, %0\n\t"
  877. "movq %%mm1, 8%0\n\t"
  878. :"+m"(*p)
  879. :"m"(*pix)
  880. :"memory");
  881. pix += line_size;
  882. p += 8;
  883. } while (--h);
  884. }
  885. static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  886. {
  887. DCTELEM *p;
  888. const UINT8 *pix;
  889. p = block;
  890. pix = pixels;
  891. MOVQ_ZERO(mm7);
  892. MOVQ_WONE(mm6);
  893. JUMPALIGN();
  894. do {
  895. __asm __volatile(
  896. "movq %0, %%mm0\n\t"
  897. "movq %1, %%mm2\n\t"
  898. "movq 8%0, %%mm1\n\t"
  899. "movq 1%1, %%mm4\n\t"
  900. "movq %%mm2, %%mm3\n\t"
  901. "movq %%mm4, %%mm5\n\t"
  902. "punpcklbw %%mm7, %%mm2\n\t"
  903. "punpckhbw %%mm7, %%mm3\n\t"
  904. "punpcklbw %%mm7, %%mm4\n\t"
  905. "punpckhbw %%mm7, %%mm5\n\t"
  906. "paddusw %%mm4, %%mm2\n\t"
  907. "paddusw %%mm5, %%mm3\n\t"
  908. "paddusw %%mm6, %%mm2\n\t"
  909. "paddusw %%mm6, %%mm3\n\t"
  910. "psrlw $1, %%mm2\n\t"
  911. "psrlw $1, %%mm3\n\t"
  912. "psubsw %%mm2, %%mm0\n\t"
  913. "psubsw %%mm3, %%mm1\n\t"
  914. "movq %%mm0, %0\n\t"
  915. "movq %%mm1, 8%0\n\t"
  916. :"+m"(*p)
  917. :"m"(*pix)
  918. :"memory");
  919. pix += line_size;
  920. p += 8;
  921. } while (--h);
  922. }
  923. static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  924. {
  925. DCTELEM *p;
  926. const UINT8 *pix;
  927. p = block;
  928. pix = pixels;
  929. MOVQ_ZERO(mm7);
  930. MOVQ_WONE(mm6);
  931. do {
  932. __asm __volatile(
  933. "movq %0, %%mm0\n\t"
  934. "movq %1, %%mm2\n\t"
  935. "movq 8%0, %%mm1\n\t"
  936. "movq %2, %%mm4\n\t"
  937. "movq %%mm2, %%mm3\n\t"
  938. "movq %%mm4, %%mm5\n\t"
  939. "punpcklbw %%mm7, %%mm2\n\t"
  940. "punpckhbw %%mm7, %%mm3\n\t"
  941. "punpcklbw %%mm7, %%mm4\n\t"
  942. "punpckhbw %%mm7, %%mm5\n\t"
  943. "paddusw %%mm4, %%mm2\n\t"
  944. "paddusw %%mm5, %%mm3\n\t"
  945. "paddusw %%mm6, %%mm2\n\t"
  946. "paddusw %%mm6, %%mm3\n\t"
  947. "psrlw $1, %%mm2\n\t"
  948. "psrlw $1, %%mm3\n\t"
  949. "psubsw %%mm2, %%mm0\n\t"
  950. "psubsw %%mm3, %%mm1\n\t"
  951. "movq %%mm0, %0\n\t"
  952. "movq %%mm1, 8%0\n\t"
  953. :"+m"(*p)
  954. :"m"(*pix), "m"(*(pix+line_size))
  955. :"memory");
  956. pix += line_size;
  957. p += 8;
  958. } while (--h);
  959. }
  960. static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  961. {
  962. DCTELEM *p;
  963. const UINT8 *pix;
  964. p = block;
  965. pix = pixels;
  966. MOVQ_ZERO(mm7);
  967. MOVQ_WTWO(mm6);
  968. JUMPALIGN();
  969. do {
  970. __asm __volatile(
  971. "movq %1, %%mm0\n\t"
  972. "movq %2, %%mm1\n\t"
  973. "movq 1%1, %%mm4\n\t"
  974. "movq 1%2, %%mm5\n\t"
  975. "movq %%mm0, %%mm2\n\t"
  976. "movq %%mm1, %%mm3\n\t"
  977. "punpcklbw %%mm7, %%mm0\n\t"
  978. "punpcklbw %%mm7, %%mm1\n\t"
  979. "punpckhbw %%mm7, %%mm2\n\t"
  980. "punpckhbw %%mm7, %%mm3\n\t"
  981. "paddusw %%mm1, %%mm0\n\t"
  982. "paddusw %%mm3, %%mm2\n\t"
  983. "movq %%mm4, %%mm1\n\t"
  984. "movq %%mm5, %%mm3\n\t"
  985. "punpcklbw %%mm7, %%mm4\n\t"
  986. "punpcklbw %%mm7, %%mm5\n\t"
  987. "punpckhbw %%mm7, %%mm1\n\t"
  988. "punpckhbw %%mm7, %%mm3\n\t"
  989. "paddusw %%mm5, %%mm4\n\t"
  990. "paddusw %%mm3, %%mm1\n\t"
  991. "paddusw %%mm6, %%mm4\n\t"
  992. "paddusw %%mm6, %%mm1\n\t"
  993. "paddusw %%mm4, %%mm0\n\t"
  994. "paddusw %%mm1, %%mm2\n\t"
  995. "movq %0, %%mm1\n\t"
  996. "movq 8%0, %%mm3\n\t"
  997. "psrlw $2, %%mm0\n\t"
  998. "psrlw $2, %%mm2\n\t"
  999. "psubsw %%mm0, %%mm1\n\t"
  1000. "psubsw %%mm2, %%mm3\n\t"
  1001. "movq %%mm1, %0\n\t"
  1002. "movq %%mm3, 8%0\n\t"
  1003. :"+m"(*p)
  1004. :"m"(*pix),
  1005. "m"(*(pix+line_size))
  1006. :"memory");
  1007. pix += line_size;
  1008. p += 8 ;
  1009. } while(--h);
  1010. }
  1011. static void clear_blocks_mmx(DCTELEM *blocks)
  1012. {
  1013. asm volatile(
  1014. "pxor %%mm7, %%mm7 \n\t"
  1015. "movl $-128*6, %%eax \n\t"
  1016. "1: \n\t"
  1017. "movq %%mm7, (%0, %%eax) \n\t"
  1018. "movq %%mm7, 8(%0, %%eax) \n\t"
  1019. "movq %%mm7, 16(%0, %%eax) \n\t"
  1020. "movq %%mm7, 24(%0, %%eax) \n\t"
  1021. "addl $32, %%eax \n\t"
  1022. " js 1b \n\t"
  1023. : : "r" (((int)blocks)+128*6)
  1024. : "%eax"
  1025. );
  1026. }
  /* Does nothing.  The parameter list is intentionally left empty rather than
   * "(void)", exactly as originally declared. */
  static void just_return()
  {
  }
  1028. void dsputil_init_mmx(void)
  1029. {
  1030. mm_flags = mm_support();
  1031. #if 1
  1032. printf("libavcodec: CPU flags:");
  1033. if (mm_flags & MM_MMX)
  1034. printf(" mmx");
  1035. if (mm_flags & MM_MMXEXT)
  1036. printf(" mmxext");
  1037. if (mm_flags & MM_3DNOW)
  1038. printf(" 3dnow");
  1039. if (mm_flags & MM_SSE)
  1040. printf(" sse");
  1041. if (mm_flags & MM_SSE2)
  1042. printf(" sse2");
  1043. printf("\n");
  1044. #endif
  1045. if (mm_flags & MM_MMX) {
  1046. get_pixels = get_pixels_mmx;
  1047. diff_pixels = diff_pixels_mmx;
  1048. put_pixels_clamped = put_pixels_clamped_mmx;
  1049. add_pixels_clamped = add_pixels_clamped_mmx;
  1050. clear_blocks= clear_blocks_mmx;
  1051. pix_abs16x16 = pix_abs16x16_mmx;
  1052. pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
  1053. pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
  1054. pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
  1055. pix_abs8x8 = pix_abs8x8_mmx;
  1056. pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
  1057. pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
  1058. pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
  1059. av_fdct = fdct_mmx;
  1060. put_pixels_tab[0] = put_pixels_mmx;
  1061. put_pixels_tab[1] = put_pixels_x2_mmx;
  1062. put_pixels_tab[2] = put_pixels_y2_mmx;
  1063. put_pixels_tab[3] = put_pixels_xy2_mmx;
  1064. put_no_rnd_pixels_tab[0] = put_pixels_mmx;
  1065. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
  1066. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
  1067. put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
  1068. avg_pixels_tab[0] = avg_pixels_mmx;
  1069. avg_pixels_tab[1] = avg_pixels_x2_mmx;
  1070. avg_pixels_tab[2] = avg_pixels_y2_mmx;
  1071. avg_pixels_tab[3] = avg_pixels_xy2_mmx;
  1072. avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
  1073. avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
  1074. avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
  1075. avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
  1076. sub_pixels_tab[0] = sub_pixels_mmx;
  1077. sub_pixels_tab[1] = sub_pixels_x2_mmx;
  1078. sub_pixels_tab[2] = sub_pixels_y2_mmx;
  1079. sub_pixels_tab[3] = sub_pixels_xy2_mmx;
  1080. if (mm_flags & MM_MMXEXT) {
  1081. pix_abs16x16 = pix_abs16x16_mmx2;
  1082. pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
  1083. pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
  1084. pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
  1085. pix_abs8x8 = pix_abs8x8_mmx2;
  1086. pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
  1087. pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
  1088. pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
  1089. put_pixels_tab[1] = put_pixels_x2_mmx2;
  1090. put_pixels_tab[2] = put_pixels_y2_mmx2;
  1091. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
  1092. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
  1093. avg_pixels_tab[0] = avg_pixels_mmx2;
  1094. avg_pixels_tab[1] = avg_pixels_x2_mmx2;
  1095. avg_pixels_tab[2] = avg_pixels_y2_mmx2;
  1096. avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
  1097. sub_pixels_tab[1] = sub_pixels_x2_mmx2;
  1098. sub_pixels_tab[2] = sub_pixels_y2_mmx2;
  1099. } else if (mm_flags & MM_3DNOW) {
  1100. put_pixels_tab[1] = put_pixels_x2_3dnow;
  1101. put_pixels_tab[2] = put_pixels_y2_3dnow;
  1102. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
  1103. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
  1104. avg_pixels_tab[0] = avg_pixels_3dnow;
  1105. avg_pixels_tab[1] = avg_pixels_x2_3dnow;
  1106. avg_pixels_tab[2] = avg_pixels_y2_3dnow;
  1107. avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
  1108. sub_pixels_tab[1] = sub_pixels_x2_3dnow;
  1109. sub_pixels_tab[2] = sub_pixels_y2_3dnow;
  1110. }
  1111. /* idct */
  1112. if (mm_flags & MM_MMXEXT) {
  1113. ff_idct = ff_mmxext_idct;
  1114. } else {
  1115. ff_idct = ff_mmx_idct;
  1116. }
  1117. #ifdef SIMPLE_IDCT
  1118. // ff_idct = simple_idct;
  1119. ff_idct = simple_idct_mmx;
  1120. #endif
  1121. }
  1122. #if 0
  1123. // for speed testing
  1124. get_pixels = just_return;
  1125. put_pixels_clamped = just_return;
  1126. add_pixels_clamped = just_return;
  1127. pix_abs16x16 = just_return;
  1128. pix_abs16x16_x2 = just_return;
  1129. pix_abs16x16_y2 = just_return;
  1130. pix_abs16x16_xy2 = just_return;
  1131. put_pixels_tab[0] = just_return;
  1132. put_pixels_tab[1] = just_return;
  1133. put_pixels_tab[2] = just_return;
  1134. put_pixels_tab[3] = just_return;
  1135. put_no_rnd_pixels_tab[0] = just_return;
  1136. put_no_rnd_pixels_tab[1] = just_return;
  1137. put_no_rnd_pixels_tab[2] = just_return;
  1138. put_no_rnd_pixels_tab[3] = just_return;
  1139. avg_pixels_tab[0] = just_return;
  1140. avg_pixels_tab[1] = just_return;
  1141. avg_pixels_tab[2] = just_return;
  1142. avg_pixels_tab[3] = just_return;
  1143. avg_no_rnd_pixels_tab[0] = just_return;
  1144. avg_no_rnd_pixels_tab[1] = just_return;
  1145. avg_no_rnd_pixels_tab[2] = just_return;
  1146. avg_no_rnd_pixels_tab[3] = just_return;
  1147. sub_pixels_tab[0] = just_return;
  1148. sub_pixels_tab[1] = just_return;
  1149. sub_pixels_tab[2] = just_return;
  1150. sub_pixels_tab[3] = just_return;
  1151. //av_fdct = just_return;
  1152. //ff_idct = just_return;
  1153. #endif
  1154. }