You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1237 lines
32KB

  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Gerard Lantau.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. #include "../dsputil.h"
  22. #include "../simple_idct.h"
  23. #include "../mangle.h"
  24. int mm_flags; /* multimedia extension flags; assigned from mm_support() in dsputil_init_mmx() */
  /* Prototypes of the MMX and MMX2 pix_abs* routines. Their definitions are not
     in this part of the file. dsputil_init_mmx() stores the _mmx variants into
     the pix_abs16x16* / pix_abs8x8* function pointers when MM_MMX is set. */
  25. int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  26. int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  27. int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  28. int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  29. int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  30. int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  31. int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  32. int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  33. int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  34. int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  35. int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  36. int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  37. int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  38. int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  39. int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  40. int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  41. /* external functions, from idct_mmx.c */
  42. void ff_mmx_idct(DCTELEM *block);
  43. void ff_mmxext_idct(DCTELEM *block);
  44. /* pixel operations */
  /* 8-byte aligned 64-bit constants for loading into MMX registers:
     mm_bone = 0x01 in every byte, mm_wone = 0x0001 in every 16-bit word,
     mm_wtwo = 0x0002 in every 16-bit word. */
  45. static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
  46. static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
  47. static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
  48. //static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
  49. //static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };
  /* Emits a ".balign 8" directive as a stand-alone asm statement. */
  50. #define JUMPALIGN() __asm __volatile (".balign 8"::)
  /* Clears MMX register regd by XORing it with itself (stand-alone asm statement). */
  51. #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
  /* Non-PIC build: the constants are loaded from memory. MOVQ_WONE and MOVQ_WTWO
     expand to complete asm statements; MOVQ_BONE expands only to a string
     fragment intended to be pasted into a larger asm template. */
  52. #ifndef PIC
  53. #define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
  54. #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
  55. #define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"
  56. #else
  /* PIC build: the same values are synthesized in the register without a memory
     load. pcmpeqd sets every bit, psrlw $15 leaves 1 in each word, psllw $1
     turns that into 2, and packuswb packs the word 1s into byte 1s. */
  57. // for shared library it's better to use this way for accessing constants
  58. // pcmpeqd -> -1
  59. #define MOVQ_WONE(regd) \
  60. __asm __volatile ( \
  61. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  62. "psrlw $15, %%" #regd ::)
  63. #define MOVQ_WTWO(regd) \
  64. __asm __volatile ( \
  65. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  66. "psrlw $15, %%" #regd " \n\t" \
  67. "psllw $1, %%" #regd ::)
  68. #define MOVQ_BONE(regd) \
  69. "pcmpeqd " #regd ", " #regd " \n\t" \
  70. "psrlw $15, " #regd " \n\t"\
  71. "packuswb " #regd ", " #regd " \n\t"
  72. #endif
  73. /***********************************/
  74. /* 3Dnow specific */
  /* dsputil_mmx_avg.h is included twice below, each time with a different DEF()
     name suffix and PAVGB mnemonic. Presumably the header names its functions
     through DEF() and averages bytes with PAVGB - confirm in the header. */
  75. #define DEF(x) x ## _3dnow
  76. /* for Athlons PAVGUSB is preferred */
  77. #define PAVGB "pavgusb"
  78. #include "dsputil_mmx_avg.h"
  79. #undef DEF
  80. #undef PAVGB
  81. /***********************************/
  82. /* MMX2 specific */
  83. #define DEF(x) x ## _mmx2
  84. /* Introduced only in MMX2 set */
  85. #define PAVGB "pavgb"
  86. #include "dsputil_mmx_avg.h"
  87. #undef DEF
  88. #undef PAVGB
  89. /***********************************/
  90. /* standard MMX */
  91. static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
  92. {
  93. asm volatile(
  94. "movl $-128, %%eax \n\t"
  95. "pxor %%mm7, %%mm7 \n\t"
  96. ".balign 16 \n\t"
  97. "1: \n\t"
  98. "movq (%0), %%mm0 \n\t"
  99. "movq (%0, %2), %%mm2 \n\t"
  100. "movq %%mm0, %%mm1 \n\t"
  101. "movq %%mm2, %%mm3 \n\t"
  102. "punpcklbw %%mm7, %%mm0 \n\t"
  103. "punpckhbw %%mm7, %%mm1 \n\t"
  104. "punpcklbw %%mm7, %%mm2 \n\t"
  105. "punpckhbw %%mm7, %%mm3 \n\t"
  106. "movq %%mm0, (%1, %%eax)\n\t"
  107. "movq %%mm1, 8(%1, %%eax)\n\t"
  108. "movq %%mm2, 16(%1, %%eax)\n\t"
  109. "movq %%mm3, 24(%1, %%eax)\n\t"
  110. "addl %3, %0 \n\t"
  111. "addl $32, %%eax \n\t"
  112. "js 1b \n\t"
  113. : "+r" (pixels)
  114. : "r" (block+64), "r" (line_size), "r" (line_size*2)
  115. : "%eax"
  116. );
  117. }
  118. static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
  119. {
  120. asm volatile(
  121. "pxor %%mm7, %%mm7 \n\t"
  122. "movl $-128, %%eax \n\t"
  123. ".balign 16 \n\t"
  124. "1: \n\t"
  125. "movq (%0), %%mm0 \n\t"
  126. "movq (%1), %%mm2 \n\t"
  127. "movq %%mm0, %%mm1 \n\t"
  128. "movq %%mm2, %%mm3 \n\t"
  129. "punpcklbw %%mm7, %%mm0 \n\t"
  130. "punpckhbw %%mm7, %%mm1 \n\t"
  131. "punpcklbw %%mm7, %%mm2 \n\t"
  132. "punpckhbw %%mm7, %%mm3 \n\t"
  133. "psubw %%mm2, %%mm0 \n\t"
  134. "psubw %%mm3, %%mm1 \n\t"
  135. "movq %%mm0, (%2, %%eax)\n\t"
  136. "movq %%mm1, 8(%2, %%eax)\n\t"
  137. "addl %3, %0 \n\t"
  138. "addl %3, %1 \n\t"
  139. "addl $16, %%eax \n\t"
  140. "jnz 1b \n\t"
  141. : "+r" (s1), "+r" (s2)
  142. : "r" (block+64), "r" (stride)
  143. : "%eax"
  144. );
  145. }
  146. static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  147. {
  148. const DCTELEM *p;
  149. UINT8 *pix;
  150. /* read the pixels */
  151. p = block;
  152. pix = pixels;
  153. /* unrolled loop */
  154. __asm __volatile(
  155. "movq %3, %%mm0\n\t"
  156. "movq 8%3, %%mm1\n\t"
  157. "movq 16%3, %%mm2\n\t"
  158. "movq 24%3, %%mm3\n\t"
  159. "movq 32%3, %%mm4\n\t"
  160. "movq 40%3, %%mm5\n\t"
  161. "movq 48%3, %%mm6\n\t"
  162. "movq 56%3, %%mm7\n\t"
  163. "packuswb %%mm1, %%mm0\n\t"
  164. "packuswb %%mm3, %%mm2\n\t"
  165. "packuswb %%mm5, %%mm4\n\t"
  166. "packuswb %%mm7, %%mm6\n\t"
  167. "movq %%mm0, (%0)\n\t"
  168. "movq %%mm2, (%0, %1)\n\t"
  169. "movq %%mm4, (%0, %1, 2)\n\t"
  170. "movq %%mm6, (%0, %2)\n\t"
  171. ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
  172. :"memory");
  173. pix += line_size*4;
  174. p += 32;
  175. // if here would be an exact copy of the code above
  176. // compiler would generate some very strange code
  177. // thus using "r"
  178. __asm __volatile(
  179. "movq (%3), %%mm0\n\t"
  180. "movq 8(%3), %%mm1\n\t"
  181. "movq 16(%3), %%mm2\n\t"
  182. "movq 24(%3), %%mm3\n\t"
  183. "movq 32(%3), %%mm4\n\t"
  184. "movq 40(%3), %%mm5\n\t"
  185. "movq 48(%3), %%mm6\n\t"
  186. "movq 56(%3), %%mm7\n\t"
  187. "packuswb %%mm1, %%mm0\n\t"
  188. "packuswb %%mm3, %%mm2\n\t"
  189. "packuswb %%mm5, %%mm4\n\t"
  190. "packuswb %%mm7, %%mm6\n\t"
  191. "movq %%mm0, (%0)\n\t"
  192. "movq %%mm2, (%0, %1)\n\t"
  193. "movq %%mm4, (%0, %1, 2)\n\t"
  194. "movq %%mm6, (%0, %2)\n\t"
  195. ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
  196. :"memory");
  197. }
  198. static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
  199. {
  200. const DCTELEM *p;
  201. UINT8 *pix;
  202. int i;
  203. /* read the pixels */
  204. p = block;
  205. pix = pixels;
  206. MOVQ_ZERO(mm7);
  207. i = 4;
  208. do {
  209. __asm __volatile(
  210. "movq (%2), %%mm0\n\t"
  211. "movq 8(%2), %%mm1\n\t"
  212. "movq 16(%2), %%mm2\n\t"
  213. "movq 24(%2), %%mm3\n\t"
  214. "movq %0, %%mm4\n\t"
  215. "movq %1, %%mm6\n\t"
  216. "movq %%mm4, %%mm5\n\t"
  217. "punpcklbw %%mm7, %%mm4\n\t"
  218. "punpckhbw %%mm7, %%mm5\n\t"
  219. "paddsw %%mm4, %%mm0\n\t"
  220. "paddsw %%mm5, %%mm1\n\t"
  221. "movq %%mm6, %%mm5\n\t"
  222. "punpcklbw %%mm7, %%mm6\n\t"
  223. "punpckhbw %%mm7, %%mm5\n\t"
  224. "paddsw %%mm6, %%mm2\n\t"
  225. "paddsw %%mm5, %%mm3\n\t"
  226. "packuswb %%mm1, %%mm0\n\t"
  227. "packuswb %%mm3, %%mm2\n\t"
  228. "movq %%mm0, %0\n\t"
  229. "movq %%mm2, %1\n\t"
  230. :"+m"(*pix), "+m"(*(pix+line_size))
  231. :"r"(p)
  232. :"memory");
  233. pix += line_size*2;
  234. p += 16;
  235. } while (--i);
  236. }
  237. static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  238. {
  239. #if 0 //FIXME h==4 case
  240. asm volatile(
  241. "xorl %%eax, %%eax \n\t"
  242. "movl %3, %%esi \n\t"
  243. "1: \n\t"
  244. "movq (%1, %%eax), %%mm0 \n\t"
  245. "movq %%mm0, (%0, %%eax) \n\t"
  246. "addl %2, %%eax \n\t"
  247. "movq (%1, %%eax), %%mm0 \n\t"
  248. "movq %%mm0, (%0, %%eax) \n\t"
  249. "addl %2, %%eax \n\t"
  250. "movq (%1, %%eax), %%mm0 \n\t"
  251. "movq %%mm0, (%0, %%eax) \n\t"
  252. "addl %2, %%eax \n\t"
  253. "movq (%1, %%eax), %%mm0 \n\t"
  254. "movq %%mm0, (%0, %%eax) \n\t"
  255. "addl %2, %%eax \n\t"
  256. "movq (%1, %%eax), %%mm0 \n\t"
  257. "movq %%mm0, (%0, %%eax) \n\t"
  258. "addl %2, %%eax \n\t"
  259. "movq (%1, %%eax), %%mm0 \n\t"
  260. "movq %%mm0, (%0, %%eax) \n\t"
  261. "addl %2, %%eax \n\t"
  262. "movq (%1, %%eax), %%mm0 \n\t"
  263. "movq %%mm0, (%0, %%eax) \n\t"
  264. "addl %2, %%eax \n\t"
  265. "movq (%1, %%eax), %%mm0 \n\t"
  266. "movq %%mm0, (%0, %%eax) \n\t"
  267. "addl %2, %%eax \n\t"
  268. "subl $8, %%esi \n\t"
  269. " jnz 1b \n\t"
  270. :: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
  271. : "%eax", "%esi", "memory"
  272. );
  273. #else
  274. asm volatile(
  275. "xorl %%eax, %%eax \n\t"
  276. "movl %3, %%esi \n\t"
  277. "1: \n\t"
  278. "movq (%1, %%eax), %%mm0 \n\t"
  279. "movq %%mm0, (%0, %%eax) \n\t"
  280. "addl %2, %%eax \n\t"
  281. "movq (%1, %%eax), %%mm0 \n\t"
  282. "movq %%mm0, (%0, %%eax) \n\t"
  283. "addl %2, %%eax \n\t"
  284. "movq (%1, %%eax), %%mm0 \n\t"
  285. "movq %%mm0, (%0, %%eax) \n\t"
  286. "addl %2, %%eax \n\t"
  287. "movq (%1, %%eax), %%mm0 \n\t"
  288. "movq %%mm0, (%0, %%eax) \n\t"
  289. "addl %2, %%eax \n\t"
  290. "subl $4, %%esi \n\t"
  291. " jnz 1b \n\t"
  292. :: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
  293. : "%eax", "%esi", "memory"
  294. );
  295. #endif
  296. }
  297. static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  298. {
  299. UINT8 *p;
  300. const UINT8 *pix;
  301. p = block;
  302. pix = pixels;
  303. MOVQ_ZERO(mm7);
  304. MOVQ_WONE(mm4);
  305. JUMPALIGN();
  306. do {
  307. __asm __volatile(
  308. "movq %1, %%mm0\n\t"
  309. "movq 1%1, %%mm1\n\t"
  310. "movq %%mm0, %%mm2\n\t"
  311. "movq %%mm1, %%mm3\n\t"
  312. "punpcklbw %%mm7, %%mm0\n\t"
  313. "punpcklbw %%mm7, %%mm1\n\t"
  314. "punpckhbw %%mm7, %%mm2\n\t"
  315. "punpckhbw %%mm7, %%mm3\n\t"
  316. "paddusw %%mm1, %%mm0\n\t"
  317. "paddusw %%mm3, %%mm2\n\t"
  318. "paddusw %%mm4, %%mm0\n\t"
  319. "paddusw %%mm4, %%mm2\n\t"
  320. "psrlw $1, %%mm0\n\t"
  321. "psrlw $1, %%mm2\n\t"
  322. "packuswb %%mm2, %%mm0\n\t"
  323. "movq %%mm0, %0\n\t"
  324. :"=m"(*p)
  325. :"m"(*pix)
  326. :"memory");
  327. pix += line_size; p += line_size;
  328. } while (--h);
  329. }
  330. static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  331. {
  332. UINT8 *p;
  333. const UINT8 *pix;
  334. p = block;
  335. pix = pixels;
  336. MOVQ_ZERO(mm7);
  337. MOVQ_WONE(mm4);
  338. JUMPALIGN();
  339. do {
  340. __asm __volatile(
  341. "movq %1, %%mm0\n\t"
  342. "movq %2, %%mm1\n\t"
  343. "movq %%mm0, %%mm2\n\t"
  344. "movq %%mm1, %%mm3\n\t"
  345. "punpcklbw %%mm7, %%mm0\n\t"
  346. "punpcklbw %%mm7, %%mm1\n\t"
  347. "punpckhbw %%mm7, %%mm2\n\t"
  348. "punpckhbw %%mm7, %%mm3\n\t"
  349. "paddusw %%mm1, %%mm0\n\t"
  350. "paddusw %%mm3, %%mm2\n\t"
  351. "paddusw %%mm4, %%mm0\n\t"
  352. "paddusw %%mm4, %%mm2\n\t"
  353. "psrlw $1, %%mm0\n\t"
  354. "psrlw $1, %%mm2\n\t"
  355. "packuswb %%mm2, %%mm0\n\t"
  356. "movq %%mm0, %0\n\t"
  357. :"=m"(*p)
  358. :"m"(*pix),
  359. "m"(*(pix+line_size))
  360. :"memory");
  361. pix += line_size;
  362. p += line_size;
  363. } while (--h);
  364. }
  365. static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  366. {
  367. UINT8 *p;
  368. const UINT8 *pix;
  369. p = block;
  370. pix = pixels; // 1s
  371. MOVQ_ZERO(mm7);
  372. MOVQ_WTWO(mm6);
  373. JUMPALIGN();
  374. do {
  375. __asm __volatile(
  376. "movq %1, %%mm0\n\t"
  377. "movq %2, %%mm1\n\t"
  378. "movq 1%1, %%mm4\n\t"
  379. "movq 1%2, %%mm5\n\t"
  380. "movq %%mm0, %%mm2\n\t"
  381. "movq %%mm1, %%mm3\n\t"
  382. "punpcklbw %%mm7, %%mm0\n\t"
  383. "punpcklbw %%mm7, %%mm1\n\t"
  384. "punpckhbw %%mm7, %%mm2\n\t"
  385. "punpckhbw %%mm7, %%mm3\n\t"
  386. "paddusw %%mm1, %%mm0\n\t"
  387. "paddusw %%mm3, %%mm2\n\t"
  388. "movq %%mm4, %%mm1\n\t"
  389. "movq %%mm5, %%mm3\n\t"
  390. "punpcklbw %%mm7, %%mm4\n\t"
  391. "punpcklbw %%mm7, %%mm5\n\t"
  392. "punpckhbw %%mm7, %%mm1\n\t"
  393. "punpckhbw %%mm7, %%mm3\n\t"
  394. "paddusw %%mm5, %%mm4\n\t"
  395. "paddusw %%mm3, %%mm1\n\t"
  396. "paddusw %%mm6, %%mm4\n\t"
  397. "paddusw %%mm6, %%mm1\n\t"
  398. "paddusw %%mm4, %%mm0\n\t"
  399. "paddusw %%mm1, %%mm2\n\t"
  400. "psrlw $2, %%mm0\n\t"
  401. "psrlw $2, %%mm2\n\t"
  402. "packuswb %%mm2, %%mm0\n\t"
  403. "movq %%mm0, %0\n\t"
  404. :"=m"(*p)
  405. :"m"(*pix),
  406. "m"(*(pix+line_size))
  407. :"memory");
  408. pix += line_size;
  409. p += line_size;
  410. } while(--h);
  411. }
  412. static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  413. {
  414. UINT8 *p;
  415. const UINT8 *pix;
  416. p = block;
  417. pix = pixels;
  418. MOVQ_ZERO(mm7);
  419. do {
  420. __asm __volatile(
  421. "movq %1, %%mm0\n\t"
  422. "movq 1%1, %%mm1\n\t"
  423. "movq %%mm0, %%mm2\n\t"
  424. "movq %%mm1, %%mm3\n\t"
  425. "punpcklbw %%mm7, %%mm0\n\t"
  426. "punpcklbw %%mm7, %%mm1\n\t"
  427. "punpckhbw %%mm7, %%mm2\n\t"
  428. "punpckhbw %%mm7, %%mm3\n\t"
  429. "paddusw %%mm1, %%mm0\n\t"
  430. "paddusw %%mm3, %%mm2\n\t"
  431. "psrlw $1, %%mm0\n\t"
  432. "psrlw $1, %%mm2\n\t"
  433. "packuswb %%mm2, %%mm0\n\t"
  434. "movq %%mm0, %0\n\t"
  435. :"=m"(*p)
  436. :"m"(*pix)
  437. :"memory");
  438. pix += line_size;
  439. p += line_size;
  440. } while (--h);
  441. }
  442. static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  443. {
  444. UINT8 *p;
  445. const UINT8 *pix;
  446. p = block;
  447. pix = pixels;
  448. MOVQ_ZERO(mm7);
  449. JUMPALIGN();
  450. do {
  451. __asm __volatile(
  452. "movq %1, %%mm0\n\t"
  453. "movq %2, %%mm1\n\t"
  454. "movq %%mm0, %%mm2\n\t"
  455. "movq %%mm1, %%mm3\n\t"
  456. "punpcklbw %%mm7, %%mm0\n\t"
  457. "punpcklbw %%mm7, %%mm1\n\t"
  458. "punpckhbw %%mm7, %%mm2\n\t"
  459. "punpckhbw %%mm7, %%mm3\n\t"
  460. "paddusw %%mm1, %%mm0\n\t"
  461. "paddusw %%mm3, %%mm2\n\t"
  462. "psrlw $1, %%mm0\n\t"
  463. "psrlw $1, %%mm2\n\t"
  464. "packuswb %%mm2, %%mm0\n\t"
  465. "movq %%mm0, %0\n\t"
  466. :"=m"(*p)
  467. :"m"(*pix),
  468. "m"(*(pix+line_size))
  469. :"memory");
  470. pix += line_size;
  471. p += line_size;
  472. } while(--h);
  473. }
  474. static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  475. {
  476. UINT8 *p;
  477. const UINT8 *pix;
  478. p = block;
  479. pix = pixels;
  480. MOVQ_ZERO(mm7);
  481. MOVQ_WONE(mm6);
  482. JUMPALIGN();
  483. do {
  484. __asm __volatile(
  485. "movq %1, %%mm0\n\t"
  486. "movq %2, %%mm1\n\t"
  487. "movq 1%1, %%mm4\n\t"
  488. "movq 1%2, %%mm5\n\t"
  489. "movq %%mm0, %%mm2\n\t"
  490. "movq %%mm1, %%mm3\n\t"
  491. "punpcklbw %%mm7, %%mm0\n\t"
  492. "punpcklbw %%mm7, %%mm1\n\t"
  493. "punpckhbw %%mm7, %%mm2\n\t"
  494. "punpckhbw %%mm7, %%mm3\n\t"
  495. "paddusw %%mm1, %%mm0\n\t"
  496. "paddusw %%mm3, %%mm2\n\t"
  497. "movq %%mm4, %%mm1\n\t"
  498. "movq %%mm5, %%mm3\n\t"
  499. "punpcklbw %%mm7, %%mm4\n\t"
  500. "punpcklbw %%mm7, %%mm5\n\t"
  501. "punpckhbw %%mm7, %%mm1\n\t"
  502. "punpckhbw %%mm7, %%mm3\n\t"
  503. "paddusw %%mm5, %%mm4\n\t"
  504. "paddusw %%mm3, %%mm1\n\t"
  505. "paddusw %%mm6, %%mm4\n\t"
  506. "paddusw %%mm6, %%mm1\n\t"
  507. "paddusw %%mm4, %%mm0\n\t"
  508. "paddusw %%mm1, %%mm2\n\t"
  509. "psrlw $2, %%mm0\n\t"
  510. "psrlw $2, %%mm2\n\t"
  511. "packuswb %%mm2, %%mm0\n\t"
  512. "movq %%mm0, %0\n\t"
  513. :"=m"(*p)
  514. :"m"(*pix),
  515. "m"(*(pix+line_size))
  516. :"memory");
  517. pix += line_size;
  518. p += line_size;
  519. } while(--h);
  520. }
  521. static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
  522. {
  523. UINT8 *p;
  524. const UINT8 *pix;
  525. p = block;
  526. pix = pixels;
  527. MOVQ_ZERO(mm7);
  528. MOVQ_WONE(mm6);
  529. JUMPALIGN();
  530. do {
  531. __asm __volatile(
  532. "movq %0, %%mm0\n\t"
  533. "movq %1, %%mm1\n\t"
  534. "movq %%mm0, %%mm2\n\t"
  535. "movq %%mm1, %%mm3\n\t"
  536. "punpcklbw %%mm7, %%mm0\n\t"
  537. "punpcklbw %%mm7, %%mm1\n\t"
  538. "punpckhbw %%mm7, %%mm2\n\t"
  539. "punpckhbw %%mm7, %%mm3\n\t"
  540. "paddusw %%mm1, %%mm0\n\t"
  541. "paddusw %%mm3, %%mm2\n\t"
  542. "paddusw %%mm6, %%mm0\n\t"
  543. "paddusw %%mm6, %%mm2\n\t"
  544. "psrlw $1, %%mm0\n\t"
  545. "psrlw $1, %%mm2\n\t"
  546. "packuswb %%mm2, %%mm0\n\t"
  547. "movq %%mm0, %0\n\t"
  548. :"+m"(*p)
  549. :"m"(*pix)
  550. :"memory");
  551. pix += line_size;
  552. p += line_size;
  553. }
  554. while (--h);
  555. }
  556. static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  557. {
  558. UINT8 *p;
  559. const UINT8 *pix;
  560. p = block;
  561. pix = pixels;
  562. MOVQ_ZERO(mm7);
  563. MOVQ_WONE(mm6);
  564. JUMPALIGN();
  565. do {
  566. __asm __volatile(
  567. "movq %1, %%mm1\n\t"
  568. "movq %0, %%mm0\n\t"
  569. "movq 1%1, %%mm4\n\t"
  570. "movq %%mm0, %%mm2\n\t"
  571. "movq %%mm1, %%mm3\n\t"
  572. "movq %%mm4, %%mm5\n\t"
  573. "punpcklbw %%mm7, %%mm1\n\t"
  574. "punpckhbw %%mm7, %%mm3\n\t"
  575. "punpcklbw %%mm7, %%mm4\n\t"
  576. "punpckhbw %%mm7, %%mm5\n\t"
  577. "punpcklbw %%mm7, %%mm0\n\t"
  578. "punpckhbw %%mm7, %%mm2\n\t"
  579. "paddusw %%mm4, %%mm1\n\t"
  580. "paddusw %%mm5, %%mm3\n\t"
  581. "paddusw %%mm6, %%mm1\n\t"
  582. "paddusw %%mm6, %%mm3\n\t"
  583. "psrlw $1, %%mm1\n\t"
  584. "psrlw $1, %%mm3\n\t"
  585. "paddusw %%mm6, %%mm0\n\t"
  586. "paddusw %%mm6, %%mm2\n\t"
  587. "paddusw %%mm1, %%mm0\n\t"
  588. "paddusw %%mm3, %%mm2\n\t"
  589. "psrlw $1, %%mm0\n\t"
  590. "psrlw $1, %%mm2\n\t"
  591. "packuswb %%mm2, %%mm0\n\t"
  592. "movq %%mm0, %0\n\t"
  593. :"+m"(*p)
  594. :"m"(*pix)
  595. :"memory");
  596. pix += line_size;
  597. p += line_size;
  598. } while (--h);
  599. }
  600. static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  601. {
  602. UINT8 *p;
  603. const UINT8 *pix;
  604. p = block;
  605. pix = pixels;
  606. MOVQ_ZERO(mm7);
  607. MOVQ_WONE(mm6);
  608. JUMPALIGN();
  609. do {
  610. __asm __volatile(
  611. "movq %1, %%mm1\n\t"
  612. "movq %0, %%mm0\n\t"
  613. "movq %2, %%mm4\n\t"
  614. "movq %%mm0, %%mm2\n\t"
  615. "movq %%mm1, %%mm3\n\t"
  616. "movq %%mm4, %%mm5\n\t"
  617. "punpcklbw %%mm7, %%mm1\n\t"
  618. "punpckhbw %%mm7, %%mm3\n\t"
  619. "punpcklbw %%mm7, %%mm4\n\t"
  620. "punpckhbw %%mm7, %%mm5\n\t"
  621. "punpcklbw %%mm7, %%mm0\n\t"
  622. "punpckhbw %%mm7, %%mm2\n\t"
  623. "paddusw %%mm4, %%mm1\n\t"
  624. "paddusw %%mm5, %%mm3\n\t"
  625. "paddusw %%mm6, %%mm1\n\t"
  626. "paddusw %%mm6, %%mm3\n\t"
  627. "psrlw $1, %%mm1\n\t"
  628. "psrlw $1, %%mm3\n\t"
  629. "paddusw %%mm6, %%mm0\n\t"
  630. "paddusw %%mm6, %%mm2\n\t"
  631. "paddusw %%mm1, %%mm0\n\t"
  632. "paddusw %%mm3, %%mm2\n\t"
  633. "psrlw $1, %%mm0\n\t"
  634. "psrlw $1, %%mm2\n\t"
  635. "packuswb %%mm2, %%mm0\n\t"
  636. "movq %%mm0, %0\n\t"
  637. :"+m"(*p)
  638. :"m"(*pix), "m"(*(pix+line_size))
  639. :"memory");
  640. pix += line_size;
  641. p += line_size ;
  642. } while(--h);
  643. }
  644. static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  645. {
  646. UINT8 *p;
  647. const UINT8 *pix;
  648. p = block;
  649. pix = pixels;
  650. MOVQ_ZERO(mm7);
  651. // this doesn't seem to be used offten - so
  652. // the inside usage of mm_wone is not optimized
  653. MOVQ_WTWO(mm6);
  654. do {
  655. __asm __volatile(
  656. "movq %1, %%mm0\n\t"
  657. "movq %2, %%mm1\n\t"
  658. "movq 1%1, %%mm4\n\t"
  659. "movq 1%2, %%mm5\n\t"
  660. "movq %%mm0, %%mm2\n\t"
  661. "movq %%mm1, %%mm3\n\t"
  662. "punpcklbw %%mm7, %%mm0\n\t"
  663. "punpcklbw %%mm7, %%mm1\n\t"
  664. "punpckhbw %%mm7, %%mm2\n\t"
  665. "punpckhbw %%mm7, %%mm3\n\t"
  666. "paddusw %%mm1, %%mm0\n\t"
  667. "paddusw %%mm3, %%mm2\n\t"
  668. "movq %%mm4, %%mm1\n\t"
  669. "movq %%mm5, %%mm3\n\t"
  670. "punpcklbw %%mm7, %%mm4\n\t"
  671. "punpcklbw %%mm7, %%mm5\n\t"
  672. "punpckhbw %%mm7, %%mm1\n\t"
  673. "punpckhbw %%mm7, %%mm3\n\t"
  674. "paddusw %%mm5, %%mm4\n\t"
  675. "paddusw %%mm3, %%mm1\n\t"
  676. "paddusw %%mm6, %%mm4\n\t"
  677. "paddusw %%mm6, %%mm1\n\t"
  678. "paddusw %%mm4, %%mm0\n\t"
  679. "paddusw %%mm1, %%mm2\n\t"
  680. "movq %3, %%mm5\n\t"
  681. "psrlw $2, %%mm0\n\t"
  682. "movq %0, %%mm1\n\t"
  683. "psrlw $2, %%mm2\n\t"
  684. "movq %%mm1, %%mm3\n\t"
  685. "punpcklbw %%mm7, %%mm1\n\t"
  686. "punpckhbw %%mm7, %%mm3\n\t"
  687. "paddusw %%mm1, %%mm0\n\t"
  688. "paddusw %%mm3, %%mm2\n\t"
  689. "paddusw %%mm5, %%mm0\n\t"
  690. "paddusw %%mm5, %%mm2\n\t"
  691. "psrlw $1, %%mm0\n\t"
  692. "psrlw $1, %%mm2\n\t"
  693. "packuswb %%mm2, %%mm0\n\t"
  694. "movq %%mm0, %0\n\t"
  695. :"+m"(*p)
  696. :"m"(*pix),
  697. "m"(*(pix+line_size)), "m"(mm_wone)
  698. :"memory");
  699. pix += line_size;
  700. p += line_size ;
  701. } while(--h);
  702. }
  703. static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  704. {
  705. UINT8 *p;
  706. const UINT8 *pix;
  707. p = block;
  708. pix = pixels;
  709. MOVQ_ZERO(mm7);
  710. do {
  711. __asm __volatile(
  712. "movq %1, %%mm0\n\t"
  713. "movq %0, %%mm1\n\t"
  714. "movq %%mm0, %%mm2\n\t"
  715. "movq %%mm1, %%mm3\n\t"
  716. "punpcklbw %%mm7, %%mm0\n\t"
  717. "punpcklbw %%mm7, %%mm1\n\t"
  718. "punpckhbw %%mm7, %%mm2\n\t"
  719. "punpckhbw %%mm7, %%mm3\n\t"
  720. "paddusw %%mm1, %%mm0\n\t"
  721. "paddusw %%mm3, %%mm2\n\t"
  722. "psrlw $1, %%mm0\n\t"
  723. "psrlw $1, %%mm2\n\t"
  724. "packuswb %%mm2, %%mm0\n\t"
  725. "movq %%mm0, %0\n\t"
  726. :"+m"(*p)
  727. :"m"(*pix)
  728. :"memory");
  729. pix += line_size;
  730. p += line_size ;
  731. } while (--h);
  732. }
  733. static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  734. {
  735. UINT8 *p;
  736. const UINT8 *pix;
  737. p = block;
  738. pix = pixels;
  739. MOVQ_ZERO(mm7);
  740. do {
  741. __asm __volatile(
  742. "movq %1, %%mm0\n\t"
  743. "movq 1%1, %%mm1\n\t"
  744. "movq %0, %%mm4\n\t"
  745. "movq %%mm0, %%mm2\n\t"
  746. "movq %%mm1, %%mm3\n\t"
  747. "movq %%mm4, %%mm5\n\t"
  748. "punpcklbw %%mm7, %%mm0\n\t"
  749. "punpcklbw %%mm7, %%mm1\n\t"
  750. "punpckhbw %%mm7, %%mm2\n\t"
  751. "punpckhbw %%mm7, %%mm3\n\t"
  752. "punpcklbw %%mm7, %%mm4\n\t"
  753. "punpckhbw %%mm7, %%mm5\n\t"
  754. "paddusw %%mm1, %%mm0\n\t"
  755. "paddusw %%mm3, %%mm2\n\t"
  756. "psrlw $1, %%mm0\n\t"
  757. "psrlw $1, %%mm2\n\t"
  758. "paddusw %%mm4, %%mm0\n\t"
  759. "paddusw %%mm5, %%mm2\n\t"
  760. "psrlw $1, %%mm0\n\t"
  761. "psrlw $1, %%mm2\n\t"
  762. "packuswb %%mm2, %%mm0\n\t"
  763. "movq %%mm0, %0\n\t"
  764. :"+m"(*p)
  765. :"m"(*pix)
  766. :"memory");
  767. pix += line_size;
  768. p += line_size;
  769. } while (--h);
  770. }
  771. static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  772. {
  773. UINT8 *p;
  774. const UINT8 *pix;
  775. p = block;
  776. pix = pixels;
  777. MOVQ_ZERO(mm7);
  778. do {
  779. __asm __volatile(
  780. "movq %1, %%mm0\n\t"
  781. "movq %2, %%mm1\n\t"
  782. "movq %0, %%mm4\n\t"
  783. "movq %%mm0, %%mm2\n\t"
  784. "movq %%mm1, %%mm3\n\t"
  785. "movq %%mm4, %%mm5\n\t"
  786. "punpcklbw %%mm7, %%mm0\n\t"
  787. "punpcklbw %%mm7, %%mm1\n\t"
  788. "punpckhbw %%mm7, %%mm2\n\t"
  789. "punpckhbw %%mm7, %%mm3\n\t"
  790. "punpcklbw %%mm7, %%mm4\n\t"
  791. "punpckhbw %%mm7, %%mm5\n\t"
  792. "paddusw %%mm1, %%mm0\n\t"
  793. "paddusw %%mm3, %%mm2\n\t"
  794. "psrlw $1, %%mm0\n\t"
  795. "psrlw $1, %%mm2\n\t"
  796. "paddusw %%mm4, %%mm0\n\t"
  797. "paddusw %%mm5, %%mm2\n\t"
  798. "psrlw $1, %%mm0\n\t"
  799. "psrlw $1, %%mm2\n\t"
  800. "packuswb %%mm2, %%mm0\n\t"
  801. "movq %%mm0, %0\n\t"
  802. :"+m"(*p)
  803. :"m"(*pix), "m"(*(pix+line_size))
  804. :"memory");
  805. pix += line_size;
  806. p += line_size ;
  807. } while(--h);
  808. }
  809. static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
  810. {
  811. UINT8 *p;
  812. const UINT8 *pix;
  813. p = block;
  814. pix = pixels;
  815. MOVQ_ZERO(mm7);
  816. MOVQ_WONE(mm6);
  817. JUMPALIGN();
  818. do {
  819. __asm __volatile(
  820. "movq %1, %%mm0\n\t"
  821. "movq %2, %%mm1\n\t"
  822. "movq 1%1, %%mm4\n\t"
  823. "movq 1%2, %%mm5\n\t"
  824. "movq %%mm0, %%mm2\n\t"
  825. "movq %%mm1, %%mm3\n\t"
  826. "punpcklbw %%mm7, %%mm0\n\t"
  827. "punpcklbw %%mm7, %%mm1\n\t"
  828. "punpckhbw %%mm7, %%mm2\n\t"
  829. "punpckhbw %%mm7, %%mm3\n\t"
  830. "paddusw %%mm1, %%mm0\n\t"
  831. "paddusw %%mm3, %%mm2\n\t"
  832. "movq %%mm4, %%mm1\n\t"
  833. "movq %%mm5, %%mm3\n\t"
  834. "punpcklbw %%mm7, %%mm4\n\t"
  835. "punpcklbw %%mm7, %%mm5\n\t"
  836. "punpckhbw %%mm7, %%mm1\n\t"
  837. "punpckhbw %%mm7, %%mm3\n\t"
  838. "paddusw %%mm5, %%mm4\n\t"
  839. "paddusw %%mm3, %%mm1\n\t"
  840. "paddusw %%mm6, %%mm4\n\t"
  841. "paddusw %%mm6, %%mm1\n\t"
  842. "paddusw %%mm4, %%mm0\n\t"
  843. "paddusw %%mm1, %%mm2\n\t"
  844. "movq %0, %%mm1\n\t"
  845. "psrlw $2, %%mm0\n\t"
  846. "movq %%mm1, %%mm3\n\t"
  847. "psrlw $2, %%mm2\n\t"
  848. "punpcklbw %%mm7, %%mm1\n\t"
  849. "punpckhbw %%mm7, %%mm3\n\t"
  850. "paddusw %%mm1, %%mm0\n\t"
  851. "paddusw %%mm3, %%mm2\n\t"
  852. "psrlw $1, %%mm0\n\t"
  853. "psrlw $1, %%mm2\n\t"
  854. "packuswb %%mm2, %%mm0\n\t"
  855. "movq %%mm0, %0\n\t"
  856. :"+m"(*p)
  857. :"m"(*pix),
  858. "m"(*(pix+line_size))
  859. :"memory");
  860. pix += line_size;
  861. p += line_size;
  862. } while(--h);
  863. }
  864. static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  865. {
  866. DCTELEM *p;
  867. const UINT8 *pix;
  868. p = block;
  869. pix = pixels;
  870. MOVQ_ZERO(mm7);
  871. do {
  872. __asm __volatile(
  873. "movq %0, %%mm0\n\t"
  874. "movq %1, %%mm2\n\t"
  875. "movq 8%0, %%mm1\n\t"
  876. "movq %%mm2, %%mm3\n\t"
  877. "punpcklbw %%mm7, %%mm2\n\t"
  878. "punpckhbw %%mm7, %%mm3\n\t"
  879. "psubsw %%mm2, %%mm0\n\t"
  880. "psubsw %%mm3, %%mm1\n\t"
  881. "movq %%mm0, %0\n\t"
  882. "movq %%mm1, 8%0\n\t"
  883. :"+m"(*p)
  884. :"m"(*pix)
  885. :"memory");
  886. pix += line_size;
  887. p += 8;
  888. } while (--h);
  889. }
  890. static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  891. {
  892. DCTELEM *p;
  893. const UINT8 *pix;
  894. p = block;
  895. pix = pixels;
  896. MOVQ_ZERO(mm7);
  897. MOVQ_WONE(mm6);
  898. JUMPALIGN();
  899. do {
  900. __asm __volatile(
  901. "movq %0, %%mm0\n\t"
  902. "movq %1, %%mm2\n\t"
  903. "movq 8%0, %%mm1\n\t"
  904. "movq 1%1, %%mm4\n\t"
  905. "movq %%mm2, %%mm3\n\t"
  906. "movq %%mm4, %%mm5\n\t"
  907. "punpcklbw %%mm7, %%mm2\n\t"
  908. "punpckhbw %%mm7, %%mm3\n\t"
  909. "punpcklbw %%mm7, %%mm4\n\t"
  910. "punpckhbw %%mm7, %%mm5\n\t"
  911. "paddusw %%mm4, %%mm2\n\t"
  912. "paddusw %%mm5, %%mm3\n\t"
  913. "paddusw %%mm6, %%mm2\n\t"
  914. "paddusw %%mm6, %%mm3\n\t"
  915. "psrlw $1, %%mm2\n\t"
  916. "psrlw $1, %%mm3\n\t"
  917. "psubsw %%mm2, %%mm0\n\t"
  918. "psubsw %%mm3, %%mm1\n\t"
  919. "movq %%mm0, %0\n\t"
  920. "movq %%mm1, 8%0\n\t"
  921. :"+m"(*p)
  922. :"m"(*pix)
  923. :"memory");
  924. pix += line_size;
  925. p += 8;
  926. } while (--h);
  927. }
  928. static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  929. {
  930. DCTELEM *p;
  931. const UINT8 *pix;
  932. p = block;
  933. pix = pixels;
  934. MOVQ_ZERO(mm7);
  935. MOVQ_WONE(mm6);
  936. do {
  937. __asm __volatile(
  938. "movq %0, %%mm0\n\t"
  939. "movq %1, %%mm2\n\t"
  940. "movq 8%0, %%mm1\n\t"
  941. "movq %2, %%mm4\n\t"
  942. "movq %%mm2, %%mm3\n\t"
  943. "movq %%mm4, %%mm5\n\t"
  944. "punpcklbw %%mm7, %%mm2\n\t"
  945. "punpckhbw %%mm7, %%mm3\n\t"
  946. "punpcklbw %%mm7, %%mm4\n\t"
  947. "punpckhbw %%mm7, %%mm5\n\t"
  948. "paddusw %%mm4, %%mm2\n\t"
  949. "paddusw %%mm5, %%mm3\n\t"
  950. "paddusw %%mm6, %%mm2\n\t"
  951. "paddusw %%mm6, %%mm3\n\t"
  952. "psrlw $1, %%mm2\n\t"
  953. "psrlw $1, %%mm3\n\t"
  954. "psubsw %%mm2, %%mm0\n\t"
  955. "psubsw %%mm3, %%mm1\n\t"
  956. "movq %%mm0, %0\n\t"
  957. "movq %%mm1, 8%0\n\t"
  958. :"+m"(*p)
  959. :"m"(*pix), "m"(*(pix+line_size))
  960. :"memory");
  961. pix += line_size;
  962. p += 8;
  963. } while (--h);
  964. }
  965. static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
  966. {
  967. DCTELEM *p;
  968. const UINT8 *pix;
  969. p = block;
  970. pix = pixels;
  971. MOVQ_ZERO(mm7);
  972. MOVQ_WTWO(mm6);
  973. JUMPALIGN();
  974. do {
  975. __asm __volatile(
  976. "movq %1, %%mm0\n\t"
  977. "movq %2, %%mm1\n\t"
  978. "movq 1%1, %%mm4\n\t"
  979. "movq 1%2, %%mm5\n\t"
  980. "movq %%mm0, %%mm2\n\t"
  981. "movq %%mm1, %%mm3\n\t"
  982. "punpcklbw %%mm7, %%mm0\n\t"
  983. "punpcklbw %%mm7, %%mm1\n\t"
  984. "punpckhbw %%mm7, %%mm2\n\t"
  985. "punpckhbw %%mm7, %%mm3\n\t"
  986. "paddusw %%mm1, %%mm0\n\t"
  987. "paddusw %%mm3, %%mm2\n\t"
  988. "movq %%mm4, %%mm1\n\t"
  989. "movq %%mm5, %%mm3\n\t"
  990. "punpcklbw %%mm7, %%mm4\n\t"
  991. "punpcklbw %%mm7, %%mm5\n\t"
  992. "punpckhbw %%mm7, %%mm1\n\t"
  993. "punpckhbw %%mm7, %%mm3\n\t"
  994. "paddusw %%mm5, %%mm4\n\t"
  995. "paddusw %%mm3, %%mm1\n\t"
  996. "paddusw %%mm6, %%mm4\n\t"
  997. "paddusw %%mm6, %%mm1\n\t"
  998. "paddusw %%mm4, %%mm0\n\t"
  999. "paddusw %%mm1, %%mm2\n\t"
  1000. "movq %0, %%mm1\n\t"
  1001. "movq 8%0, %%mm3\n\t"
  1002. "psrlw $2, %%mm0\n\t"
  1003. "psrlw $2, %%mm2\n\t"
  1004. "psubsw %%mm0, %%mm1\n\t"
  1005. "psubsw %%mm2, %%mm3\n\t"
  1006. "movq %%mm1, %0\n\t"
  1007. "movq %%mm3, 8%0\n\t"
  1008. :"+m"(*p)
  1009. :"m"(*pix),
  1010. "m"(*(pix+line_size))
  1011. :"memory");
  1012. pix += line_size;
  1013. p += 8 ;
  1014. } while(--h);
  1015. }
  1016. static void clear_blocks_mmx(DCTELEM *blocks)
  1017. {
  1018. asm volatile(
  1019. "pxor %%mm7, %%mm7 \n\t"
  1020. "movl $-128*6, %%eax \n\t"
  1021. "1: \n\t"
  1022. "movq %%mm7, (%0, %%eax) \n\t"
  1023. "movq %%mm7, 8(%0, %%eax) \n\t"
  1024. "movq %%mm7, 16(%0, %%eax) \n\t"
  1025. "movq %%mm7, 24(%0, %%eax) \n\t"
  1026. "addl $32, %%eax \n\t"
  1027. " js 1b \n\t"
  1028. : : "r" (((int)blocks)+128*6)
  1029. : "%eax"
  1030. );
  1031. }
  1032. #if 0
  1033. static void just_return() { return; }
  1034. #endif
  1035. void dsputil_init_mmx(void)
  1036. {
  1037. mm_flags = mm_support();
  1038. #if 1
  1039. printf("libavcodec: CPU flags:");
  1040. if (mm_flags & MM_MMX)
  1041. printf(" mmx");
  1042. if (mm_flags & MM_MMXEXT)
  1043. printf(" mmxext");
  1044. if (mm_flags & MM_3DNOW)
  1045. printf(" 3dnow");
  1046. if (mm_flags & MM_SSE)
  1047. printf(" sse");
  1048. if (mm_flags & MM_SSE2)
  1049. printf(" sse2");
  1050. printf("\n");
  1051. #endif
  1052. if (mm_flags & MM_MMX) {
  1053. get_pixels = get_pixels_mmx;
  1054. diff_pixels = diff_pixels_mmx;
  1055. put_pixels_clamped = put_pixels_clamped_mmx;
  1056. add_pixels_clamped = add_pixels_clamped_mmx;
  1057. clear_blocks= clear_blocks_mmx;
  1058. pix_abs16x16 = pix_abs16x16_mmx;
  1059. pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
  1060. pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
  1061. pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
  1062. pix_abs8x8 = pix_abs8x8_mmx;
  1063. pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
  1064. pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
  1065. pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
  1066. av_fdct = fdct_mmx;
  1067. put_pixels_tab[0] = put_pixels_mmx;
  1068. put_pixels_tab[1] = put_pixels_x2_mmx;
  1069. put_pixels_tab[2] = put_pixels_y2_mmx;
  1070. put_pixels_tab[3] = put_pixels_xy2_mmx;
  1071. put_no_rnd_pixels_tab[0] = put_pixels_mmx;
  1072. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
  1073. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
  1074. put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
  1075. avg_pixels_tab[0] = avg_pixels_mmx;
  1076. avg_pixels_tab[1] = avg_pixels_x2_mmx;
  1077. avg_pixels_tab[2] = avg_pixels_y2_mmx;
  1078. avg_pixels_tab[3] = avg_pixels_xy2_mmx;
  1079. avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
  1080. avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
  1081. avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
  1082. avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
  1083. sub_pixels_tab[0] = sub_pixels_mmx;
  1084. sub_pixels_tab[1] = sub_pixels_x2_mmx;
  1085. sub_pixels_tab[2] = sub_pixels_y2_mmx;
  1086. sub_pixels_tab[3] = sub_pixels_xy2_mmx;
  1087. if (mm_flags & MM_MMXEXT) {
  1088. pix_abs16x16 = pix_abs16x16_mmx2;
  1089. pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
  1090. pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
  1091. pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
  1092. pix_abs8x8 = pix_abs8x8_mmx2;
  1093. pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
  1094. pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
  1095. pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
  1096. put_pixels_tab[1] = put_pixels_x2_mmx2;
  1097. put_pixels_tab[2] = put_pixels_y2_mmx2;
  1098. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
  1099. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
  1100. avg_pixels_tab[0] = avg_pixels_mmx2;
  1101. avg_pixels_tab[1] = avg_pixels_x2_mmx2;
  1102. avg_pixels_tab[2] = avg_pixels_y2_mmx2;
  1103. avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
  1104. sub_pixels_tab[1] = sub_pixels_x2_mmx2;
  1105. sub_pixels_tab[2] = sub_pixels_y2_mmx2;
  1106. } else if (mm_flags & MM_3DNOW) {
  1107. put_pixels_tab[1] = put_pixels_x2_3dnow;
  1108. put_pixels_tab[2] = put_pixels_y2_3dnow;
  1109. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
  1110. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
  1111. avg_pixels_tab[0] = avg_pixels_3dnow;
  1112. avg_pixels_tab[1] = avg_pixels_x2_3dnow;
  1113. avg_pixels_tab[2] = avg_pixels_y2_3dnow;
  1114. avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
  1115. sub_pixels_tab[1] = sub_pixels_x2_3dnow;
  1116. sub_pixels_tab[2] = sub_pixels_y2_3dnow;
  1117. }
  1118. /* idct */
  1119. if (mm_flags & MM_MMXEXT) {
  1120. ff_idct = ff_mmxext_idct;
  1121. } else {
  1122. ff_idct = ff_mmx_idct;
  1123. }
  1124. #ifdef SIMPLE_IDCT
  1125. // ff_idct = simple_idct;
  1126. ff_idct = simple_idct_mmx;
  1127. #endif
  1128. }
  1129. #if 0
  1130. // for speed testing
  1131. get_pixels = just_return;
  1132. put_pixels_clamped = just_return;
  1133. add_pixels_clamped = just_return;
  1134. pix_abs16x16 = just_return;
  1135. pix_abs16x16_x2 = just_return;
  1136. pix_abs16x16_y2 = just_return;
  1137. pix_abs16x16_xy2 = just_return;
  1138. put_pixels_tab[0] = just_return;
  1139. put_pixels_tab[1] = just_return;
  1140. put_pixels_tab[2] = just_return;
  1141. put_pixels_tab[3] = just_return;
  1142. put_no_rnd_pixels_tab[0] = just_return;
  1143. put_no_rnd_pixels_tab[1] = just_return;
  1144. put_no_rnd_pixels_tab[2] = just_return;
  1145. put_no_rnd_pixels_tab[3] = just_return;
  1146. avg_pixels_tab[0] = just_return;
  1147. avg_pixels_tab[1] = just_return;
  1148. avg_pixels_tab[2] = just_return;
  1149. avg_pixels_tab[3] = just_return;
  1150. avg_no_rnd_pixels_tab[0] = just_return;
  1151. avg_no_rnd_pixels_tab[1] = just_return;
  1152. avg_no_rnd_pixels_tab[2] = just_return;
  1153. avg_no_rnd_pixels_tab[3] = just_return;
  1154. sub_pixels_tab[0] = just_return;
  1155. sub_pixels_tab[1] = just_return;
  1156. sub_pixels_tab[2] = just_return;
  1157. sub_pixels_tab[3] = just_return;
  1158. //av_fdct = just_return;
  1159. //ff_idct = just_return;
  1160. #endif
  1161. }