You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1127 lines
29KB

  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Gerard Lantau.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. #include "../dsputil.h"
  22. #include "../simple_idct.h"
  23. #include "../mangle.h"
  24. int mm_flags; /* multimedia extension flags */
  25. int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  26. int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  27. int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  28. int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  29. int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  30. int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  31. int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  32. int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  33. int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  34. int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  35. int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  36. int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
  37. int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  38. int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  39. int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  40. int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
  41. /* external functions, from idct_mmx.c */
  42. void ff_mmx_idct(DCTELEM *block);
  43. void ff_mmxext_idct(DCTELEM *block);
  44. /* pixel operations */
/* Rounding-bias constants, 8-byte aligned so a single movq can load them. */
static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; /* 8 bytes of 1 */
static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; /* 4 words of 1 */
static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; /* 4 words of 2 */
//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 };
//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 };

/* Emit an 8-byte alignment directive (used to align the loop entry points below). */
#define JUMPALIGN() __asm __volatile (".balign 8"::)
/* Clear an MMX register: regd = 0. */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

#ifndef PIC
/* Non-PIC build: load the bias constants straight from memory. */
#define MOVQ_WONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
/* NOTE: unlike MOVQ_WONE/MOVQ_WTWO above, this one expands to an asm
 * *string fragment* meant to be pasted inside a larger asm block, not to
 * a complete __asm statement. */
#define MOVQ_BONE(regd) "movq "MANGLE(mm_bone)", "#regd" \n\t"
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* Synthesize 0x0001000100010001: all-ones, then logical >>15 per word. */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)
/* Synthesize 0x0002000200020002: word 1s, then <<1. */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd ::)
/* Synthesize 0x0101010101010101: word 1s packed (with unsigned saturation)
 * down to byte 1s.  String fragment, same as the non-PIC variant. */
#define MOVQ_BONE(regd) \
    "pcmpeqd " #regd ", " #regd " \n\t" \
    "psrlw $15, " #regd " \n\t"\
    "packuswb " #regd ", " #regd " \n\t"
#endif
  73. /***********************************/
  74. /* 3Dnow specific */
  75. #define DEF(x) x ## _3dnow
  76. /* for Athlons PAVGUSB is prefered */
  77. #define PAVGB "pavgusb"
  78. #include "dsputil_mmx_avg.h"
  79. #undef DEF
  80. #undef PAVGB
  81. /***********************************/
  82. /* MMX2 specific */
  83. #define DEF(x) x ## _mmx2
  84. /* Introduced only in MMX2 set */
  85. #define PAVGB "pavgb"
  86. #include "dsputil_mmx_avg.h"
  87. #undef DEF
  88. #undef PAVGB
  89. /***********************************/
  90. /* standard MMX */
/* Expand an 8x8 block of unsigned bytes into 64 16-bit DCTELEMs.
 * mm7 is kept zero; punpck{l,h}bw against it zero-extends bytes to words.
 * Two source rows are handled per iteration; %eax runs from -128 to 0 as
 * a byte offset relative to block+64 (one past the end of the output). */
static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
{
    asm volatile(
        "movl $-128, %%eax          \n\t"
        "pxor %%mm7, %%mm7          \n\t"
        ".balign 16                 \n\t"
        "1:                         \n\t"
        "movq (%0), %%mm0           \n\t" /* row 0 */
        "movq (%0, %2), %%mm2       \n\t" /* row 1 (line_size away) */
        "movq %%mm0, %%mm1          \n\t"
        "movq %%mm2, %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpckhbw %%mm7, %%mm1     \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "movq %%mm0, (%1, %%eax)    \n\t"
        "movq %%mm1, 8(%1, %%eax)   \n\t"
        "movq %%mm2, 16(%1, %%eax)  \n\t"
        "movq %%mm3, 24(%1, %%eax)  \n\t"
        "addl %3, %0                \n\t" /* advance source by 2 rows */
        "addl $32, %%eax            \n\t"
        "js 1b                      \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" (line_size), "r" (line_size*2)
        : "%eax"
    );
}
/* block[i] = s1[i] - s2[i] for an 8x8 region, widened to 16-bit DCTELEMs.
 * One row (8 pixels) per iteration; %eax runs -128..0 relative to block+64. */
static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7          \n\t"
        "movl $-128, %%eax          \n\t"
        ".balign 16                 \n\t"
        "1:                         \n\t"
        "movq (%0), %%mm0           \n\t"
        "movq (%1), %%mm2           \n\t"
        "movq %%mm0, %%mm1          \n\t"
        "movq %%mm2, %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0     \n\t"
        "punpckhbw %%mm7, %%mm1     \n\t"
        "punpcklbw %%mm7, %%mm2     \n\t"
        "punpckhbw %%mm7, %%mm3     \n\t"
        "psubw %%mm2, %%mm0         \n\t" /* word-wise s1 - s2 */
        "psubw %%mm3, %%mm1         \n\t"
        "movq %%mm0, (%2, %%eax)    \n\t"
        "movq %%mm1, 8(%2, %%eax)   \n\t"
        "addl %3, %0                \n\t"
        "addl %3, %1                \n\t"
        "addl $16, %%eax            \n\t"
        "jnz 1b                     \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" (stride)
        : "%eax"
    );
}
/* Store a 64-entry DCTELEM block as 8x8 unsigned bytes, clamping each
 * 16-bit value to 0..255 via packuswb's unsigned saturation.
 * Done as two 4-row halves, each a fully unrolled asm statement. */
static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;
    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop: first four rows; %3 is an "m" operand so 8%3, 16%3...
     * address successive quadwords of the block */
    __asm __volatile(
        "movq %3, %%mm0\n\t"
        "movq 8%3, %%mm1\n\t"
        "movq 16%3, %%mm2\n\t"
        "movq 24%3, %%mm3\n\t"
        "movq 32%3, %%mm4\n\t"
        "movq 40%3, %%mm5\n\t"
        "movq 48%3, %%mm6\n\t"
        "movq 56%3, %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t" /* clamp to unsigned bytes */
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq %%mm0, (%0)\n\t"
        "movq %%mm2, (%0, %1)\n\t"
        "movq %%mm4, (%0, %1, 2)\n\t"
        "movq %%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;
    // if here would be an exact copy of the code above
    // compiler would generate some very strange code
    // thus using "r"
    __asm __volatile(
        "movq (%3), %%mm0\n\t"
        "movq 8(%3), %%mm1\n\t"
        "movq 16(%3), %%mm2\n\t"
        "movq 24(%3), %%mm3\n\t"
        "movq 32(%3), %%mm4\n\t"
        "movq 40(%3), %%mm5\n\t"
        "movq 48(%3), %%mm6\n\t"
        "movq 56(%3), %%mm7\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "packuswb %%mm3, %%mm2\n\t"
        "packuswb %%mm5, %%mm4\n\t"
        "packuswb %%mm7, %%mm6\n\t"
        "movq %%mm0, (%0)\n\t"
        "movq %%mm2, (%0, %1)\n\t"
        "movq %%mm4, (%0, %1, 2)\n\t"
        "movq %%mm6, (%0, %2)\n\t"
        ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
        :"memory");
}
/* pixels[i] = clamp(pixels[i] + block[i]) over an 8x8 region.
 * Existing pixel bytes are widened against mm7 == 0, added with signed
 * saturation (paddsw), then re-clamped to bytes with packuswb.
 * Two rows per iteration, four iterations. */
static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
{
    const DCTELEM *p;
    UINT8 *pix;
    int i;
    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
            "movq (%2), %%mm0\n\t"      /* block row 0, low/high words */
            "movq 8(%2), %%mm1\n\t"
            "movq 16(%2), %%mm2\n\t"    /* block row 1 */
            "movq 24(%2), %%mm3\n\t"
            "movq %0, %%mm4\n\t"        /* dst row 0 */
            "movq %1, %%mm6\n\t"        /* dst row 1 */
            "movq %%mm4, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw %%mm4, %%mm0\n\t"
            "paddsw %%mm5, %%mm1\n\t"
            "movq %%mm6, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddsw %%mm6, %%mm2\n\t"
            "paddsw %%mm5, %%mm3\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "packuswb %%mm3, %%mm2\n\t"
            "movq %%mm0, %0\n\t"
            "movq %%mm2, %1\n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}
/* Straight 8-byte-wide copy of h rows: block[x] = pixels[x].
 * NOTE(review): source and destination are both stepped by line_size,
 * i.e. the destination is assumed to share the source stride. */
static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
#if 0 //FIXME h==4 case
    /* disabled 8-rows-per-iteration variant; requires h % 8 == 0 */
    asm volatile(
        "xorl %%eax, %%eax          \n\t"
        "movl %3, %%esi             \n\t"
        "1:                         \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "subl $8, %%esi             \n\t"
        " jnz 1b                    \n\t"
        :: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
        : "%eax", "%esi", "memory"
    );
#else
    /* four rows per iteration; assumes h % 4 == 0 */
    asm volatile(
        "xorl %%eax, %%eax          \n\t" /* common byte offset for src/dst */
        "movl %3, %%esi             \n\t" /* row counter = h */
        "1:                         \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "movq (%1, %%eax), %%mm0    \n\t"
        "movq %%mm0, (%0, %%eax)    \n\t"
        "addl %2, %%eax             \n\t"
        "subl $4, %%esi             \n\t"
        " jnz 1b                    \n\t"
        :: "r" (block), "r" (pixels), "r"(line_size), "m"(h)
        : "%eax", "%esi", "memory"
    );
#endif
}
/* Horizontal half-pel interpolation with rounding:
 * dst[x] = (src[x] + src[x+1] + 1) >> 1, 8 pixels wide, h rows.
 * mm4 holds the word bias 1, mm7 stays zero for byte->word unpacking. */
static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    MOVQ_WONE(mm4);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"        /* src[x]   */
            "movq 1%1, %%mm1\n\t"       /* src[x+1] (unaligned, offset 1) */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "paddusw %%mm4, %%mm0\n\t"  /* +1 rounding bias */
            "paddusw %%mm4, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"=m"(*p)
            :"m"(*pix)
            :"memory");
        pix += line_size; p += line_size;
    } while (--h);
}
/* Vertical half-pel interpolation with rounding:
 * dst[x] = (src[x] + src[x+line_size] + 1) >> 1, 8 pixels wide, h rows. */
static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    MOVQ_WONE(mm4);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"        /* current row */
            "movq %2, %%mm1\n\t"        /* row below */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "paddusw %%mm4, %%mm0\n\t"  /* +1 rounding bias */
            "paddusw %%mm4, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"=m"(*p)
            :"m"(*pix),
             "m"(*(pix+line_size))
            :"memory");
        pix += line_size;
        p += line_size;
    } while (--h);
}
/* Diagonal half-pel interpolation with rounding:
 * dst[x] = (a + b + c + d + 2) >> 2 where a,b are the current row at
 * x and x+1 and c,d are the row below; mm6 holds the word bias 2. */
static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    MOVQ_WTWO(mm6);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"        /* row0[x]   */
            "movq %2, %%mm1\n\t"        /* row1[x]   */
            "movq 1%1, %%mm4\n\t"       /* row0[x+1] */
            "movq 1%2, %%mm5\n\t"       /* row1[x+1] */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"  /* row0[x] + row1[x] */
            "paddusw %%mm3, %%mm2\n\t"
            "movq %%mm4, %%mm1\n\t"
            "movq %%mm5, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm5\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm5, %%mm4\n\t"  /* row0[x+1] + row1[x+1] */
            "paddusw %%mm3, %%mm1\n\t"
            "paddusw %%mm6, %%mm4\n\t"  /* +2 rounding bias */
            "paddusw %%mm6, %%mm1\n\t"
            "paddusw %%mm4, %%mm0\n\t"
            "paddusw %%mm1, %%mm2\n\t"
            "psrlw $2, %%mm0\n\t"
            "psrlw $2, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"=m"(*p)
            :"m"(*pix),
             "m"(*(pix+line_size))
            :"memory");
        pix += line_size;
        p += line_size;
    } while(--h);
}
/* Horizontal half-pel interpolation WITHOUT rounding (truncating):
 * dst[x] = (src[x] + src[x+1]) >> 1.  Same as put_pixels_x2_mmx but
 * with no +1 bias added before the shift. */
static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"
            "movq 1%1, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"=m"(*p)
            :"m"(*pix)
            :"memory");
        pix += line_size;
        p += line_size;
    } while (--h);
}
/* Vertical half-pel interpolation WITHOUT rounding (truncating):
 * dst[x] = (src[x] + src[x+line_size]) >> 1. */
static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"        /* current row */
            "movq %2, %%mm1\n\t"        /* row below */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"=m"(*p)
            :"m"(*pix),
             "m"(*(pix+line_size))
            :"memory");
        pix += line_size;
        p += line_size;
    } while(--h);
}
/* Diagonal half-pel interpolation with reduced rounding:
 * dst[x] = (a + b + c + d + 1) >> 2 (bias 1 in mm6 instead of the
 * rounded variant's 2). */
static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    MOVQ_WONE(mm6);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"        /* row0[x]   */
            "movq %2, %%mm1\n\t"        /* row1[x]   */
            "movq 1%1, %%mm4\n\t"       /* row0[x+1] */
            "movq 1%2, %%mm5\n\t"       /* row1[x+1] */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "movq %%mm4, %%mm1\n\t"
            "movq %%mm5, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm5\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm5, %%mm4\n\t"
            "paddusw %%mm3, %%mm1\n\t"
            "paddusw %%mm6, %%mm4\n\t"  /* +1 bias only */
            "paddusw %%mm6, %%mm1\n\t"
            "paddusw %%mm4, %%mm0\n\t"
            "paddusw %%mm1, %%mm2\n\t"
            "psrlw $2, %%mm0\n\t"
            "psrlw $2, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"=m"(*p)
            :"m"(*pix),
             "m"(*(pix+line_size))
            :"memory");
        pix += line_size;
        p += line_size;
    } while(--h);
}
/* Rounded average of destination and source:
 * dst[x] = (dst[x] + src[x] + 1) >> 1, 8 pixels wide, h rows.
 * mm6 holds the word bias 1. */
static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    MOVQ_WONE(mm6);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %0, %%mm0\n\t"        /* dst (read-modify-write) */
            "movq %1, %%mm1\n\t"        /* src */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "paddusw %%mm6, %%mm0\n\t"  /* +1 rounding bias */
            "paddusw %%mm6, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"+m"(*p)
            :"m"(*pix)
            :"memory");
        pix += line_size;
        p += line_size;
    }
    while (--h);
}
/* Rounded average of dst with the horizontally interpolated source:
 * t = (src[x] + src[x+1] + 1) >> 1;  dst[x] = (dst[x] + t + 1) >> 1.
 * mm6 holds the word bias 1 and is applied to both averaging steps. */
static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    MOVQ_WONE(mm6);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm1\n\t"        /* src[x]   */
            "movq %0, %%mm0\n\t"        /* dst      */
            "movq 1%1, %%mm4\n\t"       /* src[x+1] */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "paddusw %%mm4, %%mm1\n\t"  /* src[x] + src[x+1] */
            "paddusw %%mm5, %%mm3\n\t"
            "paddusw %%mm6, %%mm1\n\t"  /* +1, first average */
            "paddusw %%mm6, %%mm3\n\t"
            "psrlw $1, %%mm1\n\t"
            "psrlw $1, %%mm3\n\t"
            "paddusw %%mm6, %%mm0\n\t"  /* +1, second average */
            "paddusw %%mm6, %%mm2\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"+m"(*p)
            :"m"(*pix)
            :"memory");
        pix += line_size;
        p += line_size;
    } while (--h);
}
/* Rounded average of dst with the vertically interpolated source:
 * t = (src[x] + src[x+line_size] + 1) >> 1;  dst[x] = (dst[x] + t + 1) >> 1. */
static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    MOVQ_WONE(mm6);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm1\n\t"        /* src row */
            "movq %0, %%mm0\n\t"        /* dst */
            "movq %2, %%mm4\n\t"        /* src row below */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "paddusw %%mm4, %%mm1\n\t"
            "paddusw %%mm5, %%mm3\n\t"
            "paddusw %%mm6, %%mm1\n\t"  /* +1, first average */
            "paddusw %%mm6, %%mm3\n\t"
            "psrlw $1, %%mm1\n\t"
            "psrlw $1, %%mm3\n\t"
            "paddusw %%mm6, %%mm0\n\t"  /* +1, second average */
            "paddusw %%mm6, %%mm2\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"+m"(*p)
            :"m"(*pix), "m"(*(pix+line_size))
            :"memory");
        pix += line_size;
        p += line_size ;
    } while(--h);
}
/* Rounded average of dst with the diagonally interpolated source:
 * t = (a + b + c + d + 2) >> 2;  dst[x] = (dst[x] + t + 1) >> 1.
 * mm6 holds the word bias 2; the word bias 1 (mm_wone, operand %3) is
 * loaded from memory into mm5 mid-block for the final average. */
static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    // this doesn't seem to be used often - so
    // the inside usage of mm_wone is not optimized
    MOVQ_WTWO(mm6);
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"        /* row0[x]   */
            "movq %2, %%mm1\n\t"        /* row1[x]   */
            "movq 1%1, %%mm4\n\t"       /* row0[x+1] */
            "movq 1%2, %%mm5\n\t"       /* row1[x+1] */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "movq %%mm4, %%mm1\n\t"
            "movq %%mm5, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm5\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm5, %%mm4\n\t"
            "paddusw %%mm3, %%mm1\n\t"
            "paddusw %%mm6, %%mm4\n\t"  /* +2 for the 4-tap average */
            "paddusw %%mm6, %%mm1\n\t"
            "paddusw %%mm4, %%mm0\n\t"
            "paddusw %%mm1, %%mm2\n\t"
            "movq %3, %%mm5\n\t"        /* reload mm5 with word bias 1 */
            "psrlw $2, %%mm0\n\t"
            "movq %0, %%mm1\n\t"        /* dst */
            "psrlw $2, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "paddusw %%mm5, %%mm0\n\t"  /* +1 for the dst average */
            "paddusw %%mm5, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"+m"(*p)
            :"m"(*pix),
             "m"(*(pix+line_size)), "m"(mm_wone)
            :"memory");
        pix += line_size;
        p += line_size ;
    } while(--h);
}
/* Truncating average of destination and source:
 * dst[x] = (dst[x] + src[x]) >> 1 (no rounding bias). */
static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"        /* src */
            "movq %0, %%mm1\n\t"        /* dst */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"+m"(*p)
            :"m"(*pix)
            :"memory");
        pix += line_size;
        p += line_size ;
    } while (--h);
}
/* Truncating average of dst with the truncating horizontal interpolation:
 * t = (src[x] + src[x+1]) >> 1;  dst[x] = (dst[x] + t) >> 1. */
static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"        /* src[x]   */
            "movq 1%1, %%mm1\n\t"       /* src[x+1] */
            "movq %0, %%mm4\n\t"        /* dst      */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "paddusw %%mm4, %%mm0\n\t"
            "paddusw %%mm5, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"+m"(*p)
            :"m"(*pix)
            :"memory");
        pix += line_size;
        p += line_size;
    } while (--h);
}
/* Truncating average of dst with the truncating vertical interpolation:
 * t = (src[x] + src[x+line_size]) >> 1;  dst[x] = (dst[x] + t) >> 1. */
static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"        /* src row */
            "movq %2, %%mm1\n\t"        /* src row below */
            "movq %0, %%mm4\n\t"        /* dst */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm5\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpckhbw %%mm7, %%mm5\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "paddusw %%mm4, %%mm0\n\t"
            "paddusw %%mm5, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"+m"(*p)
            :"m"(*pix), "m"(*(pix+line_size))
            :"memory");
        pix += line_size;
        p += line_size ;
    } while(--h);
}
/* Truncating average of dst with the diagonal interpolation:
 * t = (a + b + c + d + 1) >> 2 (bias 1 in mm6);  dst[x] = (dst[x] + t) >> 1. */
static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h)
{
    UINT8 *p;
    const UINT8 *pix;
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    MOVQ_WONE(mm6);
    JUMPALIGN();
    do {
        __asm __volatile(
            "movq %1, %%mm0\n\t"        /* row0[x]   */
            "movq %2, %%mm1\n\t"        /* row1[x]   */
            "movq 1%1, %%mm4\n\t"       /* row0[x+1] */
            "movq 1%2, %%mm5\n\t"       /* row1[x+1] */
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "movq %%mm4, %%mm1\n\t"
            "movq %%mm5, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm4\n\t"
            "punpcklbw %%mm7, %%mm5\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm5, %%mm4\n\t"
            "paddusw %%mm3, %%mm1\n\t"
            "paddusw %%mm6, %%mm4\n\t"  /* +1 bias for the 4-tap average */
            "paddusw %%mm6, %%mm1\n\t"
            "paddusw %%mm4, %%mm0\n\t"
            "paddusw %%mm1, %%mm2\n\t"
            "movq %0, %%mm1\n\t"        /* dst */
            "psrlw $2, %%mm0\n\t"
            "movq %%mm1, %%mm3\n\t"
            "psrlw $2, %%mm2\n\t"
            "punpcklbw %%mm7, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "paddusw %%mm1, %%mm0\n\t"
            "paddusw %%mm3, %%mm2\n\t"
            "psrlw $1, %%mm0\n\t"       /* truncating dst average */
            "psrlw $1, %%mm2\n\t"
            "packuswb %%mm2, %%mm0\n\t"
            "movq %%mm0, %0\n\t"
            :"+m"(*p)
            :"m"(*pix),
             "m"(*(pix+line_size))
            :"memory");
        pix += line_size;
        p += line_size;
    } while(--h);
}
  864. static void clear_blocks_mmx(DCTELEM *blocks)
  865. {
  866. asm volatile(
  867. "pxor %%mm7, %%mm7 \n\t"
  868. "movl $-128*6, %%eax \n\t"
  869. "1: \n\t"
  870. "movq %%mm7, (%0, %%eax) \n\t"
  871. "movq %%mm7, 8(%0, %%eax) \n\t"
  872. "movq %%mm7, 16(%0, %%eax) \n\t"
  873. "movq %%mm7, 24(%0, %%eax) \n\t"
  874. "addl $32, %%eax \n\t"
  875. " js 1b \n\t"
  876. : : "r" (((int)blocks)+128*6)
  877. : "%eax"
  878. );
  879. }
  880. #if 0
  881. static void just_return() { return; }
  882. #endif
  883. #ifndef TESTCPU_MAIN
/* Detect CPU SIMD capabilities via mm_support() and install the fastest
 * available implementation into the global dsputil function pointers.
 * Baseline MMX routines are installed first, then selectively overridden
 * by MMX2 (a.k.a. MMXEXT) or 3DNow! variants. */
void dsputil_init_mmx(void)
{
    mm_flags = mm_support();
#if 1
    /* report detected flags (compile-time toggled diagnostic) */
    printf("libavcodec: CPU flags:");
    if (mm_flags & MM_MMX)
        printf(" mmx");
    if (mm_flags & MM_MMXEXT)
        printf(" mmxext");
    if (mm_flags & MM_3DNOW)
        printf(" 3dnow");
    if (mm_flags & MM_SSE)
        printf(" sse");
    if (mm_flags & MM_SSE2)
        printf(" sse2");
    printf("\n");
#endif
    if (mm_flags & MM_MMX) {
        /* baseline MMX implementations */
        get_pixels = get_pixels_mmx;
        diff_pixels = diff_pixels_mmx;
        put_pixels_clamped = put_pixels_clamped_mmx;
        add_pixels_clamped = add_pixels_clamped_mmx;
        clear_blocks= clear_blocks_mmx;
        pix_abs16x16 = pix_abs16x16_mmx;
        pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
        pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
        pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
        pix_abs8x8 = pix_abs8x8_mmx;
        pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
        pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
        pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
        av_fdct = fdct_mmx;
        /* tab index: 0 = full-pel, 1 = x half-pel, 2 = y half-pel, 3 = xy */
        put_pixels_tab[0] = put_pixels_mmx;
        put_pixels_tab[1] = put_pixels_x2_mmx;
        put_pixels_tab[2] = put_pixels_y2_mmx;
        put_pixels_tab[3] = put_pixels_xy2_mmx;
        put_no_rnd_pixels_tab[0] = put_pixels_mmx;
        put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
        put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
        put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx;
        avg_pixels_tab[0] = avg_pixels_mmx;
        avg_pixels_tab[1] = avg_pixels_x2_mmx;
        avg_pixels_tab[2] = avg_pixels_y2_mmx;
        avg_pixels_tab[3] = avg_pixels_xy2_mmx;
        avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx;
        avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx;
        avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx;
        avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx;
        if (mm_flags & MM_MMXEXT) {
            /* MMX2 overrides (pavgb etc.) */
            pix_abs16x16 = pix_abs16x16_mmx2;
            pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
            pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
            pix_abs16x16_xy2= pix_abs16x16_xy2_mmx2;
            pix_abs8x8 = pix_abs8x8_mmx2;
            pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
            pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
            pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2;
            put_pixels_tab[1] = put_pixels_x2_mmx2;
            put_pixels_tab[2] = put_pixels_y2_mmx2;
            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2;
            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2;
            avg_pixels_tab[0] = avg_pixels_mmx2;
            avg_pixels_tab[1] = avg_pixels_x2_mmx2;
            avg_pixels_tab[2] = avg_pixels_y2_mmx2;
            avg_pixels_tab[3] = avg_pixels_xy2_mmx2;
        } else if (mm_flags & MM_3DNOW) {
            /* 3DNow! overrides (pavgusb) */
            put_pixels_tab[1] = put_pixels_x2_3dnow;
            put_pixels_tab[2] = put_pixels_y2_3dnow;
            put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow;
            put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow;
            avg_pixels_tab[0] = avg_pixels_3dnow;
            avg_pixels_tab[1] = avg_pixels_x2_3dnow;
            avg_pixels_tab[2] = avg_pixels_y2_3dnow;
            avg_pixels_tab[3] = avg_pixels_xy2_3dnow;
        }
        /* idct */
        if (mm_flags & MM_MMXEXT) {
            ff_idct = ff_mmxext_idct;
        } else {
            ff_idct = ff_mmx_idct;
        }
#ifdef SIMPLE_IDCT
        /* SIMPLE_IDCT build: unconditionally prefer the simple MMX idct */
//        ff_idct = simple_idct;
        ff_idct = simple_idct_mmx;
#endif
    }
#if 0
    // for speed testing
    get_pixels = just_return;
    put_pixels_clamped = just_return;
    add_pixels_clamped = just_return;
    pix_abs16x16 = just_return;
    pix_abs16x16_x2 = just_return;
    pix_abs16x16_y2 = just_return;
    pix_abs16x16_xy2 = just_return;
    put_pixels_tab[0] = just_return;
    put_pixels_tab[1] = just_return;
    put_pixels_tab[2] = just_return;
    put_pixels_tab[3] = just_return;
    put_no_rnd_pixels_tab[0] = just_return;
    put_no_rnd_pixels_tab[1] = just_return;
    put_no_rnd_pixels_tab[2] = just_return;
    put_no_rnd_pixels_tab[3] = just_return;
    avg_pixels_tab[0] = just_return;
    avg_pixels_tab[1] = just_return;
    avg_pixels_tab[2] = just_return;
    avg_pixels_tab[3] = just_return;
    avg_no_rnd_pixels_tab[0] = just_return;
    avg_no_rnd_pixels_tab[1] = just_return;
    avg_no_rnd_pixels_tab[2] = just_return;
    avg_no_rnd_pixels_tab[3] = just_return;
    //av_fdct = just_return;
    //ff_idct = just_return;
#endif
}
  999. /* remove any non bit exact operation (testing purpose). NOTE that
  1000. this function should be kept as small as possible because it is
  1001. always difficult to test automatically non bit exact cases. */
  1002. void dsputil_set_bit_exact_mmx(void)
  1003. {
  1004. if (mm_flags & MM_MMX) {
  1005. if (mm_flags & MM_MMXEXT) {
  1006. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
  1007. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
  1008. avg_pixels_tab[3] = avg_pixels_xy2_mmx;
  1009. } else if (mm_flags & MM_3DNOW) {
  1010. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx;
  1011. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx;
  1012. avg_pixels_tab[3] = avg_pixels_xy2_mmx;
  1013. }
  1014. }
  1015. }
  1016. #else // TESTCPU_MAIN
  1017. /*
  1018. * for testing speed of various routine - should be probably extended
  1019. * for a general purpose regression test later
  1020. *
  1021. * for now use it this way:
  1022. *
  1023. * gcc -O4 -fomit-frame-pointer -DHAVE_AV_CONFIG_H -DTESTCPU_MAIN -I../.. -o test dsputil_mmx.c
  1024. *
  1025. * in libavcodec/i386 directory - then run ./test
  1026. */
/* Read the CPU timestamp counter; "=A" returns the 64-bit EDX:EAX pair. */
static inline long long rdtsc()
{
    long long l;
    asm volatile( "rdtsc\n\t"
        : "=A" (l)
    );
    return l;
}
  1035. int main(int argc, char* argv[])
  1036. {
  1037. volatile int v;
  1038. int i;
  1039. const int linesize = 720;
  1040. char empty[32768];
  1041. uint64_t te, ts = rdtsc();
  1042. char* im, *bu = empty;
  1043. op_pixels_func fc = put_pixels_y2_mmx2;
  1044. bu += 32;
  1045. bu =(char*)(((long)bu) & ~0xf); // 16 bytes alignment
  1046. im = bu;
  1047. for(i=0; i<1000000; i++){
  1048. fc(im, im + 1000, linesize, 16);
  1049. im += 4; //
  1050. if (im > bu + 10000)
  1051. im = bu;
  1052. }
  1053. te = rdtsc();
  1054. printf("CPU Ticks: %7d\n", (int)(te - ts));
  1055. }
  1056. #endif