  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. *
  19. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. #include "../dsputil.h"
  22. #include "../simple_idct.h"
  23. extern const uint8_t ff_h263_loop_filter_strength[32];
  24. int mm_flags; /* multimedia extension flags */
  25. /* pixel operations */
  26. static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
  27. static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
  28. static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
  29. static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
  30. static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
  31. static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
  32. static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
  33. static const uint64_t ff_pb_FC __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
  34. #define JUMPALIGN() __asm __volatile (".balign 8"::)
  35. #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
  36. #define MOVQ_WONE(regd) \
  37. __asm __volatile ( \
  38. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  39. "psrlw $15, %%" #regd ::)
  40. #define MOVQ_BFE(regd) \
  41. __asm __volatile ( \
  42. "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
  43. "paddb %%" #regd ", %%" #regd " \n\t" ::)
  44. #ifndef PIC
  45. #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
  46. #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
  47. #else
48. // for shared libraries it's better to access the constants this way
  49. // pcmpeqd -> -1
  50. #define MOVQ_BONE(regd) \
  51. __asm __volatile ( \
  52. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  53. "psrlw $15, %%" #regd " \n\t" \
  54. "packuswb %%" #regd ", %%" #regd " \n\t" ::)
  55. #define MOVQ_WTWO(regd) \
  56. __asm __volatile ( \
  57. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  58. "psrlw $15, %%" #regd " \n\t" \
  59. "psllw $1, %%" #regd " \n\t"::)
  60. #endif
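// Values produced by the register-synthesized constants above (illustrative
// sketch, not part of the original source):
//   MOVQ_WONE: pcmpeqd (all ones), psrlw $15         -> 0x0001000100010001
//   MOVQ_BFE:  pcmpeqd, paddb                        -> 0xFEFEFEFEFEFEFEFE
//   MOVQ_BONE (PIC): pcmpeqd, psrlw $15, packuswb    -> 0x0101010101010101
//   MOVQ_WTWO (PIC): pcmpeqd, psrlw $15, psllw $1    -> 0x0002000200020002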
61. // regr is used as a temporary and holds the output result
62. // the first argument is unmodified and the second is trashed
63. // regfe is supposed to contain 0xfefefefefefefefe
  64. #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
  65. "movq " #rega ", " #regr " \n\t"\
  66. "pand " #regb ", " #regr " \n\t"\
  67. "pxor " #rega ", " #regb " \n\t"\
  68. "pand " #regfe "," #regb " \n\t"\
  69. "psrlq $1, " #regb " \n\t"\
  70. "paddb " #regb ", " #regr " \n\t"
  71. #define PAVGB_MMX(rega, regb, regr, regfe) \
  72. "movq " #rega ", " #regr " \n\t"\
  73. "por " #regb ", " #regr " \n\t"\
  74. "pxor " #rega ", " #regb " \n\t"\
  75. "pand " #regfe "," #regb " \n\t"\
  76. "psrlq $1, " #regb " \n\t"\
  77. "psubb " #regb ", " #regr " \n\t"
  78. // mm6 is supposed to contain 0xfefefefefefefefe
  79. #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
  80. "movq " #rega ", " #regr " \n\t"\
  81. "movq " #regc ", " #regp " \n\t"\
  82. "pand " #regb ", " #regr " \n\t"\
  83. "pand " #regd ", " #regp " \n\t"\
  84. "pxor " #rega ", " #regb " \n\t"\
  85. "pxor " #regc ", " #regd " \n\t"\
  86. "pand %%mm6, " #regb " \n\t"\
  87. "pand %%mm6, " #regd " \n\t"\
  88. "psrlq $1, " #regb " \n\t"\
  89. "psrlq $1, " #regd " \n\t"\
  90. "paddb " #regb ", " #regr " \n\t"\
  91. "paddb " #regd ", " #regp " \n\t"
  92. #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
  93. "movq " #rega ", " #regr " \n\t"\
  94. "movq " #regc ", " #regp " \n\t"\
  95. "por " #regb ", " #regr " \n\t"\
  96. "por " #regd ", " #regp " \n\t"\
  97. "pxor " #rega ", " #regb " \n\t"\
  98. "pxor " #regc ", " #regd " \n\t"\
  99. "pand %%mm6, " #regb " \n\t"\
  100. "pand %%mm6, " #regd " \n\t"\
  101. "psrlq $1, " #regd " \n\t"\
  102. "psrlq $1, " #regb " \n\t"\
  103. "psubb " #regb ", " #regr " \n\t"\
  104. "psubb " #regd ", " #regp " \n\t"
  105. /***********************************/
  106. /* MMX no rounding */
  107. #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
  108. #define SET_RND MOVQ_WONE
  109. #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
  110. #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
  111. #include "dsputil_mmx_rnd.h"
  112. #undef DEF
  113. #undef SET_RND
  114. #undef PAVGBP
  115. #undef PAVGB
  116. /***********************************/
  117. /* MMX rounding */
  118. #define DEF(x, y) x ## _ ## y ##_mmx
  119. #define SET_RND MOVQ_WTWO
  120. #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
  121. #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
  122. #include "dsputil_mmx_rnd.h"
  123. #undef DEF
  124. #undef SET_RND
  125. #undef PAVGBP
  126. #undef PAVGB
  127. /***********************************/
  128. /* 3Dnow specific */
  129. #define DEF(x) x ## _3dnow
130. /* on Athlons PAVGUSB is preferred */
  131. #define PAVGB "pavgusb"
  132. #include "dsputil_mmx_avg.h"
  133. #undef DEF
  134. #undef PAVGB
  135. /***********************************/
  136. /* MMX2 specific */
  137. #define DEF(x) x ## _mmx2
  138. /* Introduced only in MMX2 set */
  139. #define PAVGB "pavgb"
  140. #include "dsputil_mmx_avg.h"
  141. #undef DEF
  142. #undef PAVGB
  143. /***********************************/
  144. /* standard MMX */
  145. #ifdef CONFIG_ENCODERS
  146. static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
  147. {
  148. asm volatile(
  149. "movl $-128, %%eax \n\t"
  150. "pxor %%mm7, %%mm7 \n\t"
  151. ".balign 16 \n\t"
  152. "1: \n\t"
  153. "movq (%0), %%mm0 \n\t"
  154. "movq (%0, %2), %%mm2 \n\t"
  155. "movq %%mm0, %%mm1 \n\t"
  156. "movq %%mm2, %%mm3 \n\t"
  157. "punpcklbw %%mm7, %%mm0 \n\t"
  158. "punpckhbw %%mm7, %%mm1 \n\t"
  159. "punpcklbw %%mm7, %%mm2 \n\t"
  160. "punpckhbw %%mm7, %%mm3 \n\t"
  161. "movq %%mm0, (%1, %%eax)\n\t"
  162. "movq %%mm1, 8(%1, %%eax)\n\t"
  163. "movq %%mm2, 16(%1, %%eax)\n\t"
  164. "movq %%mm3, 24(%1, %%eax)\n\t"
  165. "addl %3, %0 \n\t"
  166. "addl $32, %%eax \n\t"
  167. "js 1b \n\t"
  168. : "+r" (pixels)
  169. : "r" (block+64), "r" (line_size), "r" (line_size*2)
  170. : "%eax"
  171. );
  172. }
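/* Plain-C equivalent of get_pixels_mmx (reference sketch, assuming the usual
 * 8x8 block of 16-bit DCTELEMs; not part of the original file):
 *   for (i = 0; i < 8; i++)
 *       for (j = 0; j < 8; j++)
 *           block[i*8 + j] = pixels[i*line_size + j];
 */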
  173. static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
  174. {
  175. asm volatile(
  176. "pxor %%mm7, %%mm7 \n\t"
  177. "movl $-128, %%eax \n\t"
  178. ".balign 16 \n\t"
  179. "1: \n\t"
  180. "movq (%0), %%mm0 \n\t"
  181. "movq (%1), %%mm2 \n\t"
  182. "movq %%mm0, %%mm1 \n\t"
  183. "movq %%mm2, %%mm3 \n\t"
  184. "punpcklbw %%mm7, %%mm0 \n\t"
  185. "punpckhbw %%mm7, %%mm1 \n\t"
  186. "punpcklbw %%mm7, %%mm2 \n\t"
  187. "punpckhbw %%mm7, %%mm3 \n\t"
  188. "psubw %%mm2, %%mm0 \n\t"
  189. "psubw %%mm3, %%mm1 \n\t"
  190. "movq %%mm0, (%2, %%eax)\n\t"
  191. "movq %%mm1, 8(%2, %%eax)\n\t"
  192. "addl %3, %0 \n\t"
  193. "addl %3, %1 \n\t"
  194. "addl $16, %%eax \n\t"
  195. "jnz 1b \n\t"
  196. : "+r" (s1), "+r" (s2)
  197. : "r" (block+64), "r" (stride)
  198. : "%eax"
  199. );
  200. }
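/* Plain-C equivalent of diff_pixels_mmx (reference sketch, same 8x8 layout):
 *   for (i = 0; i < 8; i++)
 *       for (j = 0; j < 8; j++)
 *           block[i*8 + j] = s1[i*stride + j] - s2[i*stride + j];
 */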
  201. #endif //CONFIG_ENCODERS
  202. void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
  203. {
  204. const DCTELEM *p;
  205. uint8_t *pix;
  206. /* read the pixels */
  207. p = block;
  208. pix = pixels;
  209. /* unrolled loop */
  210. __asm __volatile(
  211. "movq %3, %%mm0\n\t"
  212. "movq 8%3, %%mm1\n\t"
  213. "movq 16%3, %%mm2\n\t"
  214. "movq 24%3, %%mm3\n\t"
  215. "movq 32%3, %%mm4\n\t"
  216. "movq 40%3, %%mm5\n\t"
  217. "movq 48%3, %%mm6\n\t"
  218. "movq 56%3, %%mm7\n\t"
  219. "packuswb %%mm1, %%mm0\n\t"
  220. "packuswb %%mm3, %%mm2\n\t"
  221. "packuswb %%mm5, %%mm4\n\t"
  222. "packuswb %%mm7, %%mm6\n\t"
  223. "movq %%mm0, (%0)\n\t"
  224. "movq %%mm2, (%0, %1)\n\t"
  225. "movq %%mm4, (%0, %1, 2)\n\t"
  226. "movq %%mm6, (%0, %2)\n\t"
  227. ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
  228. :"memory");
  229. pix += line_size*4;
  230. p += 32;
231. // an exact copy of the code above here would make the compiler
232. // generate some very strange code
233. // thus the pointer is passed through "r" instead of "m"
  234. __asm __volatile(
  235. "movq (%3), %%mm0\n\t"
  236. "movq 8(%3), %%mm1\n\t"
  237. "movq 16(%3), %%mm2\n\t"
  238. "movq 24(%3), %%mm3\n\t"
  239. "movq 32(%3), %%mm4\n\t"
  240. "movq 40(%3), %%mm5\n\t"
  241. "movq 48(%3), %%mm6\n\t"
  242. "movq 56(%3), %%mm7\n\t"
  243. "packuswb %%mm1, %%mm0\n\t"
  244. "packuswb %%mm3, %%mm2\n\t"
  245. "packuswb %%mm5, %%mm4\n\t"
  246. "packuswb %%mm7, %%mm6\n\t"
  247. "movq %%mm0, (%0)\n\t"
  248. "movq %%mm2, (%0, %1)\n\t"
  249. "movq %%mm4, (%0, %1, 2)\n\t"
  250. "movq %%mm6, (%0, %2)\n\t"
  251. ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
  252. :"memory");
  253. }
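/* Behaviour sketch: each 16-bit coefficient is saturated to 0..255 by
 * packuswb and stored, i.e. pixels[i*line_size + j] = clamp(block[i*8 + j])
 * over the whole 8x8 block. */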
  254. void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
  255. {
  256. const DCTELEM *p;
  257. uint8_t *pix;
  258. int i;
  259. /* read the pixels */
  260. p = block;
  261. pix = pixels;
  262. MOVQ_ZERO(mm7);
  263. i = 4;
  264. do {
  265. __asm __volatile(
  266. "movq (%2), %%mm0\n\t"
  267. "movq 8(%2), %%mm1\n\t"
  268. "movq 16(%2), %%mm2\n\t"
  269. "movq 24(%2), %%mm3\n\t"
  270. "movq %0, %%mm4\n\t"
  271. "movq %1, %%mm6\n\t"
  272. "movq %%mm4, %%mm5\n\t"
  273. "punpcklbw %%mm7, %%mm4\n\t"
  274. "punpckhbw %%mm7, %%mm5\n\t"
  275. "paddsw %%mm4, %%mm0\n\t"
  276. "paddsw %%mm5, %%mm1\n\t"
  277. "movq %%mm6, %%mm5\n\t"
  278. "punpcklbw %%mm7, %%mm6\n\t"
  279. "punpckhbw %%mm7, %%mm5\n\t"
  280. "paddsw %%mm6, %%mm2\n\t"
  281. "paddsw %%mm5, %%mm3\n\t"
  282. "packuswb %%mm1, %%mm0\n\t"
  283. "packuswb %%mm3, %%mm2\n\t"
  284. "movq %%mm0, %0\n\t"
  285. "movq %%mm2, %1\n\t"
  286. :"+m"(*pix), "+m"(*(pix+line_size))
  287. :"r"(p)
  288. :"memory");
  289. pix += line_size*2;
  290. p += 16;
  291. } while (--i);
  292. }
  293. static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  294. {
  295. __asm __volatile(
  296. "lea (%3, %3), %%eax \n\t"
  297. ".balign 8 \n\t"
  298. "1: \n\t"
  299. "movq (%1), %%mm0 \n\t"
  300. "movq (%1, %3), %%mm1 \n\t"
  301. "movq %%mm0, (%2) \n\t"
  302. "movq %%mm1, (%2, %3) \n\t"
  303. "addl %%eax, %1 \n\t"
  304. "addl %%eax, %2 \n\t"
  305. "movq (%1), %%mm0 \n\t"
  306. "movq (%1, %3), %%mm1 \n\t"
  307. "movq %%mm0, (%2) \n\t"
  308. "movq %%mm1, (%2, %3) \n\t"
  309. "addl %%eax, %1 \n\t"
  310. "addl %%eax, %2 \n\t"
  311. "subl $4, %0 \n\t"
  312. "jnz 1b \n\t"
  313. : "+g"(h), "+r" (pixels), "+r" (block)
  314. : "r"(line_size)
  315. : "%eax", "memory"
  316. );
  317. }
  318. static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  319. {
  320. __asm __volatile(
  321. "lea (%3, %3), %%eax \n\t"
  322. ".balign 8 \n\t"
  323. "1: \n\t"
  324. "movq (%1), %%mm0 \n\t"
  325. "movq 8(%1), %%mm4 \n\t"
  326. "movq (%1, %3), %%mm1 \n\t"
  327. "movq 8(%1, %3), %%mm5 \n\t"
  328. "movq %%mm0, (%2) \n\t"
  329. "movq %%mm4, 8(%2) \n\t"
  330. "movq %%mm1, (%2, %3) \n\t"
  331. "movq %%mm5, 8(%2, %3) \n\t"
  332. "addl %%eax, %1 \n\t"
  333. "addl %%eax, %2 \n\t"
  334. "movq (%1), %%mm0 \n\t"
  335. "movq 8(%1), %%mm4 \n\t"
  336. "movq (%1, %3), %%mm1 \n\t"
  337. "movq 8(%1, %3), %%mm5 \n\t"
  338. "movq %%mm0, (%2) \n\t"
  339. "movq %%mm4, 8(%2) \n\t"
  340. "movq %%mm1, (%2, %3) \n\t"
  341. "movq %%mm5, 8(%2, %3) \n\t"
  342. "addl %%eax, %1 \n\t"
  343. "addl %%eax, %2 \n\t"
  344. "subl $4, %0 \n\t"
  345. "jnz 1b \n\t"
  346. : "+g"(h), "+r" (pixels), "+r" (block)
  347. : "r"(line_size)
  348. : "%eax", "memory"
  349. );
  350. }
  351. static void clear_blocks_mmx(DCTELEM *blocks)
  352. {
  353. __asm __volatile(
  354. "pxor %%mm7, %%mm7 \n\t"
  355. "movl $-128*6, %%eax \n\t"
  356. "1: \n\t"
  357. "movq %%mm7, (%0, %%eax) \n\t"
  358. "movq %%mm7, 8(%0, %%eax) \n\t"
  359. "movq %%mm7, 16(%0, %%eax) \n\t"
  360. "movq %%mm7, 24(%0, %%eax) \n\t"
  361. "addl $32, %%eax \n\t"
  362. " js 1b \n\t"
  363. : : "r" (((int)blocks)+128*6)
  364. : "%eax"
  365. );
  366. }
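/* Equivalent to memset(blocks, 0, 6*64*sizeof(DCTELEM)): the loop walks the
 * 768 bytes of six 8x8 coefficient blocks, 32 bytes per iteration. */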
  367. #ifdef CONFIG_ENCODERS
  368. static int pix_sum16_mmx(uint8_t * pix, int line_size){
  369. const int h=16;
  370. int sum;
  371. int index= -line_size*h;
  372. __asm __volatile(
  373. "pxor %%mm7, %%mm7 \n\t"
  374. "pxor %%mm6, %%mm6 \n\t"
  375. "1: \n\t"
  376. "movq (%2, %1), %%mm0 \n\t"
  377. "movq (%2, %1), %%mm1 \n\t"
  378. "movq 8(%2, %1), %%mm2 \n\t"
  379. "movq 8(%2, %1), %%mm3 \n\t"
  380. "punpcklbw %%mm7, %%mm0 \n\t"
  381. "punpckhbw %%mm7, %%mm1 \n\t"
  382. "punpcklbw %%mm7, %%mm2 \n\t"
  383. "punpckhbw %%mm7, %%mm3 \n\t"
  384. "paddw %%mm0, %%mm1 \n\t"
  385. "paddw %%mm2, %%mm3 \n\t"
  386. "paddw %%mm1, %%mm3 \n\t"
  387. "paddw %%mm3, %%mm6 \n\t"
  388. "addl %3, %1 \n\t"
  389. " js 1b \n\t"
  390. "movq %%mm6, %%mm5 \n\t"
  391. "psrlq $32, %%mm6 \n\t"
  392. "paddw %%mm5, %%mm6 \n\t"
  393. "movq %%mm6, %%mm5 \n\t"
  394. "psrlq $16, %%mm6 \n\t"
  395. "paddw %%mm5, %%mm6 \n\t"
  396. "movd %%mm6, %0 \n\t"
  397. "andl $0xFFFF, %0 \n\t"
  398. : "=&r" (sum), "+r" (index)
  399. : "r" (pix - index), "r" (line_size)
  400. );
  401. return sum;
  402. }
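/* pix_sum16_mmx returns the sum of all 256 pixels of a 16x16 block, i.e. the
 * plain-C sketch:
 *   for (y = 0; y < 16; y++) for (x = 0; x < 16; x++) sum += pix[y*line_size + x];
 * The 16-bit word accumulators cannot overflow since 256 * 255 < 65536. */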
  403. #endif //CONFIG_ENCODERS
  404. static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
  405. int i=0;
  406. asm volatile(
  407. "1: \n\t"
  408. "movq (%1, %0), %%mm0 \n\t"
  409. "movq (%2, %0), %%mm1 \n\t"
  410. "paddb %%mm0, %%mm1 \n\t"
  411. "movq %%mm1, (%2, %0) \n\t"
  412. "movq 8(%1, %0), %%mm0 \n\t"
  413. "movq 8(%2, %0), %%mm1 \n\t"
  414. "paddb %%mm0, %%mm1 \n\t"
  415. "movq %%mm1, 8(%2, %0) \n\t"
  416. "addl $16, %0 \n\t"
  417. "cmpl %3, %0 \n\t"
  418. " jb 1b \n\t"
  419. : "+r" (i)
  420. : "r"(src), "r"(dst), "r"(w-15)
  421. );
  422. for(; i<w; i++)
  423. dst[i+0] += src[i+0];
  424. }
  425. #define H263_LOOP_FILTER \
  426. "pxor %%mm7, %%mm7 \n\t"\
  427. "movq %0, %%mm0 \n\t"\
  428. "movq %0, %%mm1 \n\t"\
  429. "movq %3, %%mm2 \n\t"\
  430. "movq %3, %%mm3 \n\t"\
  431. "punpcklbw %%mm7, %%mm0 \n\t"\
  432. "punpckhbw %%mm7, %%mm1 \n\t"\
  433. "punpcklbw %%mm7, %%mm2 \n\t"\
  434. "punpckhbw %%mm7, %%mm3 \n\t"\
  435. "psubw %%mm2, %%mm0 \n\t"\
  436. "psubw %%mm3, %%mm1 \n\t"\
  437. "movq %1, %%mm2 \n\t"\
  438. "movq %1, %%mm3 \n\t"\
  439. "movq %2, %%mm4 \n\t"\
  440. "movq %2, %%mm5 \n\t"\
  441. "punpcklbw %%mm7, %%mm2 \n\t"\
  442. "punpckhbw %%mm7, %%mm3 \n\t"\
  443. "punpcklbw %%mm7, %%mm4 \n\t"\
  444. "punpckhbw %%mm7, %%mm5 \n\t"\
  445. "psubw %%mm2, %%mm4 \n\t"\
  446. "psubw %%mm3, %%mm5 \n\t"\
  447. "psllw $2, %%mm4 \n\t"\
  448. "psllw $2, %%mm5 \n\t"\
  449. "paddw %%mm0, %%mm4 \n\t"\
  450. "paddw %%mm1, %%mm5 \n\t"\
  451. "pxor %%mm6, %%mm6 \n\t"\
  452. "pcmpgtw %%mm4, %%mm6 \n\t"\
  453. "pcmpgtw %%mm5, %%mm7 \n\t"\
  454. "pxor %%mm6, %%mm4 \n\t"\
  455. "pxor %%mm7, %%mm5 \n\t"\
  456. "psubw %%mm6, %%mm4 \n\t"\
  457. "psubw %%mm7, %%mm5 \n\t"\
  458. "psrlw $3, %%mm4 \n\t"\
  459. "psrlw $3, %%mm5 \n\t"\
  460. "packuswb %%mm5, %%mm4 \n\t"\
  461. "packsswb %%mm7, %%mm6 \n\t"\
  462. "pxor %%mm7, %%mm7 \n\t"\
  463. "movd %4, %%mm2 \n\t"\
  464. "punpcklbw %%mm2, %%mm2 \n\t"\
  465. "punpcklbw %%mm2, %%mm2 \n\t"\
  466. "punpcklbw %%mm2, %%mm2 \n\t"\
  467. "psubusb %%mm4, %%mm2 \n\t"\
  468. "movq %%mm2, %%mm3 \n\t"\
  469. "psubusb %%mm4, %%mm3 \n\t"\
  470. "psubb %%mm3, %%mm2 \n\t"\
  471. "movq %1, %%mm3 \n\t"\
  472. "movq %2, %%mm4 \n\t"\
  473. "pxor %%mm6, %%mm3 \n\t"\
  474. "pxor %%mm6, %%mm4 \n\t"\
  475. "paddusb %%mm2, %%mm3 \n\t"\
  476. "psubusb %%mm2, %%mm4 \n\t"\
  477. "pxor %%mm6, %%mm3 \n\t"\
  478. "pxor %%mm6, %%mm4 \n\t"\
  479. "paddusb %%mm2, %%mm2 \n\t"\
  480. "packsswb %%mm1, %%mm0 \n\t"\
  481. "pcmpgtb %%mm0, %%mm7 \n\t"\
  482. "pxor %%mm7, %%mm0 \n\t"\
  483. "psubb %%mm7, %%mm0 \n\t"\
  484. "movq %%mm0, %%mm1 \n\t"\
  485. "psubusb %%mm2, %%mm0 \n\t"\
  486. "psubb %%mm0, %%mm1 \n\t"\
  487. "pand %5, %%mm1 \n\t"\
  488. "psrlw $2, %%mm1 \n\t"\
  489. "pxor %%mm7, %%mm1 \n\t"\
  490. "psubb %%mm7, %%mm1 \n\t"\
  491. "movq %0, %%mm5 \n\t"\
  492. "movq %3, %%mm6 \n\t"\
  493. "psubb %%mm1, %%mm5 \n\t"\
  494. "paddb %%mm1, %%mm6 \n\t"
  495. static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
  496. const int strength= ff_h263_loop_filter_strength[qscale];
  497. asm volatile(
  498. H263_LOOP_FILTER
  499. "movq %%mm3, %1 \n\t"
  500. "movq %%mm4, %2 \n\t"
  501. "movq %%mm5, %0 \n\t"
  502. "movq %%mm6, %3 \n\t"
  503. : "+m" (*(uint64_t*)(src - 2*stride)),
  504. "+m" (*(uint64_t*)(src - 1*stride)),
  505. "+m" (*(uint64_t*)(src + 0*stride)),
  506. "+m" (*(uint64_t*)(src + 1*stride))
  507. : "g" (2*strength), "m"(ff_pb_FC)
  508. );
  509. }
  510. static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
  511. asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
  512. "movd %4, %%mm0 \n\t"
  513. "movd %5, %%mm1 \n\t"
  514. "movd %6, %%mm2 \n\t"
  515. "movd %7, %%mm3 \n\t"
  516. "punpcklbw %%mm1, %%mm0 \n\t"
  517. "punpcklbw %%mm3, %%mm2 \n\t"
  518. "movq %%mm0, %%mm1 \n\t"
  519. "punpcklwd %%mm2, %%mm0 \n\t"
  520. "punpckhwd %%mm2, %%mm1 \n\t"
  521. "movd %%mm0, %0 \n\t"
  522. "punpckhdq %%mm0, %%mm0 \n\t"
  523. "movd %%mm0, %1 \n\t"
  524. "movd %%mm1, %2 \n\t"
  525. "punpckhdq %%mm1, %%mm1 \n\t"
  526. "movd %%mm1, %3 \n\t"
  527. : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
  528. "=m" (*(uint32_t*)(dst + 1*dst_stride)),
  529. "=m" (*(uint32_t*)(dst + 2*dst_stride)),
  530. "=m" (*(uint32_t*)(dst + 3*dst_stride))
  531. : "m" (*(uint32_t*)(src + 0*src_stride)),
  532. "m" (*(uint32_t*)(src + 1*src_stride)),
  533. "m" (*(uint32_t*)(src + 2*src_stride)),
  534. "m" (*(uint32_t*)(src + 3*src_stride))
  535. );
  536. }
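/* transpose4x4 writes dst[j*dst_stride + i] = src[i*src_stride + j] for a 4x4
 * block of bytes; it lets the horizontal H.263 loop filter below reuse the
 * vertical filter core on transposed data. */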
  537. static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
  538. const int strength= ff_h263_loop_filter_strength[qscale];
  539. uint64_t temp[4] __attribute__ ((aligned(8)));
  540. uint8_t *btemp= (uint8_t*)temp;
  541. src -= 2;
  542. transpose4x4(btemp , src , 8, stride);
  543. transpose4x4(btemp+4, src + 4*stride, 8, stride);
  544. asm volatile(
  545. H263_LOOP_FILTER // 5 3 4 6
  546. : "+m" (temp[0]),
  547. "+m" (temp[1]),
  548. "+m" (temp[2]),
  549. "+m" (temp[3])
  550. : "g" (2*strength), "m"(ff_pb_FC)
  551. );
  552. asm volatile(
  553. "movq %%mm5, %%mm1 \n\t"
  554. "movq %%mm4, %%mm0 \n\t"
  555. "punpcklbw %%mm3, %%mm5 \n\t"
  556. "punpcklbw %%mm6, %%mm4 \n\t"
  557. "punpckhbw %%mm3, %%mm1 \n\t"
  558. "punpckhbw %%mm6, %%mm0 \n\t"
  559. "movq %%mm5, %%mm3 \n\t"
  560. "movq %%mm1, %%mm6 \n\t"
  561. "punpcklwd %%mm4, %%mm5 \n\t"
  562. "punpcklwd %%mm0, %%mm1 \n\t"
  563. "punpckhwd %%mm4, %%mm3 \n\t"
  564. "punpckhwd %%mm0, %%mm6 \n\t"
  565. "movd %%mm5, %0 \n\t"
  566. "punpckhdq %%mm5, %%mm5 \n\t"
  567. "movd %%mm5, %1 \n\t"
  568. "movd %%mm3, %2 \n\t"
  569. "punpckhdq %%mm3, %%mm3 \n\t"
  570. "movd %%mm3, %3 \n\t"
  571. "movd %%mm1, %4 \n\t"
  572. "punpckhdq %%mm1, %%mm1 \n\t"
  573. "movd %%mm1, %5 \n\t"
  574. "movd %%mm6, %6 \n\t"
  575. "punpckhdq %%mm6, %%mm6 \n\t"
  576. "movd %%mm6, %7 \n\t"
  577. : "=m" (*(uint32_t*)(src + 0*stride)),
  578. "=m" (*(uint32_t*)(src + 1*stride)),
  579. "=m" (*(uint32_t*)(src + 2*stride)),
  580. "=m" (*(uint32_t*)(src + 3*stride)),
  581. "=m" (*(uint32_t*)(src + 4*stride)),
  582. "=m" (*(uint32_t*)(src + 5*stride)),
  583. "=m" (*(uint32_t*)(src + 6*stride)),
  584. "=m" (*(uint32_t*)(src + 7*stride))
  585. );
  586. }
  587. #ifdef CONFIG_ENCODERS
  588. static int pix_norm1_mmx(uint8_t *pix, int line_size) {
  589. int tmp;
  590. asm volatile (
  591. "movl $16,%%ecx\n"
  592. "pxor %%mm0,%%mm0\n"
  593. "pxor %%mm7,%%mm7\n"
  594. "1:\n"
  595. "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
  596. "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
  597. "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
  598. "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
  599. "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
  600. "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
  601. "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
  602. "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
  603. "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
  604. "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
  605. "pmaddwd %%mm3,%%mm3\n"
  606. "pmaddwd %%mm4,%%mm4\n"
  607. "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
  608. pix2^2+pix3^2+pix6^2+pix7^2) */
  609. "paddd %%mm3,%%mm4\n"
  610. "paddd %%mm2,%%mm7\n"
  611. "addl %2, %0\n"
  612. "paddd %%mm4,%%mm7\n"
  613. "dec %%ecx\n"
  614. "jnz 1b\n"
  615. "movq %%mm7,%%mm1\n"
  616. "psrlq $32, %%mm7\n" /* shift hi dword to lo */
  617. "paddd %%mm7,%%mm1\n"
  618. "movd %%mm1,%1\n"
  619. : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
  620. return tmp;
  621. }
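/* pix_norm1_mmx returns the sum of squares of a 16x16 block's pixels, i.e.
 * sum += pix[y*line_size + x] * pix[y*line_size + x] over all 256 samples;
 * the 32-bit dword accumulation in mm7 keeps the result exact. */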
  622. static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  623. int tmp;
  624. asm volatile (
  625. "movl %4,%%ecx\n"
  626. "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
  627. "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
  628. "1:\n"
  629. "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
  630. "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
  631. "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
  632. "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
  633. /* todo: mm1-mm2, mm3-mm4 */
634. /* algo: subtract mm1 from mm2 with saturation and vice versa */
  635. /* OR the results to get absolute difference */
  636. "movq %%mm1,%%mm5\n"
  637. "movq %%mm3,%%mm6\n"
  638. "psubusb %%mm2,%%mm1\n"
  639. "psubusb %%mm4,%%mm3\n"
  640. "psubusb %%mm5,%%mm2\n"
  641. "psubusb %%mm6,%%mm4\n"
  642. "por %%mm1,%%mm2\n"
  643. "por %%mm3,%%mm4\n"
  644. /* now convert to 16-bit vectors so we can square them */
  645. "movq %%mm2,%%mm1\n"
  646. "movq %%mm4,%%mm3\n"
  647. "punpckhbw %%mm0,%%mm2\n"
  648. "punpckhbw %%mm0,%%mm4\n"
  649. "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
  650. "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
  651. "pmaddwd %%mm2,%%mm2\n"
  652. "pmaddwd %%mm4,%%mm4\n"
  653. "pmaddwd %%mm1,%%mm1\n"
  654. "pmaddwd %%mm3,%%mm3\n"
  655. "addl %3,%0\n"
  656. "addl %3,%1\n"
  657. "paddd %%mm2,%%mm1\n"
  658. "paddd %%mm4,%%mm3\n"
  659. "paddd %%mm1,%%mm7\n"
  660. "paddd %%mm3,%%mm7\n"
  661. "decl %%ecx\n"
  662. "jnz 1b\n"
  663. "movq %%mm7,%%mm1\n"
  664. "psrlq $32, %%mm7\n" /* shift hi dword to lo */
  665. "paddd %%mm7,%%mm1\n"
  666. "movd %%mm1,%2\n"
  667. : "+r" (pix1), "+r" (pix2), "=r"(tmp)
  668. : "r" (line_size) , "m" (h)
  669. : "%ecx");
  670. return tmp;
  671. }
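/* Plain-C equivalent of sse16_mmx (reference sketch): for each of the h rows,
 *   for (x = 0; x < 16; x++) { int d = pix1[x] - pix2[x]; sum += d * d; }
 * The absolute difference is built from two saturated subtractions ORed
 * together, then squared and accumulated with pmaddwd. */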
  672. static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
  673. int i=0;
  674. asm volatile(
  675. "1: \n\t"
  676. "movq (%2, %0), %%mm0 \n\t"
  677. "movq (%1, %0), %%mm1 \n\t"
  678. "psubb %%mm0, %%mm1 \n\t"
  679. "movq %%mm1, (%3, %0) \n\t"
  680. "movq 8(%2, %0), %%mm0 \n\t"
  681. "movq 8(%1, %0), %%mm1 \n\t"
  682. "psubb %%mm0, %%mm1 \n\t"
  683. "movq %%mm1, 8(%3, %0) \n\t"
  684. "addl $16, %0 \n\t"
  685. "cmpl %4, %0 \n\t"
  686. " jb 1b \n\t"
  687. : "+r" (i)
  688. : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
  689. );
  690. for(; i<w; i++)
  691. dst[i+0] = src1[i+0]-src2[i+0];
  692. }
  693. static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
  694. int i=0;
  695. uint8_t l, lt;
  696. asm volatile(
  697. "1: \n\t"
  698. "movq -1(%1, %0), %%mm0 \n\t" // LT
  699. "movq (%1, %0), %%mm1 \n\t" // T
  700. "movq -1(%2, %0), %%mm2 \n\t" // L
  701. "movq (%2, %0), %%mm3 \n\t" // X
  702. "movq %%mm2, %%mm4 \n\t" // L
  703. "psubb %%mm0, %%mm2 \n\t"
  704. "paddb %%mm1, %%mm2 \n\t" // L + T - LT
  705. "movq %%mm4, %%mm5 \n\t" // L
  706. "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
  707. "pminub %%mm5, %%mm1 \n\t" // min(T, L)
  708. "pminub %%mm2, %%mm4 \n\t"
  709. "pmaxub %%mm1, %%mm4 \n\t"
  710. "psubb %%mm4, %%mm3 \n\t" // dst - pred
  711. "movq %%mm3, (%3, %0) \n\t"
  712. "addl $8, %0 \n\t"
  713. "cmpl %4, %0 \n\t"
  714. " jb 1b \n\t"
  715. : "+r" (i)
  716. : "r"(src1), "r"(src2), "r"(dst), "r"(w)
  717. );
  718. l= *left;
  719. lt= *left_top;
  720. dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
  721. *left_top= src1[w-1];
  722. *left = src2[w-1];
  723. }
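/* The predictor used above is the HuffYUV median predictor:
 *   pred = median(L, T, L + T - LT)
 * computed branch-free with pmaxub/pminub as
 *   pred = max(min(L, T), min(max(L, T), L + T - LT));
 * and dst[i] = src2[i] - pred. The first sample is handled by the C tail
 * because it needs *left / *left_top from the previous call. */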
  724. #define LBUTTERFLY2(a1,b1,a2,b2)\
  725. "paddw " #b1 ", " #a1 " \n\t"\
  726. "paddw " #b2 ", " #a2 " \n\t"\
  727. "paddw " #b1 ", " #b1 " \n\t"\
  728. "paddw " #b2 ", " #b2 " \n\t"\
  729. "psubw " #a1 ", " #b1 " \n\t"\
  730. "psubw " #a2 ", " #b2 " \n\t"
  731. #define HADAMARD48\
  732. LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
  733. LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
  734. LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
  735. LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
  736. LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
737. LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)
  738. #define MMABS(a,z)\
  739. "pxor " #z ", " #z " \n\t"\
  740. "pcmpgtw " #a ", " #z " \n\t"\
  741. "pxor " #z ", " #a " \n\t"\
  742. "psubw " #z ", " #a " \n\t"
  743. #define MMABS_SUM(a,z, sum)\
  744. "pxor " #z ", " #z " \n\t"\
  745. "pcmpgtw " #a ", " #z " \n\t"\
  746. "pxor " #z ", " #a " \n\t"\
  747. "psubw " #z ", " #a " \n\t"\
  748. "paddusw " #a ", " #sum " \n\t"
  749. #define MMABS_MMX2(a,z)\
  750. "pxor " #z ", " #z " \n\t"\
  751. "psubw " #a ", " #z " \n\t"\
  752. "pmaxsw " #z ", " #a " \n\t"
  753. #define MMABS_SUM_MMX2(a,z, sum)\
  754. "pxor " #z ", " #z " \n\t"\
  755. "psubw " #a ", " #z " \n\t"\
  756. "pmaxsw " #z ", " #a " \n\t"\
  757. "paddusw " #a ", " #sum " \n\t"
  758. #define SBUTTERFLY(a,b,t,n)\
  759. "movq " #a ", " #t " \n\t" /* abcd */\
  760. "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
  761. "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
  762. #define TRANSPOSE4(a,b,c,d,t)\
  763. SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
  764. SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
  765. SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
  766. SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
  767. #define LOAD4(o, a, b, c, d)\
  768. "movq "#o"(%1), " #a " \n\t"\
  769. "movq "#o"+16(%1), " #b " \n\t"\
  770. "movq "#o"+32(%1), " #c " \n\t"\
  771. "movq "#o"+48(%1), " #d " \n\t"
  772. #define STORE4(o, a, b, c, d)\
  773. "movq "#a", "#o"(%1) \n\t"\
  774. "movq "#b", "#o"+16(%1) \n\t"\
  775. "movq "#c", "#o"+32(%1) \n\t"\
  776. "movq "#d", "#o"+48(%1) \n\t"\
  777. static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
  778. uint64_t temp[16] __align8;
  779. int sum=0;
  780. assert(h==8);
  781. diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
  782. asm volatile(
  783. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  784. LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
  785. HADAMARD48
  786. "movq %%mm7, 112(%1) \n\t"
  787. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  788. STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
  789. "movq 112(%1), %%mm7 \n\t"
  790. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  791. STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
  792. LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
  793. LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  794. HADAMARD48
  795. "movq %%mm7, 120(%1) \n\t"
  796. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  797. STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
  798. "movq 120(%1), %%mm7 \n\t"
  799. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  800. "movq %%mm7, %%mm5 \n\t"//FIXME remove
  801. "movq %%mm6, %%mm7 \n\t"
  802. "movq %%mm0, %%mm6 \n\t"
  803. // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
  804. LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
  805. // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  806. HADAMARD48
  807. "movq %%mm7, 64(%1) \n\t"
  808. MMABS(%%mm0, %%mm7)
  809. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  810. MMABS_SUM(%%mm2, %%mm7, %%mm0)
  811. MMABS_SUM(%%mm3, %%mm7, %%mm0)
  812. MMABS_SUM(%%mm4, %%mm7, %%mm0)
  813. MMABS_SUM(%%mm5, %%mm7, %%mm0)
  814. MMABS_SUM(%%mm6, %%mm7, %%mm0)
  815. "movq 64(%1), %%mm1 \n\t"
  816. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  817. "movq %%mm0, 64(%1) \n\t"
  818. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  819. LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
  820. HADAMARD48
  821. "movq %%mm7, (%1) \n\t"
  822. MMABS(%%mm0, %%mm7)
  823. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  824. MMABS_SUM(%%mm2, %%mm7, %%mm0)
  825. MMABS_SUM(%%mm3, %%mm7, %%mm0)
  826. MMABS_SUM(%%mm4, %%mm7, %%mm0)
  827. MMABS_SUM(%%mm5, %%mm7, %%mm0)
  828. MMABS_SUM(%%mm6, %%mm7, %%mm0)
  829. "movq (%1), %%mm1 \n\t"
  830. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  831. "movq 64(%1), %%mm1 \n\t"
  832. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  833. "movq %%mm0, %%mm1 \n\t"
  834. "psrlq $32, %%mm0 \n\t"
  835. "paddusw %%mm1, %%mm0 \n\t"
  836. "movq %%mm0, %%mm1 \n\t"
  837. "psrlq $16, %%mm0 \n\t"
  838. "paddusw %%mm1, %%mm0 \n\t"
  839. "movd %%mm0, %0 \n\t"
  840. : "=r" (sum)
  841. : "r"(temp)
  842. );
  843. return sum&0xFFFF;
  844. }
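/* hadamard8_diff computes a SATD-style metric: the 8x8 block of pixel
 * differences is transformed with an 8x8 Hadamard transform (HADAMARD48 plus
 * the TRANSPOSE4 passes) and the absolute values of the transform
 * coefficients are summed. The _mmx2 variant below is identical except that
 * it uses pmaxsw-based MMABS_MMX2/MMABS_SUM_MMX2. */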
  845. static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
  846. uint64_t temp[16] __align8;
  847. int sum=0;
  848. assert(h==8);
  849. diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
  850. asm volatile(
  851. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  852. LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
  853. HADAMARD48
  854. "movq %%mm7, 112(%1) \n\t"
  855. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  856. STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
  857. "movq 112(%1), %%mm7 \n\t"
  858. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  859. STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
  860. LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
  861. LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  862. HADAMARD48
  863. "movq %%mm7, 120(%1) \n\t"
  864. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  865. STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
  866. "movq 120(%1), %%mm7 \n\t"
  867. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  868. "movq %%mm7, %%mm5 \n\t"//FIXME remove
  869. "movq %%mm6, %%mm7 \n\t"
  870. "movq %%mm0, %%mm6 \n\t"
  871. // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
  872. LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
  873. // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  874. HADAMARD48
  875. "movq %%mm7, 64(%1) \n\t"
  876. MMABS_MMX2(%%mm0, %%mm7)
  877. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  878. MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
  879. MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
  880. MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
  881. MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
  882. MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
  883. "movq 64(%1), %%mm1 \n\t"
  884. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  885. "movq %%mm0, 64(%1) \n\t"
  886. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  887. LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
  888. HADAMARD48
  889. "movq %%mm7, (%1) \n\t"
  890. MMABS_MMX2(%%mm0, %%mm7)
  891. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  892. MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
  893. MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
  894. MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
  895. MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
  896. MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
  897. "movq (%1), %%mm1 \n\t"
  898. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  899. "movq 64(%1), %%mm1 \n\t"
  900. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  901. "movq %%mm0, %%mm1 \n\t"
  902. "psrlq $32, %%mm0 \n\t"
  903. "paddusw %%mm1, %%mm0 \n\t"
  904. "movq %%mm0, %%mm1 \n\t"
  905. "psrlq $16, %%mm0 \n\t"
  906. "paddusw %%mm1, %%mm0 \n\t"
  907. "movd %%mm0, %0 \n\t"
  908. : "=r" (sum)
  909. : "r"(temp)
  910. );
  911. return sum&0xFFFF;
  912. }
  913. WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
  914. WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
  915. #endif //CONFIG_ENCODERS
  916. #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
  917. #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
  918. #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
  919. "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
  920. "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
  921. "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
  922. "movq "#in7", " #m3 " \n\t" /* d */\
  923. "movq "#in0", %%mm5 \n\t" /* D */\
  924. "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
  925. "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
  926. "movq "#in1", %%mm5 \n\t" /* C */\
  927. "movq "#in2", %%mm6 \n\t" /* B */\
  928. "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
  929. "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
  930. "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
  931. "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
  932. "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
  933. "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
  934. "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
  935. "psraw $5, %%mm5 \n\t"\
  936. "packuswb %%mm5, %%mm5 \n\t"\
  937. OP(%%mm5, out, %%mm7, d)
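// QPEL_V_LOW evaluates one column step of the MPEG-4 quarter-pel filter:
//   out = clamp((20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5)
// where x1..x4 are sums of symmetric tap pairs around the output sample --
// the same formula the plain-C 3dnow fallbacks below spell out per pixel.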
  938. #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
  939. static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  940. uint64_t temp;\
  941. \
  942. asm volatile(\
  943. "pxor %%mm7, %%mm7 \n\t"\
  944. "1: \n\t"\
  945. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  946. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  947. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  948. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  949. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  950. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  951. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  952. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  953. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  954. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  955. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  956. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  957. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  958. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  959. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  960. "paddw %%mm3, %%mm5 \n\t" /* b */\
  961. "paddw %%mm2, %%mm6 \n\t" /* c */\
  962. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  963. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  964. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  965. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  966. "paddw %%mm4, %%mm0 \n\t" /* a */\
  967. "paddw %%mm1, %%mm5 \n\t" /* d */\
  968. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  969. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  970. "paddw %6, %%mm6 \n\t"\
  971. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  972. "psraw $5, %%mm0 \n\t"\
  973. "movq %%mm0, %5 \n\t"\
  974. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  975. \
  976. "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
  977. "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
  978. "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
  979. "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
  980. "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
  981. "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
  982. "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
  983. "paddw %%mm0, %%mm2 \n\t" /* b */\
  984. "paddw %%mm5, %%mm3 \n\t" /* c */\
  985. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  986. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  987. "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
  988. "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
  989. "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
  990. "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
  991. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  992. "paddw %%mm2, %%mm1 \n\t" /* a */\
  993. "paddw %%mm6, %%mm4 \n\t" /* d */\
  994. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  995. "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
  996. "paddw %6, %%mm1 \n\t"\
  997. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
  998. "psraw $5, %%mm3 \n\t"\
  999. "movq %5, %%mm1 \n\t"\
  1000. "packuswb %%mm3, %%mm1 \n\t"\
  1001. OP_MMX2(%%mm1, (%1),%%mm4, q)\
  1002. /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
  1003. \
  1004. "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
  1005. "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
  1006. "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
  1007. "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
  1008. "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
  1009. "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
  1010. "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
  1011. "paddw %%mm1, %%mm5 \n\t" /* b */\
  1012. "paddw %%mm4, %%mm0 \n\t" /* c */\
  1013. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  1014. "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
  1015. "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
  1016. "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
  1017. "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
  1018. "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
  1019. "paddw %%mm3, %%mm2 \n\t" /* d */\
  1020. "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
  1021. "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
  1022. "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
  1023. "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
  1024. "paddw %%mm2, %%mm6 \n\t" /* a */\
  1025. "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
  1026. "paddw %6, %%mm0 \n\t"\
  1027. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  1028. "psraw $5, %%mm0 \n\t"\
  1029. /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
  1030. \
  1031. "paddw %%mm5, %%mm3 \n\t" /* a */\
  1032. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
  1033. "paddw %%mm4, %%mm6 \n\t" /* b */\
  1034. "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
  1035. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
  1036. "paddw %%mm1, %%mm4 \n\t" /* c */\
  1037. "paddw %%mm2, %%mm5 \n\t" /* d */\
  1038. "paddw %%mm6, %%mm6 \n\t" /* 2b */\
  1039. "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
  1040. "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
  1041. "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
  1042. "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
  1043. "paddw %6, %%mm4 \n\t"\
  1044. "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
  1045. "psraw $5, %%mm4 \n\t"\
  1046. "packuswb %%mm4, %%mm0 \n\t"\
  1047. OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
  1048. \
  1049. "addl %3, %0 \n\t"\
  1050. "addl %4, %1 \n\t"\
  1051. "decl %2 \n\t"\
  1052. " jnz 1b \n\t"\
  1053. : "+a"(src), "+c"(dst), "+m"(h)\
  1054. : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  1055. : "memory"\
  1056. );\
  1057. }\
  1058. \
  1059. static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1060. int i;\
  1061. int16_t temp[16];\
  1062. /* quick HACK, XXX FIXME MUST be optimized */\
  1063. for(i=0; i<h; i++)\
  1064. {\
  1065. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  1066. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  1067. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  1068. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  1069. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  1070. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
  1071. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
  1072. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
  1073. temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
  1074. temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
  1075. temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
  1076. temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
  1077. temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
  1078. temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
  1079. temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
  1080. temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
  1081. asm volatile(\
  1082. "movq (%0), %%mm0 \n\t"\
  1083. "movq 8(%0), %%mm1 \n\t"\
  1084. "paddw %2, %%mm0 \n\t"\
  1085. "paddw %2, %%mm1 \n\t"\
  1086. "psraw $5, %%mm0 \n\t"\
  1087. "psraw $5, %%mm1 \n\t"\
  1088. "packuswb %%mm1, %%mm0 \n\t"\
  1089. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  1090. "movq 16(%0), %%mm0 \n\t"\
  1091. "movq 24(%0), %%mm1 \n\t"\
  1092. "paddw %2, %%mm0 \n\t"\
  1093. "paddw %2, %%mm1 \n\t"\
  1094. "psraw $5, %%mm0 \n\t"\
  1095. "psraw $5, %%mm1 \n\t"\
  1096. "packuswb %%mm1, %%mm0 \n\t"\
  1097. OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
  1098. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  1099. : "memory"\
  1100. );\
  1101. dst+=dstStride;\
  1102. src+=srcStride;\
  1103. }\
  1104. }\
  1105. \
  1106. static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1107. uint64_t temp;\
  1108. \
  1109. asm volatile(\
  1110. "pxor %%mm7, %%mm7 \n\t"\
  1111. "1: \n\t"\
  1112. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  1113. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  1114. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  1115. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  1116. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  1117. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  1118. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  1119. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  1120. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  1121. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  1122. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  1123. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  1124. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  1125. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  1126. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  1127. "paddw %%mm3, %%mm5 \n\t" /* b */\
  1128. "paddw %%mm2, %%mm6 \n\t" /* c */\
  1129. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  1130. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  1131. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  1132. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  1133. "paddw %%mm4, %%mm0 \n\t" /* a */\
  1134. "paddw %%mm1, %%mm5 \n\t" /* d */\
  1135. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  1136. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  1137. "paddw %6, %%mm6 \n\t"\
  1138. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  1139. "psraw $5, %%mm0 \n\t"\
  1140. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  1141. \
  1142. "movd 5(%0), %%mm5 \n\t" /* FGHI */\
  1143. "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
  1144. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
  1145. "paddw %%mm5, %%mm1 \n\t" /* a */\
  1146. "paddw %%mm6, %%mm2 \n\t" /* b */\
  1147. "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
  1148. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
  1149. "paddw %%mm6, %%mm3 \n\t" /* c */\
  1150. "paddw %%mm5, %%mm4 \n\t" /* d */\
  1151. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  1152. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  1153. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  1154. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  1155. "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
  1156. "paddw %6, %%mm1 \n\t"\
  1157. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
  1158. "psraw $5, %%mm3 \n\t"\
  1159. "packuswb %%mm3, %%mm0 \n\t"\
  1160. OP_MMX2(%%mm0, (%1), %%mm4, q)\
  1161. \
  1162. "addl %3, %0 \n\t"\
  1163. "addl %4, %1 \n\t"\
  1164. "decl %2 \n\t"\
  1165. " jnz 1b \n\t"\
  1166. : "+a"(src), "+c"(dst), "+m"(h)\
  1167. : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  1168. : "memory"\
  1169. );\
  1170. }\
  1171. \
  1172. static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1173. int i;\
  1174. int16_t temp[8];\
  1175. /* quick HACK, XXX FIXME MUST be optimized */\
  1176. for(i=0; i<h; i++)\
  1177. {\
  1178. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  1179. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  1180. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  1181. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  1182. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  1183. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
  1184. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
  1185. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
  1186. asm volatile(\
  1187. "movq (%0), %%mm0 \n\t"\
  1188. "movq 8(%0), %%mm1 \n\t"\
  1189. "paddw %2, %%mm0 \n\t"\
  1190. "paddw %2, %%mm1 \n\t"\
  1191. "psraw $5, %%mm0 \n\t"\
  1192. "psraw $5, %%mm1 \n\t"\
  1193. "packuswb %%mm1, %%mm0 \n\t"\
  1194. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  1195. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  1196. :"memory"\
  1197. );\
  1198. dst+=dstStride;\
  1199. src+=srcStride;\
  1200. }\
  1201. }
  1202. #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
  1203. \
  1204. static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  1205. uint64_t temp[17*4];\
  1206. uint64_t *temp_ptr= temp;\
  1207. int count= 17;\
  1208. \
  1209. /*FIXME unroll */\
  1210. asm volatile(\
  1211. "pxor %%mm7, %%mm7 \n\t"\
  1212. "1: \n\t"\
  1213. "movq (%0), %%mm0 \n\t"\
  1214. "movq (%0), %%mm1 \n\t"\
  1215. "movq 8(%0), %%mm2 \n\t"\
  1216. "movq 8(%0), %%mm3 \n\t"\
  1217. "punpcklbw %%mm7, %%mm0 \n\t"\
  1218. "punpckhbw %%mm7, %%mm1 \n\t"\
  1219. "punpcklbw %%mm7, %%mm2 \n\t"\
  1220. "punpckhbw %%mm7, %%mm3 \n\t"\
  1221. "movq %%mm0, (%1) \n\t"\
  1222. "movq %%mm1, 17*8(%1) \n\t"\
  1223. "movq %%mm2, 2*17*8(%1) \n\t"\
  1224. "movq %%mm3, 3*17*8(%1) \n\t"\
  1225. "addl $8, %1 \n\t"\
  1226. "addl %3, %0 \n\t"\
  1227. "decl %2 \n\t"\
  1228. " jnz 1b \n\t"\
  1229. : "+r" (src), "+r" (temp_ptr), "+r"(count)\
  1230. : "r" (srcStride)\
  1231. : "memory"\
  1232. );\
  1233. \
  1234. temp_ptr= temp;\
  1235. count=4;\
  1236. \
  1237. /*FIXME reorder for speed */\
  1238. asm volatile(\
  1239. /*"pxor %%mm7, %%mm7 \n\t"*/\
  1240. "1: \n\t"\
  1241. "movq (%0), %%mm0 \n\t"\
  1242. "movq 8(%0), %%mm1 \n\t"\
  1243. "movq 16(%0), %%mm2 \n\t"\
  1244. "movq 24(%0), %%mm3 \n\t"\
  1245. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
  1246. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
  1247. "addl %4, %1 \n\t"\
  1248. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
  1249. \
  1250. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
  1251. "addl %4, %1 \n\t"\
  1252. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
  1253. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
  1254. "addl %4, %1 \n\t"\
  1255. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
  1256. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
  1257. "addl %4, %1 \n\t"\
  1258. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
  1259. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
  1260. "addl %4, %1 \n\t"\
  1261. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
  1262. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
  1263. "addl %4, %1 \n\t"\
  1264. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
  1265. \
  1266. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
  1267. "addl %4, %1 \n\t" \
  1268. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
  1269. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
  1270. \
  1271. "addl $136, %0 \n\t"\
  1272. "addl %6, %1 \n\t"\
  1273. "decl %2 \n\t"\
  1274. " jnz 1b \n\t"\
  1275. \
  1276. : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
  1277. : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
  1278. :"memory"\
  1279. );\
  1280. }\
  1281. \
  1282. static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  1283. uint64_t temp[9*4];\
  1284. uint64_t *temp_ptr= temp;\
  1285. int count= 9;\
  1286. \
  1287. /*FIXME unroll */\
  1288. asm volatile(\
  1289. "pxor %%mm7, %%mm7 \n\t"\
  1290. "1: \n\t"\
  1291. "movq (%0), %%mm0 \n\t"\
  1292. "movq (%0), %%mm1 \n\t"\
  1293. "punpcklbw %%mm7, %%mm0 \n\t"\
  1294. "punpckhbw %%mm7, %%mm1 \n\t"\
  1295. "movq %%mm0, (%1) \n\t"\
  1296. "movq %%mm1, 9*8(%1) \n\t"\
  1297. "addl $8, %1 \n\t"\
  1298. "addl %3, %0 \n\t"\
  1299. "decl %2 \n\t"\
  1300. " jnz 1b \n\t"\
  1301. : "+r" (src), "+r" (temp_ptr), "+r"(count)\
  1302. : "r" (srcStride)\
  1303. : "memory"\
  1304. );\
  1305. \
  1306. temp_ptr= temp;\
  1307. count=2;\
  1308. \
  1309. /*FIXME reorder for speed */\
  1310. asm volatile(\
  1311. /*"pxor %%mm7, %%mm7 \n\t"*/\
  1312. "1: \n\t"\
  1313. "movq (%0), %%mm0 \n\t"\
  1314. "movq 8(%0), %%mm1 \n\t"\
  1315. "movq 16(%0), %%mm2 \n\t"\
  1316. "movq 24(%0), %%mm3 \n\t"\
  1317. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
  1318. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
  1319. "addl %4, %1 \n\t"\
  1320. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
  1321. \
  1322. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
  1323. "addl %4, %1 \n\t"\
  1324. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
  1325. \
  1326. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
  1327. "addl %4, %1 \n\t"\
  1328. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
  1329. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
  1330. \
  1331. "addl $72, %0 \n\t"\
  1332. "addl %6, %1 \n\t"\
  1333. "decl %2 \n\t"\
  1334. " jnz 1b \n\t"\
  1335. \
  1336. : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
  1337. : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
  1338. : "memory"\
  1339. );\
  1340. }\
  1341. \
  1342. static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
  1343. OPNAME ## pixels8_mmx(dst, src, stride, 8);\
  1344. }\
  1345. \
  1346. static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1347. uint64_t temp[8];\
  1348. uint8_t * const half= (uint8_t*)temp;\
  1349. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
  1350. OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
  1351. }\
  1352. \
  1353. static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1354. OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
  1355. }\
  1356. \
  1357. static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1358. uint64_t temp[8];\
  1359. uint8_t * const half= (uint8_t*)temp;\
  1360. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
  1361. OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
  1362. }\
  1363. \
  1364. static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1365. uint64_t temp[8];\
  1366. uint8_t * const half= (uint8_t*)temp;\
  1367. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
  1368. OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
  1369. }\
  1370. \
  1371. static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1372. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
  1373. }\
  1374. \
  1375. static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1376. uint64_t temp[8];\
  1377. uint8_t * const half= (uint8_t*)temp;\
  1378. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
  1379. OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
  1380. }\
  1381. static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1382. uint64_t half[8 + 9];\
  1383. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1384. uint8_t * const halfHV= ((uint8_t*)half);\
  1385. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1386. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1387. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1388. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1389. }\
  1390. static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1391. uint64_t half[8 + 9];\
  1392. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1393. uint8_t * const halfHV= ((uint8_t*)half);\
  1394. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1395. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1396. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1397. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1398. }\
  1399. static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1400. uint64_t half[8 + 9];\
  1401. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1402. uint8_t * const halfHV= ((uint8_t*)half);\
  1403. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1404. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1405. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1406. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1407. }\
  1408. static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1409. uint64_t half[8 + 9];\
  1410. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1411. uint8_t * const halfHV= ((uint8_t*)half);\
  1412. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1413. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1414. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1415. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1416. }\
  1417. static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1418. uint64_t half[8 + 9];\
  1419. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1420. uint8_t * const halfHV= ((uint8_t*)half);\
  1421. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1422. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1423. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1424. }\
  1425. static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1426. uint64_t half[8 + 9];\
  1427. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1428. uint8_t * const halfHV= ((uint8_t*)half);\
  1429. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1430. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1431. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1432. }\
  1433. static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1434. uint64_t half[8 + 9];\
  1435. uint8_t * const halfH= ((uint8_t*)half);\
  1436. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1437. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1438. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1439. }\
  1440. static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1441. uint64_t half[8 + 9];\
  1442. uint8_t * const halfH= ((uint8_t*)half);\
  1443. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1444. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1445. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1446. }\
  1447. static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1448. uint64_t half[9];\
  1449. uint8_t * const halfH= ((uint8_t*)half);\
  1450. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1451. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1452. }\
  1453. static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
  1454. OPNAME ## pixels16_mmx(dst, src, stride, 16);\
  1455. }\
  1456. \
  1457. static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1458. uint64_t temp[32];\
  1459. uint8_t * const half= (uint8_t*)temp;\
  1460. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
  1461. OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
  1462. }\
  1463. \
  1464. static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1465. OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
  1466. }\
  1467. \
  1468. static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1469. uint64_t temp[32];\
  1470. uint8_t * const half= (uint8_t*)temp;\
  1471. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
  1472. OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
  1473. }\
  1474. \
  1475. static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1476. uint64_t temp[32];\
  1477. uint8_t * const half= (uint8_t*)temp;\
  1478. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
  1479. OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
  1480. }\
  1481. \
  1482. static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1483. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
  1484. }\
  1485. \
  1486. static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1487. uint64_t temp[32];\
  1488. uint8_t * const half= (uint8_t*)temp;\
  1489. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
  1490. OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
  1491. }\
  1492. static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1493. uint64_t half[16*2 + 17*2]; /* 16x16 halfHV at offset 0, 16x17 halfH at offset 256 */\
  1494. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1495. uint8_t * const halfHV= ((uint8_t*)half);\
  1496. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1497. put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
  1498. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1499. OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
  1500. }\
  1501. static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1502. uint64_t half[16*2 + 17*2];\
  1503. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1504. uint8_t * const halfHV= ((uint8_t*)half);\
  1505. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1506. put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
  1507. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1508. OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
  1509. }\
  1510. static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1511. uint64_t half[16*2 + 17*2];\
  1512. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1513. uint8_t * const halfHV= ((uint8_t*)half);\
  1514. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1515. put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
  1516. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1517. OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
  1518. }\
  1519. static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1520. uint64_t half[16*2 + 17*2];\
  1521. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1522. uint8_t * const halfHV= ((uint8_t*)half);\
  1523. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1524. put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
  1525. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1526. OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
  1527. }\
  1528. static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1529. uint64_t half[16*2 + 17*2];\
  1530. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1531. uint8_t * const halfHV= ((uint8_t*)half);\
  1532. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1533. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1534. OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
  1535. }\
  1536. static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1537. uint64_t half[16*2 + 17*2];\
  1538. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1539. uint8_t * const halfHV= ((uint8_t*)half);\
  1540. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1541. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1542. OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
  1543. }\
  1544. static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1545. uint64_t half[17*2];\
  1546. uint8_t * const halfH= ((uint8_t*)half);\
  1547. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1548. put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
  1549. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  1550. }\
  1551. static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1552. uint64_t half[17*2];\
  1553. uint8_t * const halfH= ((uint8_t*)half);\
  1554. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1555. put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
  1556. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  1557. }\
  1558. static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1559. uint64_t half[17*2];\
  1560. uint8_t * const halfH= ((uint8_t*)half);\
  1561. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1562. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  1563. }
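/* Store operations plugged into the qpel templates: PUT_OP writes the computed
   pixels straight to the destination, while the AVG variants first load the
   destination, average it with the result (pavgusb on 3DNow!, pavgb on MMX2)
   and store that back, giving the avg_ flavour of each function. */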
  1564. #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
  1565. #define AVG_3DNOW_OP(a,b,temp, size) \
  1566. "mov" #size " " #b ", " #temp " \n\t"\
  1567. "pavgusb " #temp ", " #a " \n\t"\
  1568. "mov" #size " " #a ", " #b " \n\t"
  1569. #define AVG_MMX2_OP(a,b,temp, size) \
  1570. "mov" #size " " #b ", " #temp " \n\t"\
  1571. "pavgb " #temp ", " #a " \n\t"\
  1572. "mov" #size " " #a ", " #b " \n\t"
  1573. QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
  1574. QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
  1575. QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
  1576. QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
  1577. QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
  1578. QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
  1579. QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
  1580. QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
  1581. QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
  1582. #if 0
  1583. static void just_return() { return; }
  1584. #endif
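/* Installs the put/put_no_rnd/avg variants of one qpel motion-compensation
   function into the corresponding DSPContext table entries. */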
  1585. #define SET_QPEL_FUNC(postfix1, postfix2) \
  1586. c->put_ ## postfix1 = put_ ## postfix2;\
  1587. c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
  1588. c->avg_ ## postfix1 = avg_ ## postfix2;
  1589. /* external functions, from idct_mmx.c */
  1590. void ff_mmx_idct(DCTELEM *block);
  1591. void ff_mmxext_idct(DCTELEM *block);
  1592. /* XXX: these wrapper functions should be removed as soon as all IDCTs are
  1593. converted */
  1594. static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
  1595. {
  1596. ff_mmx_idct (block);
  1597. put_pixels_clamped_mmx(block, dest, line_size);
  1598. }
  1599. static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
  1600. {
  1601. ff_mmx_idct (block);
  1602. add_pixels_clamped_mmx(block, dest, line_size);
  1603. }
  1604. static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
  1605. {
  1606. ff_mmxext_idct (block);
  1607. put_pixels_clamped_mmx(block, dest, line_size);
  1608. }
  1609. static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
  1610. {
  1611. ff_mmxext_idct (block);
  1612. add_pixels_clamped_mmx(block, dest, line_size);
  1613. }
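/* Fill the DSPContext function pointers with the MMX/MMX2/3DNow!
   implementations, according to the CPU features reported by mm_support()
   and the codec flags. */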
  1614. void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  1615. {
  1616. mm_flags = mm_support();
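/* dsp_mask lets the application override CPU detection: with FF_MM_FORCE set,
   the masked capability bits are forced on, otherwise they are cleared from
   mm_flags. */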
  1617. if (avctx->dsp_mask) {
  1618. if (avctx->dsp_mask & FF_MM_FORCE)
  1619. mm_flags |= (avctx->dsp_mask & 0xffff);
  1620. else
  1621. mm_flags &= ~(avctx->dsp_mask & 0xffff);
  1622. }
  1623. #if 0
  1624. fprintf(stderr, "libavcodec: CPU flags:");
  1625. if (mm_flags & MM_MMX)
  1626. fprintf(stderr, " mmx");
  1627. if (mm_flags & MM_MMXEXT)
  1628. fprintf(stderr, " mmxext");
  1629. if (mm_flags & MM_3DNOW)
  1630. fprintf(stderr, " 3dnow");
  1631. if (mm_flags & MM_SSE)
  1632. fprintf(stderr, " sse");
  1633. if (mm_flags & MM_SSE2)
  1634. fprintf(stderr, " sse2");
  1635. fprintf(stderr, "\n");
  1636. #endif
  1637. if (mm_flags & MM_MMX) {
  1638. const int dct_algo = avctx->dct_algo;
  1639. const int idct_algo= avctx->idct_algo;
  1640. #ifdef CONFIG_ENCODERS
  1641. if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
  1642. if(mm_flags & MM_MMXEXT){
  1643. c->fdct = ff_fdct_mmx2;
  1644. }else{
  1645. c->fdct = ff_fdct_mmx;
  1646. }
  1647. }
  1648. #endif //CONFIG_ENCODERS
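/* Each IDCT keeps its own coefficient order: the simple MMX IDCT uses
   FF_SIMPLE_IDCT_PERM, the libmpeg2-style MMX/MMXEXT IDCT uses
   FF_LIBMPEG2_IDCT_PERM; the permutation type lets the rest of libavcodec
   reorder scan tables and quantisation matrices to match. */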
  1649. if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
  1650. c->idct_put= ff_simple_idct_put_mmx;
  1651. c->idct_add= ff_simple_idct_add_mmx;
  1652. c->idct = ff_simple_idct_mmx;
  1653. c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
  1654. }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
  1655. if(mm_flags & MM_MMXEXT){
  1656. c->idct_put= ff_libmpeg2mmx2_idct_put;
  1657. c->idct_add= ff_libmpeg2mmx2_idct_add;
  1658. c->idct = ff_mmxext_idct;
  1659. }else{
  1660. c->idct_put= ff_libmpeg2mmx_idct_put;
  1661. c->idct_add= ff_libmpeg2mmx_idct_add;
  1662. c->idct = ff_mmx_idct;
  1663. }
  1664. c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
  1665. }
  1666. #ifdef CONFIG_ENCODERS
  1667. c->get_pixels = get_pixels_mmx;
  1668. c->diff_pixels = diff_pixels_mmx;
  1669. #endif //CONFIG_ENCODERS
  1670. c->put_pixels_clamped = put_pixels_clamped_mmx;
  1671. c->add_pixels_clamped = add_pixels_clamped_mmx;
  1672. c->clear_blocks = clear_blocks_mmx;
  1673. #ifdef CONFIG_ENCODERS
  1674. c->pix_sum = pix_sum16_mmx;
  1675. #endif //CONFIG_ENCODERS
  1676. c->put_pixels_tab[0][0] = put_pixels16_mmx;
  1677. c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
  1678. c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
  1679. c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
  1680. c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
  1681. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
  1682. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
  1683. c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
  1684. c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
  1685. c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
  1686. c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
  1687. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
  1688. c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
  1689. c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
  1690. c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
  1691. c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
  1692. c->put_pixels_tab[1][0] = put_pixels8_mmx;
  1693. c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
  1694. c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
  1695. c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
  1696. c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
  1697. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
  1698. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
  1699. c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
  1700. c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
  1701. c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
  1702. c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
  1703. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
  1704. c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
  1705. c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
  1706. c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
  1707. c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
  1708. c->add_bytes= add_bytes_mmx;
  1709. #ifdef CONFIG_ENCODERS
  1710. c->diff_bytes= diff_bytes_mmx;
  1711. c->hadamard8_diff[0]= hadamard8_diff16_mmx;
  1712. c->hadamard8_diff[1]= hadamard8_diff_mmx;
  1713. c->pix_norm1 = pix_norm1_mmx;
  1714. c->sse[0] = sse16_mmx;
  1715. #endif //CONFIG_ENCODERS
  1716. c->h263_v_loop_filter= h263_v_loop_filter_mmx;
  1717. c->h263_h_loop_filter= h263_h_loop_filter_mmx;
  1718. if (mm_flags & MM_MMXEXT) {
  1719. c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
  1720. c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
  1721. c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
  1722. c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
  1723. c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
  1724. c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
  1725. c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
  1726. c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
  1727. c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
  1728. c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
  1729. #ifdef CONFIG_ENCODERS
  1730. c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
  1731. c->hadamard8_diff[1]= hadamard8_diff_mmx2;
  1732. #endif //CONFIG_ENCODERS
  1733. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  1734. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
  1735. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
  1736. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
  1737. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
  1738. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
  1739. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
  1740. }
  1741. #if 1
  1742. SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
  1743. SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
  1744. SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
  1745. SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
  1746. SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
  1747. SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
  1748. SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
  1749. SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
  1750. SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
  1751. SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
  1752. SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
  1753. SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
  1754. SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
  1755. SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
  1756. SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
  1757. SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
  1758. SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
  1759. SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
  1760. SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
  1761. SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
  1762. SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
  1763. SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
  1764. SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
  1765. SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
  1766. SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
  1767. SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
  1768. SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
  1769. SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
  1770. SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
  1771. SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
  1772. SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
  1773. SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
  1774. #endif
  1775. #ifdef CONFIG_ENCODERS
  1776. c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
  1777. #endif //CONFIG_ENCODERS
  1778. } else if (mm_flags & MM_3DNOW) {
  1779. c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
  1780. c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
  1781. c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
  1782. c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
  1783. c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
  1784. c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
  1785. c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
  1786. c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
  1787. c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
  1788. c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
  1789. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  1790. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
  1791. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
  1792. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
  1793. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
  1794. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
  1795. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
  1796. }
  1797. SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
  1798. SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
  1799. SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
  1800. SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
  1801. SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
  1802. SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
  1803. SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
  1804. SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
  1805. SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
  1806. SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
  1807. SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
  1808. SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
  1809. SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
  1810. SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
  1811. SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
  1812. SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
  1813. SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
  1814. SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
  1815. SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
  1816. SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
  1817. SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
  1818. SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
  1819. SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
  1820. SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
  1821. SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
  1822. SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
  1823. SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
  1824. SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
  1825. SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
  1826. SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
  1827. SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
  1828. SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
  1829. }
  1830. }
  1831. #ifdef CONFIG_ENCODERS
  1832. dsputil_init_pix_mmx(c, avctx);
  1833. #endif //CONFIG_ENCODERS
  1834. #if 0
  1835. // for speed testing
  1836. get_pixels = just_return;
  1837. put_pixels_clamped = just_return;
  1838. add_pixels_clamped = just_return;
  1839. pix_abs16x16 = just_return;
  1840. pix_abs16x16_x2 = just_return;
  1841. pix_abs16x16_y2 = just_return;
  1842. pix_abs16x16_xy2 = just_return;
  1843. put_pixels_tab[0] = just_return;
  1844. put_pixels_tab[1] = just_return;
  1845. put_pixels_tab[2] = just_return;
  1846. put_pixels_tab[3] = just_return;
  1847. put_no_rnd_pixels_tab[0] = just_return;
  1848. put_no_rnd_pixels_tab[1] = just_return;
  1849. put_no_rnd_pixels_tab[2] = just_return;
  1850. put_no_rnd_pixels_tab[3] = just_return;
  1851. avg_pixels_tab[0] = just_return;
  1852. avg_pixels_tab[1] = just_return;
  1853. avg_pixels_tab[2] = just_return;
  1854. avg_pixels_tab[3] = just_return;
  1855. avg_no_rnd_pixels_tab[0] = just_return;
  1856. avg_no_rnd_pixels_tab[1] = just_return;
  1857. avg_no_rnd_pixels_tab[2] = just_return;
  1858. avg_no_rnd_pixels_tab[3] = just_return;
  1859. //av_fdct = just_return;
  1860. //ff_idct = just_return;
  1861. #endif
  1862. }