  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. *
  20. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  21. */
  22. #include "../dsputil.h"
  23. #include "../simple_idct.h"
  24. #include "../mpegvideo.h"
  25. #include "mmx.h"
  26. //#undef NDEBUG
  27. //#include <assert.h>
  28. extern const uint8_t ff_h263_loop_filter_strength[32];
  29. extern void ff_idct_xvid_mmx(short *block);
  30. extern void ff_idct_xvid_mmx2(short *block);
  31. int mm_flags; /* multimedia extension flags */
  32. /* pixel operations */
  33. static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
  34. static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
  35. static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
  36. static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
  37. static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
  38. static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
  39. static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
  40. static const uint64_t ff_pw_8 attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL;
  41. static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
  42. static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
  43. static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
  44. static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
  45. static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
  46. static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
  47. #define JUMPALIGN() __asm __volatile (".balign 8"::)
  48. #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
  49. #define MOVQ_WONE(regd) \
  50. __asm __volatile ( \
  51. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  52. "psrlw $15, %%" #regd ::)
  53. #define MOVQ_BFE(regd) \
  54. __asm __volatile ( \
  55. "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
  56. "paddb %%" #regd ", %%" #regd " \n\t" ::)
  57. #ifndef PIC
  58. #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
  59. #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
  60. #else
  61. // in shared-library (PIC) builds it is better to generate these constants
  62. // in registers than to load them from memory; pcmpeqd sets a register to -1
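// Reference for the register-only constant sequences below (a sketch for
// clarity, derived from the instructions themselves):
//   pcmpeqd r,r   -> r = 0xFFFFFFFFFFFFFFFF
//   psrlw  $15,r  -> every 16-bit word becomes 0x0001
//   packuswb r,r  -> every byte becomes 0x01  (mm_bone)
//   psllw   $1,r  -> every word becomes 0x0002 (mm_wtwo)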
  63. #define MOVQ_BONE(regd) \
  64. __asm __volatile ( \
  65. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  66. "psrlw $15, %%" #regd " \n\t" \
  67. "packuswb %%" #regd ", %%" #regd " \n\t" ::)
  68. #define MOVQ_WTWO(regd) \
  69. __asm __volatile ( \
  70. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  71. "psrlw $15, %%" #regd " \n\t" \
  72. "psllw $1, %%" #regd " \n\t"::)
  73. #endif
  74. // regr is used as a temporary and holds the output result
  75. // the first argument is unmodified, the second is trashed
  76. // regfe is supposed to contain 0xfefefefefefefefe
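// Scalar reference for the byte-averaging trick used below (a sketch for
// clarity): for each byte lane,
//   no rounding: (a + b) >> 1     == (a & b) + (((a ^ b) & 0xFE) >> 1)
//   rounding:    (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xFE) >> 1)
// the 0xFE mask (regfe) keeps the 64-bit psrlq from leaking bits across lanes.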
  77. #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
  78. "movq " #rega ", " #regr " \n\t"\
  79. "pand " #regb ", " #regr " \n\t"\
  80. "pxor " #rega ", " #regb " \n\t"\
  81. "pand " #regfe "," #regb " \n\t"\
  82. "psrlq $1, " #regb " \n\t"\
  83. "paddb " #regb ", " #regr " \n\t"
  84. #define PAVGB_MMX(rega, regb, regr, regfe) \
  85. "movq " #rega ", " #regr " \n\t"\
  86. "por " #regb ", " #regr " \n\t"\
  87. "pxor " #rega ", " #regb " \n\t"\
  88. "pand " #regfe "," #regb " \n\t"\
  89. "psrlq $1, " #regb " \n\t"\
  90. "psubb " #regb ", " #regr " \n\t"
  91. // mm6 is supposed to contain 0xfefefefefefefefe
  92. #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
  93. "movq " #rega ", " #regr " \n\t"\
  94. "movq " #regc ", " #regp " \n\t"\
  95. "pand " #regb ", " #regr " \n\t"\
  96. "pand " #regd ", " #regp " \n\t"\
  97. "pxor " #rega ", " #regb " \n\t"\
  98. "pxor " #regc ", " #regd " \n\t"\
  99. "pand %%mm6, " #regb " \n\t"\
  100. "pand %%mm6, " #regd " \n\t"\
  101. "psrlq $1, " #regb " \n\t"\
  102. "psrlq $1, " #regd " \n\t"\
  103. "paddb " #regb ", " #regr " \n\t"\
  104. "paddb " #regd ", " #regp " \n\t"
  105. #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
  106. "movq " #rega ", " #regr " \n\t"\
  107. "movq " #regc ", " #regp " \n\t"\
  108. "por " #regb ", " #regr " \n\t"\
  109. "por " #regd ", " #regp " \n\t"\
  110. "pxor " #rega ", " #regb " \n\t"\
  111. "pxor " #regc ", " #regd " \n\t"\
  112. "pand %%mm6, " #regb " \n\t"\
  113. "pand %%mm6, " #regd " \n\t"\
  114. "psrlq $1, " #regd " \n\t"\
  115. "psrlq $1, " #regb " \n\t"\
  116. "psubb " #regb ", " #regr " \n\t"\
  117. "psubb " #regd ", " #regp " \n\t"
  118. /***********************************/
  119. /* MMX no rounding */
  120. #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
  121. #define SET_RND MOVQ_WONE
  122. #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
  123. #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
  124. #include "dsputil_mmx_rnd.h"
  125. #undef DEF
  126. #undef SET_RND
  127. #undef PAVGBP
  128. #undef PAVGB
  129. /***********************************/
  130. /* MMX rounding */
  131. #define DEF(x, y) x ## _ ## y ##_mmx
  132. #define SET_RND MOVQ_WTWO
  133. #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
  134. #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
  135. #include "dsputil_mmx_rnd.h"
  136. #undef DEF
  137. #undef SET_RND
  138. #undef PAVGBP
  139. #undef PAVGB
  140. /***********************************/
  141. /* 3Dnow specific */
  142. #define DEF(x) x ## _3dnow
  143. /* on Athlon, PAVGUSB is preferred */
  144. #define PAVGB "pavgusb"
  145. #include "dsputil_mmx_avg.h"
  146. #undef DEF
  147. #undef PAVGB
  148. /***********************************/
  149. /* MMX2 specific */
  150. #define DEF(x) x ## _mmx2
  151. /* PAVGB was introduced with the MMX2 instruction set */
  152. #define PAVGB "pavgb"
  153. #include "dsputil_mmx_avg.h"
  154. #undef DEF
  155. #undef PAVGB
  156. /***********************************/
  157. /* standard MMX */
  158. #ifdef CONFIG_ENCODERS
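/* get_pixels: expand an 8x8 block of unsigned bytes into 16-bit DCT
 * coefficients; a scalar sketch of what the MMX loop below computes:
 *   for (i = 0; i < 8; i++)
 *       for (j = 0; j < 8; j++)
 *           block[i*8 + j] = pixels[i*line_size + j];
 */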
  159. static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
  160. {
  161. asm volatile(
  162. "mov $-128, %%"REG_a" \n\t"
  163. "pxor %%mm7, %%mm7 \n\t"
  164. ".balign 16 \n\t"
  165. "1: \n\t"
  166. "movq (%0), %%mm0 \n\t"
  167. "movq (%0, %2), %%mm2 \n\t"
  168. "movq %%mm0, %%mm1 \n\t"
  169. "movq %%mm2, %%mm3 \n\t"
  170. "punpcklbw %%mm7, %%mm0 \n\t"
  171. "punpckhbw %%mm7, %%mm1 \n\t"
  172. "punpcklbw %%mm7, %%mm2 \n\t"
  173. "punpckhbw %%mm7, %%mm3 \n\t"
  174. "movq %%mm0, (%1, %%"REG_a") \n\t"
  175. "movq %%mm1, 8(%1, %%"REG_a") \n\t"
  176. "movq %%mm2, 16(%1, %%"REG_a") \n\t"
  177. "movq %%mm3, 24(%1, %%"REG_a") \n\t"
  178. "add %3, %0 \n\t"
  179. "add $32, %%"REG_a" \n\t"
  180. "js 1b \n\t"
  181. : "+r" (pixels)
  182. : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
  183. : "%"REG_a
  184. );
  185. }
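/* diff_pixels: 8x8 byte difference expanded to 16-bit coefficients; a scalar
 * sketch of what the loop below computes:
 *   for (i = 0; i < 8; i++)
 *       for (j = 0; j < 8; j++)
 *           block[i*8 + j] = s1[i*stride + j] - s2[i*stride + j];
 */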
  186. static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
  187. {
  188. asm volatile(
  189. "pxor %%mm7, %%mm7 \n\t"
  190. "mov $-128, %%"REG_a" \n\t"
  191. ".balign 16 \n\t"
  192. "1: \n\t"
  193. "movq (%0), %%mm0 \n\t"
  194. "movq (%1), %%mm2 \n\t"
  195. "movq %%mm0, %%mm1 \n\t"
  196. "movq %%mm2, %%mm3 \n\t"
  197. "punpcklbw %%mm7, %%mm0 \n\t"
  198. "punpckhbw %%mm7, %%mm1 \n\t"
  199. "punpcklbw %%mm7, %%mm2 \n\t"
  200. "punpckhbw %%mm7, %%mm3 \n\t"
  201. "psubw %%mm2, %%mm0 \n\t"
  202. "psubw %%mm3, %%mm1 \n\t"
  203. "movq %%mm0, (%2, %%"REG_a") \n\t"
  204. "movq %%mm1, 8(%2, %%"REG_a") \n\t"
  205. "add %3, %0 \n\t"
  206. "add %3, %1 \n\t"
  207. "add $16, %%"REG_a" \n\t"
  208. "jnz 1b \n\t"
  209. : "+r" (s1), "+r" (s2)
  210. : "r" (block+64), "r" ((long)stride)
  211. : "%"REG_a
  212. );
  213. }
  214. #endif //CONFIG_ENCODERS
  215. void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
  216. {
  217. const DCTELEM *p;
  218. uint8_t *pix;
  219. /* read the pixels */
  220. p = block;
  221. pix = pixels;
  222. /* unrolled loop */
  223. __asm __volatile(
  224. "movq %3, %%mm0 \n\t"
  225. "movq 8%3, %%mm1 \n\t"
  226. "movq 16%3, %%mm2 \n\t"
  227. "movq 24%3, %%mm3 \n\t"
  228. "movq 32%3, %%mm4 \n\t"
  229. "movq 40%3, %%mm5 \n\t"
  230. "movq 48%3, %%mm6 \n\t"
  231. "movq 56%3, %%mm7 \n\t"
  232. "packuswb %%mm1, %%mm0 \n\t"
  233. "packuswb %%mm3, %%mm2 \n\t"
  234. "packuswb %%mm5, %%mm4 \n\t"
  235. "packuswb %%mm7, %%mm6 \n\t"
  236. "movq %%mm0, (%0) \n\t"
  237. "movq %%mm2, (%0, %1) \n\t"
  238. "movq %%mm4, (%0, %1, 2) \n\t"
  239. "movq %%mm6, (%0, %2) \n\t"
  240. ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
  241. :"memory");
  242. pix += line_size*4;
  243. p += 32;
  244. // if this were an exact copy of the code above, the compiler
  245. // would generate some very strange code, so the block pointer
  246. // is passed via an "r" constraint here instead of "m"
  247. __asm __volatile(
  248. "movq (%3), %%mm0 \n\t"
  249. "movq 8(%3), %%mm1 \n\t"
  250. "movq 16(%3), %%mm2 \n\t"
  251. "movq 24(%3), %%mm3 \n\t"
  252. "movq 32(%3), %%mm4 \n\t"
  253. "movq 40(%3), %%mm5 \n\t"
  254. "movq 48(%3), %%mm6 \n\t"
  255. "movq 56(%3), %%mm7 \n\t"
  256. "packuswb %%mm1, %%mm0 \n\t"
  257. "packuswb %%mm3, %%mm2 \n\t"
  258. "packuswb %%mm5, %%mm4 \n\t"
  259. "packuswb %%mm7, %%mm6 \n\t"
  260. "movq %%mm0, (%0) \n\t"
  261. "movq %%mm2, (%0, %1) \n\t"
  262. "movq %%mm4, (%0, %1, 2) \n\t"
  263. "movq %%mm6, (%0, %2) \n\t"
  264. ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
  265. :"memory");
  266. }
  267. static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  268. { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
  269. void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
  270. {
  271. int i;
  272. movq_m2r(*vector128, mm1);
  273. for (i = 0; i < 8; i++) {
  274. movq_m2r(*(block), mm0);
  275. packsswb_m2r(*(block + 4), mm0);
  276. block += 8;
  277. paddb_r2r(mm1, mm0);
  278. movq_r2m(mm0, *pixels);
  279. pixels += line_size;
  280. }
  281. }
  282. void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
  283. {
  284. const DCTELEM *p;
  285. uint8_t *pix;
  286. int i;
  287. /* read the pixels */
  288. p = block;
  289. pix = pixels;
  290. MOVQ_ZERO(mm7);
  291. i = 4;
  292. do {
  293. __asm __volatile(
  294. "movq (%2), %%mm0 \n\t"
  295. "movq 8(%2), %%mm1 \n\t"
  296. "movq 16(%2), %%mm2 \n\t"
  297. "movq 24(%2), %%mm3 \n\t"
  298. "movq %0, %%mm4 \n\t"
  299. "movq %1, %%mm6 \n\t"
  300. "movq %%mm4, %%mm5 \n\t"
  301. "punpcklbw %%mm7, %%mm4 \n\t"
  302. "punpckhbw %%mm7, %%mm5 \n\t"
  303. "paddsw %%mm4, %%mm0 \n\t"
  304. "paddsw %%mm5, %%mm1 \n\t"
  305. "movq %%mm6, %%mm5 \n\t"
  306. "punpcklbw %%mm7, %%mm6 \n\t"
  307. "punpckhbw %%mm7, %%mm5 \n\t"
  308. "paddsw %%mm6, %%mm2 \n\t"
  309. "paddsw %%mm5, %%mm3 \n\t"
  310. "packuswb %%mm1, %%mm0 \n\t"
  311. "packuswb %%mm3, %%mm2 \n\t"
  312. "movq %%mm0, %0 \n\t"
  313. "movq %%mm2, %1 \n\t"
  314. :"+m"(*pix), "+m"(*(pix+line_size))
  315. :"r"(p)
  316. :"memory");
  317. pix += line_size*2;
  318. p += 16;
  319. } while (--i);
  320. }
  321. static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  322. {
  323. __asm __volatile(
  324. "lea (%3, %3), %%"REG_a" \n\t"
  325. ".balign 8 \n\t"
  326. "1: \n\t"
  327. "movd (%1), %%mm0 \n\t"
  328. "movd (%1, %3), %%mm1 \n\t"
  329. "movd %%mm0, (%2) \n\t"
  330. "movd %%mm1, (%2, %3) \n\t"
  331. "add %%"REG_a", %1 \n\t"
  332. "add %%"REG_a", %2 \n\t"
  333. "movd (%1), %%mm0 \n\t"
  334. "movd (%1, %3), %%mm1 \n\t"
  335. "movd %%mm0, (%2) \n\t"
  336. "movd %%mm1, (%2, %3) \n\t"
  337. "add %%"REG_a", %1 \n\t"
  338. "add %%"REG_a", %2 \n\t"
  339. "subl $4, %0 \n\t"
  340. "jnz 1b \n\t"
  341. : "+g"(h), "+r" (pixels), "+r" (block)
  342. : "r"((long)line_size)
  343. : "%"REG_a, "memory"
  344. );
  345. }
  346. static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  347. {
  348. __asm __volatile(
  349. "lea (%3, %3), %%"REG_a" \n\t"
  350. ".balign 8 \n\t"
  351. "1: \n\t"
  352. "movq (%1), %%mm0 \n\t"
  353. "movq (%1, %3), %%mm1 \n\t"
  354. "movq %%mm0, (%2) \n\t"
  355. "movq %%mm1, (%2, %3) \n\t"
  356. "add %%"REG_a", %1 \n\t"
  357. "add %%"REG_a", %2 \n\t"
  358. "movq (%1), %%mm0 \n\t"
  359. "movq (%1, %3), %%mm1 \n\t"
  360. "movq %%mm0, (%2) \n\t"
  361. "movq %%mm1, (%2, %3) \n\t"
  362. "add %%"REG_a", %1 \n\t"
  363. "add %%"REG_a", %2 \n\t"
  364. "subl $4, %0 \n\t"
  365. "jnz 1b \n\t"
  366. : "+g"(h), "+r" (pixels), "+r" (block)
  367. : "r"((long)line_size)
  368. : "%"REG_a, "memory"
  369. );
  370. }
  371. static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  372. {
  373. __asm __volatile(
  374. "lea (%3, %3), %%"REG_a" \n\t"
  375. ".balign 8 \n\t"
  376. "1: \n\t"
  377. "movq (%1), %%mm0 \n\t"
  378. "movq 8(%1), %%mm4 \n\t"
  379. "movq (%1, %3), %%mm1 \n\t"
  380. "movq 8(%1, %3), %%mm5 \n\t"
  381. "movq %%mm0, (%2) \n\t"
  382. "movq %%mm4, 8(%2) \n\t"
  383. "movq %%mm1, (%2, %3) \n\t"
  384. "movq %%mm5, 8(%2, %3) \n\t"
  385. "add %%"REG_a", %1 \n\t"
  386. "add %%"REG_a", %2 \n\t"
  387. "movq (%1), %%mm0 \n\t"
  388. "movq 8(%1), %%mm4 \n\t"
  389. "movq (%1, %3), %%mm1 \n\t"
  390. "movq 8(%1, %3), %%mm5 \n\t"
  391. "movq %%mm0, (%2) \n\t"
  392. "movq %%mm4, 8(%2) \n\t"
  393. "movq %%mm1, (%2, %3) \n\t"
  394. "movq %%mm5, 8(%2, %3) \n\t"
  395. "add %%"REG_a", %1 \n\t"
  396. "add %%"REG_a", %2 \n\t"
  397. "subl $4, %0 \n\t"
  398. "jnz 1b \n\t"
  399. : "+g"(h), "+r" (pixels), "+r" (block)
  400. : "r"((long)line_size)
  401. : "%"REG_a, "memory"
  402. );
  403. }
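/* clear_blocks: zero six 64-coefficient DCTELEM blocks (6*64*2 = 768 bytes),
 * 32 bytes per iteration, indexing backwards from blocks + 768 */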
  404. static void clear_blocks_mmx(DCTELEM *blocks)
  405. {
  406. __asm __volatile(
  407. "pxor %%mm7, %%mm7 \n\t"
  408. "mov $-128*6, %%"REG_a" \n\t"
  409. "1: \n\t"
  410. "movq %%mm7, (%0, %%"REG_a") \n\t"
  411. "movq %%mm7, 8(%0, %%"REG_a") \n\t"
  412. "movq %%mm7, 16(%0, %%"REG_a") \n\t"
  413. "movq %%mm7, 24(%0, %%"REG_a") \n\t"
  414. "add $32, %%"REG_a" \n\t"
  415. " js 1b \n\t"
  416. : : "r" (((uint8_t *)blocks)+128*6)
  417. : "%"REG_a
  418. );
  419. }
  420. #ifdef CONFIG_ENCODERS
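/* pix_sum16: sum of all pixels of a 16x16 block; a scalar sketch:
 *   sum = 0;
 *   for (i = 0; i < 16; i++)
 *       for (j = 0; j < 16; j++)
 *           sum += pix[i*line_size + j];
 *   return sum;   // the asm keeps only the low 16 bits, which is enough here
 */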
  421. static int pix_sum16_mmx(uint8_t * pix, int line_size){
  422. const int h=16;
  423. int sum;
  424. long index= -line_size*h;
  425. __asm __volatile(
  426. "pxor %%mm7, %%mm7 \n\t"
  427. "pxor %%mm6, %%mm6 \n\t"
  428. "1: \n\t"
  429. "movq (%2, %1), %%mm0 \n\t"
  430. "movq (%2, %1), %%mm1 \n\t"
  431. "movq 8(%2, %1), %%mm2 \n\t"
  432. "movq 8(%2, %1), %%mm3 \n\t"
  433. "punpcklbw %%mm7, %%mm0 \n\t"
  434. "punpckhbw %%mm7, %%mm1 \n\t"
  435. "punpcklbw %%mm7, %%mm2 \n\t"
  436. "punpckhbw %%mm7, %%mm3 \n\t"
  437. "paddw %%mm0, %%mm1 \n\t"
  438. "paddw %%mm2, %%mm3 \n\t"
  439. "paddw %%mm1, %%mm3 \n\t"
  440. "paddw %%mm3, %%mm6 \n\t"
  441. "add %3, %1 \n\t"
  442. " js 1b \n\t"
  443. "movq %%mm6, %%mm5 \n\t"
  444. "psrlq $32, %%mm6 \n\t"
  445. "paddw %%mm5, %%mm6 \n\t"
  446. "movq %%mm6, %%mm5 \n\t"
  447. "psrlq $16, %%mm6 \n\t"
  448. "paddw %%mm5, %%mm6 \n\t"
  449. "movd %%mm6, %0 \n\t"
  450. "andl $0xFFFF, %0 \n\t"
  451. : "=&r" (sum), "+r" (index)
  452. : "r" (pix - index), "r" ((long)line_size)
  453. );
  454. return sum;
  455. }
  456. #endif //CONFIG_ENCODERS
  457. static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
  458. long i=0;
  459. asm volatile(
  460. "1: \n\t"
  461. "movq (%1, %0), %%mm0 \n\t"
  462. "movq (%2, %0), %%mm1 \n\t"
  463. "paddb %%mm0, %%mm1 \n\t"
  464. "movq %%mm1, (%2, %0) \n\t"
  465. "movq 8(%1, %0), %%mm0 \n\t"
  466. "movq 8(%2, %0), %%mm1 \n\t"
  467. "paddb %%mm0, %%mm1 \n\t"
  468. "movq %%mm1, 8(%2, %0) \n\t"
  469. "add $16, %0 \n\t"
  470. "cmp %3, %0 \n\t"
  471. " jb 1b \n\t"
  472. : "+r" (i)
  473. : "r"(src), "r"(dst), "r"((long)w-15)
  474. );
  475. for(; i<w; i++)
  476. dst[i+0] += src[i+0];
  477. }
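/* H263_LOOP_FILTER implements the H.263 deblocking filter across one
 * 8-pixel-wide block edge: %0..%3 are the four lines (or transposed columns)
 * around the edge, %4 is 2*strength from ff_h263_loop_filter_strength and
 * %5 is the ff_pb_FC mask; the filtered lines are left in mm5, mm3, mm4 and
 * mm6 respectively, which is how the callers below store them back. */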
  478. #define H263_LOOP_FILTER \
  479. "pxor %%mm7, %%mm7 \n\t"\
  480. "movq %0, %%mm0 \n\t"\
  481. "movq %0, %%mm1 \n\t"\
  482. "movq %3, %%mm2 \n\t"\
  483. "movq %3, %%mm3 \n\t"\
  484. "punpcklbw %%mm7, %%mm0 \n\t"\
  485. "punpckhbw %%mm7, %%mm1 \n\t"\
  486. "punpcklbw %%mm7, %%mm2 \n\t"\
  487. "punpckhbw %%mm7, %%mm3 \n\t"\
  488. "psubw %%mm2, %%mm0 \n\t"\
  489. "psubw %%mm3, %%mm1 \n\t"\
  490. "movq %1, %%mm2 \n\t"\
  491. "movq %1, %%mm3 \n\t"\
  492. "movq %2, %%mm4 \n\t"\
  493. "movq %2, %%mm5 \n\t"\
  494. "punpcklbw %%mm7, %%mm2 \n\t"\
  495. "punpckhbw %%mm7, %%mm3 \n\t"\
  496. "punpcklbw %%mm7, %%mm4 \n\t"\
  497. "punpckhbw %%mm7, %%mm5 \n\t"\
  498. "psubw %%mm2, %%mm4 \n\t"\
  499. "psubw %%mm3, %%mm5 \n\t"\
  500. "psllw $2, %%mm4 \n\t"\
  501. "psllw $2, %%mm5 \n\t"\
  502. "paddw %%mm0, %%mm4 \n\t"\
  503. "paddw %%mm1, %%mm5 \n\t"\
  504. "pxor %%mm6, %%mm6 \n\t"\
  505. "pcmpgtw %%mm4, %%mm6 \n\t"\
  506. "pcmpgtw %%mm5, %%mm7 \n\t"\
  507. "pxor %%mm6, %%mm4 \n\t"\
  508. "pxor %%mm7, %%mm5 \n\t"\
  509. "psubw %%mm6, %%mm4 \n\t"\
  510. "psubw %%mm7, %%mm5 \n\t"\
  511. "psrlw $3, %%mm4 \n\t"\
  512. "psrlw $3, %%mm5 \n\t"\
  513. "packuswb %%mm5, %%mm4 \n\t"\
  514. "packsswb %%mm7, %%mm6 \n\t"\
  515. "pxor %%mm7, %%mm7 \n\t"\
  516. "movd %4, %%mm2 \n\t"\
  517. "punpcklbw %%mm2, %%mm2 \n\t"\
  518. "punpcklbw %%mm2, %%mm2 \n\t"\
  519. "punpcklbw %%mm2, %%mm2 \n\t"\
  520. "psubusb %%mm4, %%mm2 \n\t"\
  521. "movq %%mm2, %%mm3 \n\t"\
  522. "psubusb %%mm4, %%mm3 \n\t"\
  523. "psubb %%mm3, %%mm2 \n\t"\
  524. "movq %1, %%mm3 \n\t"\
  525. "movq %2, %%mm4 \n\t"\
  526. "pxor %%mm6, %%mm3 \n\t"\
  527. "pxor %%mm6, %%mm4 \n\t"\
  528. "paddusb %%mm2, %%mm3 \n\t"\
  529. "psubusb %%mm2, %%mm4 \n\t"\
  530. "pxor %%mm6, %%mm3 \n\t"\
  531. "pxor %%mm6, %%mm4 \n\t"\
  532. "paddusb %%mm2, %%mm2 \n\t"\
  533. "packsswb %%mm1, %%mm0 \n\t"\
  534. "pcmpgtb %%mm0, %%mm7 \n\t"\
  535. "pxor %%mm7, %%mm0 \n\t"\
  536. "psubb %%mm7, %%mm0 \n\t"\
  537. "movq %%mm0, %%mm1 \n\t"\
  538. "psubusb %%mm2, %%mm0 \n\t"\
  539. "psubb %%mm0, %%mm1 \n\t"\
  540. "pand %5, %%mm1 \n\t"\
  541. "psrlw $2, %%mm1 \n\t"\
  542. "pxor %%mm7, %%mm1 \n\t"\
  543. "psubb %%mm7, %%mm1 \n\t"\
  544. "movq %0, %%mm5 \n\t"\
  545. "movq %3, %%mm6 \n\t"\
  546. "psubb %%mm1, %%mm5 \n\t"\
  547. "paddb %%mm1, %%mm6 \n\t"
  548. static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
  549. const int strength= ff_h263_loop_filter_strength[qscale];
  550. asm volatile(
  551. H263_LOOP_FILTER
  552. "movq %%mm3, %1 \n\t"
  553. "movq %%mm4, %2 \n\t"
  554. "movq %%mm5, %0 \n\t"
  555. "movq %%mm6, %3 \n\t"
  556. : "+m" (*(uint64_t*)(src - 2*stride)),
  557. "+m" (*(uint64_t*)(src - 1*stride)),
  558. "+m" (*(uint64_t*)(src + 0*stride)),
  559. "+m" (*(uint64_t*)(src + 1*stride))
  560. : "g" (2*strength), "m"(ff_pb_FC)
  561. );
  562. }
  563. static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
  564. asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
  565. "movd %4, %%mm0 \n\t"
  566. "movd %5, %%mm1 \n\t"
  567. "movd %6, %%mm2 \n\t"
  568. "movd %7, %%mm3 \n\t"
  569. "punpcklbw %%mm1, %%mm0 \n\t"
  570. "punpcklbw %%mm3, %%mm2 \n\t"
  571. "movq %%mm0, %%mm1 \n\t"
  572. "punpcklwd %%mm2, %%mm0 \n\t"
  573. "punpckhwd %%mm2, %%mm1 \n\t"
  574. "movd %%mm0, %0 \n\t"
  575. "punpckhdq %%mm0, %%mm0 \n\t"
  576. "movd %%mm0, %1 \n\t"
  577. "movd %%mm1, %2 \n\t"
  578. "punpckhdq %%mm1, %%mm1 \n\t"
  579. "movd %%mm1, %3 \n\t"
  580. : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
  581. "=m" (*(uint32_t*)(dst + 1*dst_stride)),
  582. "=m" (*(uint32_t*)(dst + 2*dst_stride)),
  583. "=m" (*(uint32_t*)(dst + 3*dst_stride))
  584. : "m" (*(uint32_t*)(src + 0*src_stride)),
  585. "m" (*(uint32_t*)(src + 1*src_stride)),
  586. "m" (*(uint32_t*)(src + 2*src_stride)),
  587. "m" (*(uint32_t*)(src + 3*src_stride))
  588. );
  589. }
  590. static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
  591. const int strength= ff_h263_loop_filter_strength[qscale];
  592. uint64_t temp[4] __attribute__ ((aligned(8)));
  593. uint8_t *btemp= (uint8_t*)temp;
  594. src -= 2;
  595. transpose4x4(btemp , src , 8, stride);
  596. transpose4x4(btemp+4, src + 4*stride, 8, stride);
  597. asm volatile(
  598. H263_LOOP_FILTER // 5 3 4 6
  599. : "+m" (temp[0]),
  600. "+m" (temp[1]),
  601. "+m" (temp[2]),
  602. "+m" (temp[3])
  603. : "g" (2*strength), "m"(ff_pb_FC)
  604. );
  605. asm volatile(
  606. "movq %%mm5, %%mm1 \n\t"
  607. "movq %%mm4, %%mm0 \n\t"
  608. "punpcklbw %%mm3, %%mm5 \n\t"
  609. "punpcklbw %%mm6, %%mm4 \n\t"
  610. "punpckhbw %%mm3, %%mm1 \n\t"
  611. "punpckhbw %%mm6, %%mm0 \n\t"
  612. "movq %%mm5, %%mm3 \n\t"
  613. "movq %%mm1, %%mm6 \n\t"
  614. "punpcklwd %%mm4, %%mm5 \n\t"
  615. "punpcklwd %%mm0, %%mm1 \n\t"
  616. "punpckhwd %%mm4, %%mm3 \n\t"
  617. "punpckhwd %%mm0, %%mm6 \n\t"
  618. "movd %%mm5, (%0) \n\t"
  619. "punpckhdq %%mm5, %%mm5 \n\t"
  620. "movd %%mm5, (%0,%2) \n\t"
  621. "movd %%mm3, (%0,%2,2) \n\t"
  622. "punpckhdq %%mm3, %%mm3 \n\t"
  623. "movd %%mm3, (%0,%3) \n\t"
  624. "movd %%mm1, (%1) \n\t"
  625. "punpckhdq %%mm1, %%mm1 \n\t"
  626. "movd %%mm1, (%1,%2) \n\t"
  627. "movd %%mm6, (%1,%2,2) \n\t"
  628. "punpckhdq %%mm6, %%mm6 \n\t"
  629. "movd %%mm6, (%1,%3) \n\t"
  630. :: "r" (src),
  631. "r" (src + 4*stride),
  632. "r" ((long) stride ),
  633. "r" ((long)(3*stride))
  634. );
  635. }
  636. #ifdef CONFIG_ENCODERS
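/* pix_norm1: sum of squared pixel values over a 16x16 block; a scalar sketch:
 *   tmp = 0;
 *   for (i = 0; i < 16; i++)
 *       for (j = 0; j < 16; j++)
 *           tmp += pix[i*line_size + j] * pix[i*line_size + j];
 */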
  637. static int pix_norm1_mmx(uint8_t *pix, int line_size) {
  638. int tmp;
  639. asm volatile (
  640. "movl $16,%%ecx\n"
  641. "pxor %%mm0,%%mm0\n"
  642. "pxor %%mm7,%%mm7\n"
  643. "1:\n"
  644. "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
  645. "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
  646. "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
  647. "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
  648. "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
  649. "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
  650. "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
  651. "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
  652. "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
  653. "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
  654. "pmaddwd %%mm3,%%mm3\n"
  655. "pmaddwd %%mm4,%%mm4\n"
  656. "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
  657. pix2^2+pix3^2+pix6^2+pix7^2) */
  658. "paddd %%mm3,%%mm4\n"
  659. "paddd %%mm2,%%mm7\n"
  660. "add %2, %0\n"
  661. "paddd %%mm4,%%mm7\n"
  662. "dec %%ecx\n"
  663. "jnz 1b\n"
  664. "movq %%mm7,%%mm1\n"
  665. "psrlq $32, %%mm7\n" /* shift hi dword to lo */
  666. "paddd %%mm7,%%mm1\n"
  667. "movd %%mm1,%1\n"
  668. : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
  669. return tmp;
  670. }
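/* sse8 / sse16 (and sse16_sse2 below) compute the sum of squared differences
 * between two blocks of width 8 resp. 16 and height h; a scalar sketch:
 *   tmp = 0;
 *   for (y = 0; y < h; y++)
 *       for (x = 0; x < width; x++) {
 *           int d = pix1[y*line_size + x] - pix2[y*line_size + x];
 *           tmp += d * d;
 *       }
 */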
  671. static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  672. int tmp;
  673. asm volatile (
  674. "movl %4,%%ecx\n"
  675. "shr $1,%%ecx\n"
  676. "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
  677. "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
  678. "1:\n"
  679. "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
  680. "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
  681. "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
  682. "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
  683. /* todo: mm1-mm2, mm3-mm4 */
  684. /* algo: subtract mm1 from mm2 with saturation and vice versa */
  685. /* OR the results to get absolute difference */
  686. "movq %%mm1,%%mm5\n"
  687. "movq %%mm3,%%mm6\n"
  688. "psubusb %%mm2,%%mm1\n"
  689. "psubusb %%mm4,%%mm3\n"
  690. "psubusb %%mm5,%%mm2\n"
  691. "psubusb %%mm6,%%mm4\n"
  692. "por %%mm1,%%mm2\n"
  693. "por %%mm3,%%mm4\n"
  694. /* now convert to 16-bit vectors so we can square them */
  695. "movq %%mm2,%%mm1\n"
  696. "movq %%mm4,%%mm3\n"
  697. "punpckhbw %%mm0,%%mm2\n"
  698. "punpckhbw %%mm0,%%mm4\n"
  699. "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
  700. "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
  701. "pmaddwd %%mm2,%%mm2\n"
  702. "pmaddwd %%mm4,%%mm4\n"
  703. "pmaddwd %%mm1,%%mm1\n"
  704. "pmaddwd %%mm3,%%mm3\n"
  705. "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
  706. "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
  707. "paddd %%mm2,%%mm1\n"
  708. "paddd %%mm4,%%mm3\n"
  709. "paddd %%mm1,%%mm7\n"
  710. "paddd %%mm3,%%mm7\n"
  711. "decl %%ecx\n"
  712. "jnz 1b\n"
  713. "movq %%mm7,%%mm1\n"
  714. "psrlq $32, %%mm7\n" /* shift hi dword to lo */
  715. "paddd %%mm7,%%mm1\n"
  716. "movd %%mm1,%2\n"
  717. : "+r" (pix1), "+r" (pix2), "=r"(tmp)
  718. : "r" ((long)line_size) , "m" (h)
  719. : "%ecx");
  720. return tmp;
  721. }
  722. static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  723. int tmp;
  724. asm volatile (
  725. "movl %4,%%ecx\n"
  726. "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
  727. "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
  728. "1:\n"
  729. "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
  730. "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
  731. "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
  732. "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
  733. /* todo: mm1-mm2, mm3-mm4 */
  734. /* algo: subtract mm1 from mm2 with saturation and vice versa */
  735. /* OR the results to get absolute difference */
  736. "movq %%mm1,%%mm5\n"
  737. "movq %%mm3,%%mm6\n"
  738. "psubusb %%mm2,%%mm1\n"
  739. "psubusb %%mm4,%%mm3\n"
  740. "psubusb %%mm5,%%mm2\n"
  741. "psubusb %%mm6,%%mm4\n"
  742. "por %%mm1,%%mm2\n"
  743. "por %%mm3,%%mm4\n"
  744. /* now convert to 16-bit vectors so we can square them */
  745. "movq %%mm2,%%mm1\n"
  746. "movq %%mm4,%%mm3\n"
  747. "punpckhbw %%mm0,%%mm2\n"
  748. "punpckhbw %%mm0,%%mm4\n"
  749. "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
  750. "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
  751. "pmaddwd %%mm2,%%mm2\n"
  752. "pmaddwd %%mm4,%%mm4\n"
  753. "pmaddwd %%mm1,%%mm1\n"
  754. "pmaddwd %%mm3,%%mm3\n"
  755. "add %3,%0\n"
  756. "add %3,%1\n"
  757. "paddd %%mm2,%%mm1\n"
  758. "paddd %%mm4,%%mm3\n"
  759. "paddd %%mm1,%%mm7\n"
  760. "paddd %%mm3,%%mm7\n"
  761. "decl %%ecx\n"
  762. "jnz 1b\n"
  763. "movq %%mm7,%%mm1\n"
  764. "psrlq $32, %%mm7\n" /* shift hi dword to lo */
  765. "paddd %%mm7,%%mm1\n"
  766. "movd %%mm1,%2\n"
  767. : "+r" (pix1), "+r" (pix2), "=r"(tmp)
  768. : "r" ((long)line_size) , "m" (h)
  769. : "%ecx");
  770. return tmp;
  771. }
  772. static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  773. int tmp;
  774. asm volatile (
  775. "shr $1,%2\n"
  776. "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
  777. "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
  778. "1:\n"
  779. "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
  780. "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
  781. "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
  782. "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
  783. /* todo: mm1-mm2, mm3-mm4 */
  784. /* algo: subtract mm1 from mm2 with saturation and vice versa */
  785. /* OR the results to get absolute difference */
  786. "movdqa %%xmm1,%%xmm5\n"
  787. "movdqa %%xmm3,%%xmm6\n"
  788. "psubusb %%xmm2,%%xmm1\n"
  789. "psubusb %%xmm4,%%xmm3\n"
  790. "psubusb %%xmm5,%%xmm2\n"
  791. "psubusb %%xmm6,%%xmm4\n"
  792. "por %%xmm1,%%xmm2\n"
  793. "por %%xmm3,%%xmm4\n"
  794. /* now convert to 16-bit vectors so we can square them */
  795. "movdqa %%xmm2,%%xmm1\n"
  796. "movdqa %%xmm4,%%xmm3\n"
  797. "punpckhbw %%xmm0,%%xmm2\n"
  798. "punpckhbw %%xmm0,%%xmm4\n"
  799. "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
  800. "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
  801. "pmaddwd %%xmm2,%%xmm2\n"
  802. "pmaddwd %%xmm4,%%xmm4\n"
  803. "pmaddwd %%xmm1,%%xmm1\n"
  804. "pmaddwd %%xmm3,%%xmm3\n"
  805. "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
  806. "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
  807. "paddd %%xmm2,%%xmm1\n"
  808. "paddd %%xmm4,%%xmm3\n"
  809. "paddd %%xmm1,%%xmm7\n"
  810. "paddd %%xmm3,%%xmm7\n"
  811. "decl %2\n"
  812. "jnz 1b\n"
  813. "movdqa %%xmm7,%%xmm1\n"
  814. "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
  815. "paddd %%xmm1,%%xmm7\n"
  816. "movdqa %%xmm7,%%xmm1\n"
  817. "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
  818. "paddd %%xmm1,%%xmm7\n"
  819. "movd %%xmm7,%3\n"
  820. : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
  821. : "r" ((long)line_size));
  822. return tmp;
  823. }
  824. static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
  825. int tmp;
  826. asm volatile (
  827. "movl %3,%%ecx\n"
  828. "pxor %%mm7,%%mm7\n"
  829. "pxor %%mm6,%%mm6\n"
  830. "movq (%0),%%mm0\n"
  831. "movq %%mm0, %%mm1\n"
  832. "psllq $8, %%mm0\n"
  833. "psrlq $8, %%mm1\n"
  834. "psrlq $8, %%mm0\n"
  835. "movq %%mm0, %%mm2\n"
  836. "movq %%mm1, %%mm3\n"
  837. "punpcklbw %%mm7,%%mm0\n"
  838. "punpcklbw %%mm7,%%mm1\n"
  839. "punpckhbw %%mm7,%%mm2\n"
  840. "punpckhbw %%mm7,%%mm3\n"
  841. "psubw %%mm1, %%mm0\n"
  842. "psubw %%mm3, %%mm2\n"
  843. "add %2,%0\n"
  844. "movq (%0),%%mm4\n"
  845. "movq %%mm4, %%mm1\n"
  846. "psllq $8, %%mm4\n"
  847. "psrlq $8, %%mm1\n"
  848. "psrlq $8, %%mm4\n"
  849. "movq %%mm4, %%mm5\n"
  850. "movq %%mm1, %%mm3\n"
  851. "punpcklbw %%mm7,%%mm4\n"
  852. "punpcklbw %%mm7,%%mm1\n"
  853. "punpckhbw %%mm7,%%mm5\n"
  854. "punpckhbw %%mm7,%%mm3\n"
  855. "psubw %%mm1, %%mm4\n"
  856. "psubw %%mm3, %%mm5\n"
  857. "psubw %%mm4, %%mm0\n"
  858. "psubw %%mm5, %%mm2\n"
  859. "pxor %%mm3, %%mm3\n"
  860. "pxor %%mm1, %%mm1\n"
  861. "pcmpgtw %%mm0, %%mm3\n\t"
  862. "pcmpgtw %%mm2, %%mm1\n\t"
  863. "pxor %%mm3, %%mm0\n"
  864. "pxor %%mm1, %%mm2\n"
  865. "psubw %%mm3, %%mm0\n"
  866. "psubw %%mm1, %%mm2\n"
  867. "paddw %%mm0, %%mm2\n"
  868. "paddw %%mm2, %%mm6\n"
  869. "add %2,%0\n"
  870. "1:\n"
  871. "movq (%0),%%mm0\n"
  872. "movq %%mm0, %%mm1\n"
  873. "psllq $8, %%mm0\n"
  874. "psrlq $8, %%mm1\n"
  875. "psrlq $8, %%mm0\n"
  876. "movq %%mm0, %%mm2\n"
  877. "movq %%mm1, %%mm3\n"
  878. "punpcklbw %%mm7,%%mm0\n"
  879. "punpcklbw %%mm7,%%mm1\n"
  880. "punpckhbw %%mm7,%%mm2\n"
  881. "punpckhbw %%mm7,%%mm3\n"
  882. "psubw %%mm1, %%mm0\n"
  883. "psubw %%mm3, %%mm2\n"
  884. "psubw %%mm0, %%mm4\n"
  885. "psubw %%mm2, %%mm5\n"
  886. "pxor %%mm3, %%mm3\n"
  887. "pxor %%mm1, %%mm1\n"
  888. "pcmpgtw %%mm4, %%mm3\n\t"
  889. "pcmpgtw %%mm5, %%mm1\n\t"
  890. "pxor %%mm3, %%mm4\n"
  891. "pxor %%mm1, %%mm5\n"
  892. "psubw %%mm3, %%mm4\n"
  893. "psubw %%mm1, %%mm5\n"
  894. "paddw %%mm4, %%mm5\n"
  895. "paddw %%mm5, %%mm6\n"
  896. "add %2,%0\n"
  897. "movq (%0),%%mm4\n"
  898. "movq %%mm4, %%mm1\n"
  899. "psllq $8, %%mm4\n"
  900. "psrlq $8, %%mm1\n"
  901. "psrlq $8, %%mm4\n"
  902. "movq %%mm4, %%mm5\n"
  903. "movq %%mm1, %%mm3\n"
  904. "punpcklbw %%mm7,%%mm4\n"
  905. "punpcklbw %%mm7,%%mm1\n"
  906. "punpckhbw %%mm7,%%mm5\n"
  907. "punpckhbw %%mm7,%%mm3\n"
  908. "psubw %%mm1, %%mm4\n"
  909. "psubw %%mm3, %%mm5\n"
  910. "psubw %%mm4, %%mm0\n"
  911. "psubw %%mm5, %%mm2\n"
  912. "pxor %%mm3, %%mm3\n"
  913. "pxor %%mm1, %%mm1\n"
  914. "pcmpgtw %%mm0, %%mm3\n\t"
  915. "pcmpgtw %%mm2, %%mm1\n\t"
  916. "pxor %%mm3, %%mm0\n"
  917. "pxor %%mm1, %%mm2\n"
  918. "psubw %%mm3, %%mm0\n"
  919. "psubw %%mm1, %%mm2\n"
  920. "paddw %%mm0, %%mm2\n"
  921. "paddw %%mm2, %%mm6\n"
  922. "add %2,%0\n"
  923. "subl $2, %%ecx\n"
  924. " jnz 1b\n"
  925. "movq %%mm6, %%mm0\n"
  926. "punpcklwd %%mm7,%%mm0\n"
  927. "punpckhwd %%mm7,%%mm6\n"
  928. "paddd %%mm0, %%mm6\n"
  929. "movq %%mm6,%%mm0\n"
  930. "psrlq $32, %%mm6\n"
  931. "paddd %%mm6,%%mm0\n"
  932. "movd %%mm0,%1\n"
  933. : "+r" (pix1), "=r"(tmp)
  934. : "r" ((long)line_size) , "g" (h-2)
  935. : "%ecx");
  936. return tmp;
  937. }
  938. static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
  939. int tmp;
  940. uint8_t * pix= pix1;
  941. asm volatile (
  942. "movl %3,%%ecx\n"
  943. "pxor %%mm7,%%mm7\n"
  944. "pxor %%mm6,%%mm6\n"
  945. "movq (%0),%%mm0\n"
  946. "movq 1(%0),%%mm1\n"
  947. "movq %%mm0, %%mm2\n"
  948. "movq %%mm1, %%mm3\n"
  949. "punpcklbw %%mm7,%%mm0\n"
  950. "punpcklbw %%mm7,%%mm1\n"
  951. "punpckhbw %%mm7,%%mm2\n"
  952. "punpckhbw %%mm7,%%mm3\n"
  953. "psubw %%mm1, %%mm0\n"
  954. "psubw %%mm3, %%mm2\n"
  955. "add %2,%0\n"
  956. "movq (%0),%%mm4\n"
  957. "movq 1(%0),%%mm1\n"
  958. "movq %%mm4, %%mm5\n"
  959. "movq %%mm1, %%mm3\n"
  960. "punpcklbw %%mm7,%%mm4\n"
  961. "punpcklbw %%mm7,%%mm1\n"
  962. "punpckhbw %%mm7,%%mm5\n"
  963. "punpckhbw %%mm7,%%mm3\n"
  964. "psubw %%mm1, %%mm4\n"
  965. "psubw %%mm3, %%mm5\n"
  966. "psubw %%mm4, %%mm0\n"
  967. "psubw %%mm5, %%mm2\n"
  968. "pxor %%mm3, %%mm3\n"
  969. "pxor %%mm1, %%mm1\n"
  970. "pcmpgtw %%mm0, %%mm3\n\t"
  971. "pcmpgtw %%mm2, %%mm1\n\t"
  972. "pxor %%mm3, %%mm0\n"
  973. "pxor %%mm1, %%mm2\n"
  974. "psubw %%mm3, %%mm0\n"
  975. "psubw %%mm1, %%mm2\n"
  976. "paddw %%mm0, %%mm2\n"
  977. "paddw %%mm2, %%mm6\n"
  978. "add %2,%0\n"
  979. "1:\n"
  980. "movq (%0),%%mm0\n"
  981. "movq 1(%0),%%mm1\n"
  982. "movq %%mm0, %%mm2\n"
  983. "movq %%mm1, %%mm3\n"
  984. "punpcklbw %%mm7,%%mm0\n"
  985. "punpcklbw %%mm7,%%mm1\n"
  986. "punpckhbw %%mm7,%%mm2\n"
  987. "punpckhbw %%mm7,%%mm3\n"
  988. "psubw %%mm1, %%mm0\n"
  989. "psubw %%mm3, %%mm2\n"
  990. "psubw %%mm0, %%mm4\n"
  991. "psubw %%mm2, %%mm5\n"
  992. "pxor %%mm3, %%mm3\n"
  993. "pxor %%mm1, %%mm1\n"
  994. "pcmpgtw %%mm4, %%mm3\n\t"
  995. "pcmpgtw %%mm5, %%mm1\n\t"
  996. "pxor %%mm3, %%mm4\n"
  997. "pxor %%mm1, %%mm5\n"
  998. "psubw %%mm3, %%mm4\n"
  999. "psubw %%mm1, %%mm5\n"
  1000. "paddw %%mm4, %%mm5\n"
  1001. "paddw %%mm5, %%mm6\n"
  1002. "add %2,%0\n"
  1003. "movq (%0),%%mm4\n"
  1004. "movq 1(%0),%%mm1\n"
  1005. "movq %%mm4, %%mm5\n"
  1006. "movq %%mm1, %%mm3\n"
  1007. "punpcklbw %%mm7,%%mm4\n"
  1008. "punpcklbw %%mm7,%%mm1\n"
  1009. "punpckhbw %%mm7,%%mm5\n"
  1010. "punpckhbw %%mm7,%%mm3\n"
  1011. "psubw %%mm1, %%mm4\n"
  1012. "psubw %%mm3, %%mm5\n"
  1013. "psubw %%mm4, %%mm0\n"
  1014. "psubw %%mm5, %%mm2\n"
  1015. "pxor %%mm3, %%mm3\n"
  1016. "pxor %%mm1, %%mm1\n"
  1017. "pcmpgtw %%mm0, %%mm3\n\t"
  1018. "pcmpgtw %%mm2, %%mm1\n\t"
  1019. "pxor %%mm3, %%mm0\n"
  1020. "pxor %%mm1, %%mm2\n"
  1021. "psubw %%mm3, %%mm0\n"
  1022. "psubw %%mm1, %%mm2\n"
  1023. "paddw %%mm0, %%mm2\n"
  1024. "paddw %%mm2, %%mm6\n"
  1025. "add %2,%0\n"
  1026. "subl $2, %%ecx\n"
  1027. " jnz 1b\n"
  1028. "movq %%mm6, %%mm0\n"
  1029. "punpcklwd %%mm7,%%mm0\n"
  1030. "punpckhwd %%mm7,%%mm6\n"
  1031. "paddd %%mm0, %%mm6\n"
  1032. "movq %%mm6,%%mm0\n"
  1033. "psrlq $32, %%mm6\n"
  1034. "paddd %%mm6,%%mm0\n"
  1035. "movd %%mm0,%1\n"
  1036. : "+r" (pix1), "=r"(tmp)
  1037. : "r" ((long)line_size) , "g" (h-2)
  1038. : "%ecx");
  1039. return tmp + hf_noise8_mmx(pix+8, line_size, h);
  1040. }
  1041. static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  1042. MpegEncContext *c = p;
  1043. int score1, score2;
  1044. if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
  1045. else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
  1046. score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
  1047. if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
  1048. else return score1 + ABS(score2)*8;
  1049. }
  1050. static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  1051. MpegEncContext *c = p;
  1052. int score1= sse8_mmx(c, pix1, pix2, line_size, h);
  1053. int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
  1054. if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
  1055. else return score1 + ABS(score2)*8;
  1056. }
  1057. static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
  1058. int tmp;
  1059. assert( (((int)pix) & 7) == 0);
  1060. assert((line_size &7) ==0);
  1061. #define SUM(in0, in1, out0, out1) \
  1062. "movq (%0), %%mm2\n"\
  1063. "movq 8(%0), %%mm3\n"\
  1064. "add %2,%0\n"\
  1065. "movq %%mm2, " #out0 "\n"\
  1066. "movq %%mm3, " #out1 "\n"\
  1067. "psubusb " #in0 ", %%mm2\n"\
  1068. "psubusb " #in1 ", %%mm3\n"\
  1069. "psubusb " #out0 ", " #in0 "\n"\
  1070. "psubusb " #out1 ", " #in1 "\n"\
  1071. "por %%mm2, " #in0 "\n"\
  1072. "por %%mm3, " #in1 "\n"\
  1073. "movq " #in0 ", %%mm2\n"\
  1074. "movq " #in1 ", %%mm3\n"\
  1075. "punpcklbw %%mm7, " #in0 "\n"\
  1076. "punpcklbw %%mm7, " #in1 "\n"\
  1077. "punpckhbw %%mm7, %%mm2\n"\
  1078. "punpckhbw %%mm7, %%mm3\n"\
  1079. "paddw " #in1 ", " #in0 "\n"\
  1080. "paddw %%mm3, %%mm2\n"\
  1081. "paddw %%mm2, " #in0 "\n"\
  1082. "paddw " #in0 ", %%mm6\n"
  1083. asm volatile (
  1084. "movl %3,%%ecx\n"
  1085. "pxor %%mm6,%%mm6\n"
  1086. "pxor %%mm7,%%mm7\n"
  1087. "movq (%0),%%mm0\n"
  1088. "movq 8(%0),%%mm1\n"
  1089. "add %2,%0\n"
  1090. "subl $2, %%ecx\n"
  1091. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  1092. "1:\n"
  1093. SUM(%%mm4, %%mm5, %%mm0, %%mm1)
  1094. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  1095. "subl $2, %%ecx\n"
  1096. "jnz 1b\n"
  1097. "movq %%mm6,%%mm0\n"
  1098. "psrlq $32, %%mm6\n"
  1099. "paddw %%mm6,%%mm0\n"
  1100. "movq %%mm0,%%mm6\n"
  1101. "psrlq $16, %%mm0\n"
  1102. "paddw %%mm6,%%mm0\n"
  1103. "movd %%mm0,%1\n"
  1104. : "+r" (pix), "=r"(tmp)
  1105. : "r" ((long)line_size) , "m" (h)
  1106. : "%ecx");
  1107. return tmp & 0xFFFF;
  1108. }
  1109. #undef SUM
  1110. static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
  1111. int tmp;
  1112. assert( (((int)pix) & 7) == 0);
  1113. assert((line_size &7) ==0);
  1114. #define SUM(in0, in1, out0, out1) \
  1115. "movq (%0), " #out0 "\n"\
  1116. "movq 8(%0), " #out1 "\n"\
  1117. "add %2,%0\n"\
  1118. "psadbw " #out0 ", " #in0 "\n"\
  1119. "psadbw " #out1 ", " #in1 "\n"\
  1120. "paddw " #in1 ", " #in0 "\n"\
  1121. "paddw " #in0 ", %%mm6\n"
  1122. asm volatile (
  1123. "movl %3,%%ecx\n"
  1124. "pxor %%mm6,%%mm6\n"
  1125. "pxor %%mm7,%%mm7\n"
  1126. "movq (%0),%%mm0\n"
  1127. "movq 8(%0),%%mm1\n"
  1128. "add %2,%0\n"
  1129. "subl $2, %%ecx\n"
  1130. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  1131. "1:\n"
  1132. SUM(%%mm4, %%mm5, %%mm0, %%mm1)
  1133. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  1134. "subl $2, %%ecx\n"
  1135. "jnz 1b\n"
  1136. "movd %%mm6,%1\n"
  1137. : "+r" (pix), "=r"(tmp)
  1138. : "r" ((long)line_size) , "m" (h)
  1139. : "%ecx");
  1140. return tmp;
  1141. }
  1142. #undef SUM
  1143. static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  1144. int tmp;
  1145. assert( (((int)pix1) & 7) == 0);
  1146. assert( (((int)pix2) & 7) == 0);
  1147. assert((line_size &7) ==0);
  1148. #define SUM(in0, in1, out0, out1) \
  1149. "movq (%0),%%mm2\n"\
  1150. "movq (%1)," #out0 "\n"\
  1151. "movq 8(%0),%%mm3\n"\
  1152. "movq 8(%1)," #out1 "\n"\
  1153. "add %3,%0\n"\
  1154. "add %3,%1\n"\
  1155. "psubb " #out0 ", %%mm2\n"\
  1156. "psubb " #out1 ", %%mm3\n"\
  1157. "pxor %%mm7, %%mm2\n"\
  1158. "pxor %%mm7, %%mm3\n"\
  1159. "movq %%mm2, " #out0 "\n"\
  1160. "movq %%mm3, " #out1 "\n"\
  1161. "psubusb " #in0 ", %%mm2\n"\
  1162. "psubusb " #in1 ", %%mm3\n"\
  1163. "psubusb " #out0 ", " #in0 "\n"\
  1164. "psubusb " #out1 ", " #in1 "\n"\
  1165. "por %%mm2, " #in0 "\n"\
  1166. "por %%mm3, " #in1 "\n"\
  1167. "movq " #in0 ", %%mm2\n"\
  1168. "movq " #in1 ", %%mm3\n"\
  1169. "punpcklbw %%mm7, " #in0 "\n"\
  1170. "punpcklbw %%mm7, " #in1 "\n"\
  1171. "punpckhbw %%mm7, %%mm2\n"\
  1172. "punpckhbw %%mm7, %%mm3\n"\
  1173. "paddw " #in1 ", " #in0 "\n"\
  1174. "paddw %%mm3, %%mm2\n"\
  1175. "paddw %%mm2, " #in0 "\n"\
  1176. "paddw " #in0 ", %%mm6\n"
  1177. asm volatile (
  1178. "movl %4,%%ecx\n"
  1179. "pxor %%mm6,%%mm6\n"
  1180. "pcmpeqw %%mm7,%%mm7\n"
  1181. "psllw $15, %%mm7\n"
  1182. "packsswb %%mm7, %%mm7\n"
  1183. "movq (%0),%%mm0\n"
  1184. "movq (%1),%%mm2\n"
  1185. "movq 8(%0),%%mm1\n"
  1186. "movq 8(%1),%%mm3\n"
  1187. "add %3,%0\n"
  1188. "add %3,%1\n"
  1189. "subl $2, %%ecx\n"
  1190. "psubb %%mm2, %%mm0\n"
  1191. "psubb %%mm3, %%mm1\n"
  1192. "pxor %%mm7, %%mm0\n"
  1193. "pxor %%mm7, %%mm1\n"
  1194. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  1195. "1:\n"
  1196. SUM(%%mm4, %%mm5, %%mm0, %%mm1)
  1197. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  1198. "subl $2, %%ecx\n"
  1199. "jnz 1b\n"
  1200. "movq %%mm6,%%mm0\n"
  1201. "psrlq $32, %%mm6\n"
  1202. "paddw %%mm6,%%mm0\n"
  1203. "movq %%mm0,%%mm6\n"
  1204. "psrlq $16, %%mm0\n"
  1205. "paddw %%mm6,%%mm0\n"
  1206. "movd %%mm0,%2\n"
  1207. : "+r" (pix1), "+r" (pix2), "=r"(tmp)
  1208. : "r" ((long)line_size) , "m" (h)
  1209. : "%ecx");
  1210. return tmp & 0x7FFF;
  1211. }
  1212. #undef SUM
  1213. static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  1214. int tmp;
  1215. assert( (((int)pix1) & 7) == 0);
  1216. assert( (((int)pix2) & 7) == 0);
  1217. assert((line_size &7) ==0);
  1218. #define SUM(in0, in1, out0, out1) \
  1219. "movq (%0)," #out0 "\n"\
  1220. "movq (%1),%%mm2\n"\
  1221. "movq 8(%0)," #out1 "\n"\
  1222. "movq 8(%1),%%mm3\n"\
  1223. "add %3,%0\n"\
  1224. "add %3,%1\n"\
  1225. "psubb %%mm2, " #out0 "\n"\
  1226. "psubb %%mm3, " #out1 "\n"\
  1227. "pxor %%mm7, " #out0 "\n"\
  1228. "pxor %%mm7, " #out1 "\n"\
  1229. "psadbw " #out0 ", " #in0 "\n"\
  1230. "psadbw " #out1 ", " #in1 "\n"\
  1231. "paddw " #in1 ", " #in0 "\n"\
  1232. "paddw " #in0 ", %%mm6\n"
  1233. asm volatile (
  1234. "movl %4,%%ecx\n"
  1235. "pxor %%mm6,%%mm6\n"
  1236. "pcmpeqw %%mm7,%%mm7\n"
  1237. "psllw $15, %%mm7\n"
  1238. "packsswb %%mm7, %%mm7\n"
  1239. "movq (%0),%%mm0\n"
  1240. "movq (%1),%%mm2\n"
  1241. "movq 8(%0),%%mm1\n"
  1242. "movq 8(%1),%%mm3\n"
  1243. "add %3,%0\n"
  1244. "add %3,%1\n"
  1245. "subl $2, %%ecx\n"
  1246. "psubb %%mm2, %%mm0\n"
  1247. "psubb %%mm3, %%mm1\n"
  1248. "pxor %%mm7, %%mm0\n"
  1249. "pxor %%mm7, %%mm1\n"
  1250. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  1251. "1:\n"
  1252. SUM(%%mm4, %%mm5, %%mm0, %%mm1)
  1253. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  1254. "subl $2, %%ecx\n"
  1255. "jnz 1b\n"
  1256. "movd %%mm6,%2\n"
  1257. : "+r" (pix1), "+r" (pix2), "=r"(tmp)
  1258. : "r" ((long)line_size) , "m" (h)
  1259. : "%ecx");
  1260. return tmp;
  1261. }
  1262. #undef SUM
  1263. static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
  1264. long i=0;
  1265. asm volatile(
  1266. "1: \n\t"
  1267. "movq (%2, %0), %%mm0 \n\t"
  1268. "movq (%1, %0), %%mm1 \n\t"
  1269. "psubb %%mm0, %%mm1 \n\t"
  1270. "movq %%mm1, (%3, %0) \n\t"
  1271. "movq 8(%2, %0), %%mm0 \n\t"
  1272. "movq 8(%1, %0), %%mm1 \n\t"
  1273. "psubb %%mm0, %%mm1 \n\t"
  1274. "movq %%mm1, 8(%3, %0) \n\t"
  1275. "add $16, %0 \n\t"
  1276. "cmp %4, %0 \n\t"
  1277. " jb 1b \n\t"
  1278. : "+r" (i)
  1279. : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
  1280. );
  1281. for(; i<w; i++)
  1282. dst[i+0] = src1[i+0]-src2[i+0];
  1283. }
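/* sub_hfyu_median_prediction: HuffYUV-style median prediction; for each byte,
 * with L = src2[i-1], T = src1[i], LT = src1[i-1], the predictor is
 * median(L, T, L + T - LT), computed below by clamping L + T - LT between
 * min(L,T) and max(L,T), and dst[i] = src2[i] - predictor; the first byte and
 * the left/left_top state are handled in C after the asm loop. */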
  1284. static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
  1285. long i=0;
  1286. uint8_t l, lt;
  1287. asm volatile(
  1288. "1: \n\t"
  1289. "movq -1(%1, %0), %%mm0 \n\t" // LT
  1290. "movq (%1, %0), %%mm1 \n\t" // T
  1291. "movq -1(%2, %0), %%mm2 \n\t" // L
  1292. "movq (%2, %0), %%mm3 \n\t" // X
  1293. "movq %%mm2, %%mm4 \n\t" // L
  1294. "psubb %%mm0, %%mm2 \n\t"
  1295. "paddb %%mm1, %%mm2 \n\t" // L + T - LT
  1296. "movq %%mm4, %%mm5 \n\t" // L
  1297. "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
  1298. "pminub %%mm5, %%mm1 \n\t" // min(T, L)
  1299. "pminub %%mm2, %%mm4 \n\t"
  1300. "pmaxub %%mm1, %%mm4 \n\t"
  1301. "psubb %%mm4, %%mm3 \n\t" // dst - pred
  1302. "movq %%mm3, (%3, %0) \n\t"
  1303. "add $8, %0 \n\t"
  1304. "cmp %4, %0 \n\t"
  1305. " jb 1b \n\t"
  1306. : "+r" (i)
  1307. : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
  1308. );
  1309. l= *left;
  1310. lt= *left_top;
  1311. dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
  1312. *left_top= src1[w-1];
  1313. *left = src2[w-1];
  1314. }
  1315. #define LBUTTERFLY2(a1,b1,a2,b2)\
  1316. "paddw " #b1 ", " #a1 " \n\t"\
  1317. "paddw " #b2 ", " #a2 " \n\t"\
  1318. "paddw " #b1 ", " #b1 " \n\t"\
  1319. "paddw " #b2 ", " #b2 " \n\t"\
  1320. "psubw " #a1 ", " #b1 " \n\t"\
  1321. "psubw " #a2 ", " #b2 " \n\t"
  1322. #define HADAMARD48\
  1323. LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
  1324. LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
  1325. LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
  1326. LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
  1327. LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
  1328. LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\
  1329. #define MMABS(a,z)\
  1330. "pxor " #z ", " #z " \n\t"\
  1331. "pcmpgtw " #a ", " #z " \n\t"\
  1332. "pxor " #z ", " #a " \n\t"\
  1333. "psubw " #z ", " #a " \n\t"
  1334. #define MMABS_SUM(a,z, sum)\
  1335. "pxor " #z ", " #z " \n\t"\
  1336. "pcmpgtw " #a ", " #z " \n\t"\
  1337. "pxor " #z ", " #a " \n\t"\
  1338. "psubw " #z ", " #a " \n\t"\
  1339. "paddusw " #a ", " #sum " \n\t"
  1340. #define MMABS_MMX2(a,z)\
  1341. "pxor " #z ", " #z " \n\t"\
  1342. "psubw " #a ", " #z " \n\t"\
  1343. "pmaxsw " #z ", " #a " \n\t"
  1344. #define MMABS_SUM_MMX2(a,z, sum)\
  1345. "pxor " #z ", " #z " \n\t"\
  1346. "psubw " #a ", " #z " \n\t"\
  1347. "pmaxsw " #z ", " #a " \n\t"\
  1348. "paddusw " #a ", " #sum " \n\t"
  1349. #define SBUTTERFLY(a,b,t,n)\
  1350. "movq " #a ", " #t " \n\t" /* abcd */\
  1351. "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
  1352. "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
  1353. #define TRANSPOSE4(a,b,c,d,t)\
  1354. SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
  1355. SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
  1356. SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
  1357. SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
  1358. #define LOAD4(o, a, b, c, d)\
  1359. "movq "#o"(%1), " #a " \n\t"\
  1360. "movq "#o"+16(%1), " #b " \n\t"\
  1361. "movq "#o"+32(%1), " #c " \n\t"\
  1362. "movq "#o"+48(%1), " #d " \n\t"
  1363. #define STORE4(o, a, b, c, d)\
  1364. "movq "#a", "#o"(%1) \n\t"\
  1365. "movq "#b", "#o"+16(%1) \n\t"\
  1366. "movq "#c", "#o"+32(%1) \n\t"\
  1367. "movq "#d", "#o"+48(%1) \n\t"\
  1368. static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
  1369. DECLARE_ALIGNED_8(uint64_t, temp[16]);
  1370. int sum=0;
  1371. assert(h==8);
  1372. diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
  1373. asm volatile(
  1374. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  1375. LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
  1376. HADAMARD48
  1377. "movq %%mm7, 112(%1) \n\t"
  1378. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  1379. STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
  1380. "movq 112(%1), %%mm7 \n\t"
  1381. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  1382. STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
  1383. LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
  1384. LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  1385. HADAMARD48
  1386. "movq %%mm7, 120(%1) \n\t"
  1387. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  1388. STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
  1389. "movq 120(%1), %%mm7 \n\t"
  1390. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  1391. "movq %%mm7, %%mm5 \n\t"//FIXME remove
  1392. "movq %%mm6, %%mm7 \n\t"
  1393. "movq %%mm0, %%mm6 \n\t"
  1394. // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
  1395. LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
  1396. // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  1397. HADAMARD48
  1398. "movq %%mm7, 64(%1) \n\t"
  1399. MMABS(%%mm0, %%mm7)
  1400. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  1401. MMABS_SUM(%%mm2, %%mm7, %%mm0)
  1402. MMABS_SUM(%%mm3, %%mm7, %%mm0)
  1403. MMABS_SUM(%%mm4, %%mm7, %%mm0)
  1404. MMABS_SUM(%%mm5, %%mm7, %%mm0)
  1405. MMABS_SUM(%%mm6, %%mm7, %%mm0)
  1406. "movq 64(%1), %%mm1 \n\t"
  1407. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  1408. "movq %%mm0, 64(%1) \n\t"
  1409. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  1410. LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
  1411. HADAMARD48
  1412. "movq %%mm7, (%1) \n\t"
  1413. MMABS(%%mm0, %%mm7)
  1414. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  1415. MMABS_SUM(%%mm2, %%mm7, %%mm0)
  1416. MMABS_SUM(%%mm3, %%mm7, %%mm0)
  1417. MMABS_SUM(%%mm4, %%mm7, %%mm0)
  1418. MMABS_SUM(%%mm5, %%mm7, %%mm0)
  1419. MMABS_SUM(%%mm6, %%mm7, %%mm0)
  1420. "movq (%1), %%mm1 \n\t"
  1421. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  1422. "movq 64(%1), %%mm1 \n\t"
  1423. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  1424. "movq %%mm0, %%mm1 \n\t"
  1425. "psrlq $32, %%mm0 \n\t"
  1426. "paddusw %%mm1, %%mm0 \n\t"
  1427. "movq %%mm0, %%mm1 \n\t"
  1428. "psrlq $16, %%mm0 \n\t"
  1429. "paddusw %%mm1, %%mm0 \n\t"
  1430. "movd %%mm0, %0 \n\t"
  1431. : "=r" (sum)
  1432. : "r"(temp)
  1433. );
  1434. return sum&0xFFFF;
  1435. }
  1436. static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
  1437. DECLARE_ALIGNED_8(uint64_t, temp[16]);
  1438. int sum=0;
  1439. assert(h==8);
  1440. diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
  1441. asm volatile(
  1442. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  1443. LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
  1444. HADAMARD48
  1445. "movq %%mm7, 112(%1) \n\t"
  1446. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  1447. STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
  1448. "movq 112(%1), %%mm7 \n\t"
  1449. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  1450. STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
  1451. LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
  1452. LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  1453. HADAMARD48
  1454. "movq %%mm7, 120(%1) \n\t"
  1455. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  1456. STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
  1457. "movq 120(%1), %%mm7 \n\t"
  1458. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  1459. "movq %%mm7, %%mm5 \n\t"//FIXME remove
  1460. "movq %%mm6, %%mm7 \n\t"
  1461. "movq %%mm0, %%mm6 \n\t"
  1462. // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
  1463. LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
  1464. // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  1465. HADAMARD48
  1466. "movq %%mm7, 64(%1) \n\t"
  1467. MMABS_MMX2(%%mm0, %%mm7)
  1468. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  1469. MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
  1470. MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
  1471. MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
  1472. MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
  1473. MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
  1474. "movq 64(%1), %%mm1 \n\t"
  1475. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  1476. "movq %%mm0, 64(%1) \n\t"
  1477. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  1478. LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
  1479. HADAMARD48
  1480. "movq %%mm7, (%1) \n\t"
  1481. MMABS_MMX2(%%mm0, %%mm7)
  1482. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  1483. MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
  1484. MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
  1485. MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
  1486. MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
  1487. MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
  1488. "movq (%1), %%mm1 \n\t"
  1489. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  1490. "movq 64(%1), %%mm1 \n\t"
  1491. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  1492. "pshufw $0x0E, %%mm0, %%mm1 \n\t"
  1493. "paddusw %%mm1, %%mm0 \n\t"
  1494. "pshufw $0x01, %%mm0, %%mm1 \n\t"
  1495. "paddusw %%mm1, %%mm0 \n\t"
  1496. "movd %%mm0, %0 \n\t"
  1497. : "=r" (sum)
  1498. : "r"(temp)
  1499. );
  1500. return sum&0xFFFF;
  1501. }
  1502. WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
  1503. WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
  1504. #endif //CONFIG_ENCODERS
  1505. #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
  1506. #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
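/* QPEL_V_LOW and the lowpass macros below evaluate the MPEG-4 quarter-pel
 * interpolation filter: out = (20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5,
 * where x1..x4 are sums of symmetric tap pairs; see the scalar fallback in
 * the *_3dnow h_lowpass function further down for the per-pixel form. */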
  1507. #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
  1508. "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
  1509. "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
  1510. "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
  1511. "movq "#in7", " #m3 " \n\t" /* d */\
  1512. "movq "#in0", %%mm5 \n\t" /* D */\
  1513. "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
  1514. "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
  1515. "movq "#in1", %%mm5 \n\t" /* C */\
  1516. "movq "#in2", %%mm6 \n\t" /* B */\
  1517. "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
  1518. "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
  1519. "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
  1520. "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
  1521. "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
  1522. "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
  1523. "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
  1524. "psraw $5, %%mm5 \n\t"\
  1525. "packuswb %%mm5, %%mm5 \n\t"\
  1526. OP(%%mm5, out, %%mm7, d)
  1527. #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
  1528. static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1529. uint64_t temp;\
  1530. \
  1531. asm volatile(\
  1532. "pxor %%mm7, %%mm7 \n\t"\
  1533. "1: \n\t"\
  1534. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  1535. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  1536. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  1537. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  1538. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  1539. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  1540. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  1541. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  1542. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  1543. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  1544. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  1545. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  1546. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  1547. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  1548. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  1549. "paddw %%mm3, %%mm5 \n\t" /* b */\
  1550. "paddw %%mm2, %%mm6 \n\t" /* c */\
  1551. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  1552. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  1553. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  1554. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  1555. "paddw %%mm4, %%mm0 \n\t" /* a */\
  1556. "paddw %%mm1, %%mm5 \n\t" /* d */\
  1557. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  1558. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  1559. "paddw %6, %%mm6 \n\t"\
  1560. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  1561. "psraw $5, %%mm0 \n\t"\
  1562. "movq %%mm0, %5 \n\t"\
  1563. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  1564. \
  1565. "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
  1566. "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
  1567. "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
  1568. "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
  1569. "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
  1570. "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
  1571. "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
  1572. "paddw %%mm0, %%mm2 \n\t" /* b */\
  1573. "paddw %%mm5, %%mm3 \n\t" /* c */\
  1574. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  1575. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  1576. "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
  1577. "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
  1578. "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
  1579. "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
  1580. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  1581. "paddw %%mm2, %%mm1 \n\t" /* a */\
  1582. "paddw %%mm6, %%mm4 \n\t" /* d */\
  1583. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  1584. "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
  1585. "paddw %6, %%mm1 \n\t"\
  1586. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
  1587. "psraw $5, %%mm3 \n\t"\
  1588. "movq %5, %%mm1 \n\t"\
  1589. "packuswb %%mm3, %%mm1 \n\t"\
  1590. OP_MMX2(%%mm1, (%1),%%mm4, q)\
  1591. /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
  1592. \
  1593. "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
  1594. "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
  1595. "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
  1596. "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
  1597. "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
  1598. "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
  1599. "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
  1600. "paddw %%mm1, %%mm5 \n\t" /* b */\
  1601. "paddw %%mm4, %%mm0 \n\t" /* c */\
  1602. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  1603. "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
  1604. "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
  1605. "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
  1606. "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
  1607. "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
  1608. "paddw %%mm3, %%mm2 \n\t" /* d */\
  1609. "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
  1610. "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
  1611. "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
  1612. "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
  1613. "paddw %%mm2, %%mm6 \n\t" /* a */\
  1614. "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
  1615. "paddw %6, %%mm0 \n\t"\
  1616. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  1617. "psraw $5, %%mm0 \n\t"\
  1618. /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
  1619. \
  1620. "paddw %%mm5, %%mm3 \n\t" /* a */\
  1621. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
  1622. "paddw %%mm4, %%mm6 \n\t" /* b */\
  1623. "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
  1624. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
  1625. "paddw %%mm1, %%mm4 \n\t" /* c */\
  1626. "paddw %%mm2, %%mm5 \n\t" /* d */\
  1627. "paddw %%mm6, %%mm6 \n\t" /* 2b */\
  1628. "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
  1629. "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
  1630. "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
  1631. "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
  1632. "paddw %6, %%mm4 \n\t"\
  1633. "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
  1634. "psraw $5, %%mm4 \n\t"\
  1635. "packuswb %%mm4, %%mm0 \n\t"\
  1636. OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
  1637. \
  1638. "add %3, %0 \n\t"\
  1639. "add %4, %1 \n\t"\
  1640. "decl %2 \n\t"\
  1641. " jnz 1b \n\t"\
  1642. : "+a"(src), "+c"(dst), "+m"(h)\
  1643. : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  1644. : "memory"\
  1645. );\
  1646. }\
  1647. \
  1648. static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1649. int i;\
  1650. int16_t temp[16];\
  1651. /* quick HACK, XXX FIXME MUST be optimized */\
  1652. for(i=0; i<h; i++)\
  1653. {\
  1654. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  1655. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  1656. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  1657. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  1658. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  1659. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
  1660. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
  1661. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
  1662. temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
  1663. temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
  1664. temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
  1665. temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
  1666. temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
  1667. temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
  1668. temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
  1669. temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
  1670. asm volatile(\
  1671. "movq (%0), %%mm0 \n\t"\
  1672. "movq 8(%0), %%mm1 \n\t"\
  1673. "paddw %2, %%mm0 \n\t"\
  1674. "paddw %2, %%mm1 \n\t"\
  1675. "psraw $5, %%mm0 \n\t"\
  1676. "psraw $5, %%mm1 \n\t"\
  1677. "packuswb %%mm1, %%mm0 \n\t"\
  1678. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  1679. "movq 16(%0), %%mm0 \n\t"\
  1680. "movq 24(%0), %%mm1 \n\t"\
  1681. "paddw %2, %%mm0 \n\t"\
  1682. "paddw %2, %%mm1 \n\t"\
  1683. "psraw $5, %%mm0 \n\t"\
  1684. "psraw $5, %%mm1 \n\t"\
  1685. "packuswb %%mm1, %%mm0 \n\t"\
  1686. OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
  1687. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  1688. : "memory"\
  1689. );\
  1690. dst+=dstStride;\
  1691. src+=srcStride;\
  1692. }\
  1693. }\
  1694. \
  1695. static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1696. uint64_t temp;\
  1697. \
  1698. asm volatile(\
  1699. "pxor %%mm7, %%mm7 \n\t"\
  1700. "1: \n\t"\
  1701. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  1702. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  1703. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  1704. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  1705. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  1706. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  1707. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  1708. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  1709. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  1710. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  1711. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  1712. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  1713. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  1714. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  1715. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  1716. "paddw %%mm3, %%mm5 \n\t" /* b */\
  1717. "paddw %%mm2, %%mm6 \n\t" /* c */\
  1718. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  1719. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  1720. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  1721. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  1722. "paddw %%mm4, %%mm0 \n\t" /* a */\
  1723. "paddw %%mm1, %%mm5 \n\t" /* d */\
  1724. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  1725. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  1726. "paddw %6, %%mm6 \n\t"\
  1727. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  1728. "psraw $5, %%mm0 \n\t"\
  1729. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  1730. \
  1731. "movd 5(%0), %%mm5 \n\t" /* FGHI */\
  1732. "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
  1733. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
  1734. "paddw %%mm5, %%mm1 \n\t" /* a */\
  1735. "paddw %%mm6, %%mm2 \n\t" /* b */\
  1736. "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
  1737. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
  1738. "paddw %%mm6, %%mm3 \n\t" /* c */\
  1739. "paddw %%mm5, %%mm4 \n\t" /* d */\
  1740. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  1741. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  1742. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  1743. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  1744. "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
  1745. "paddw %6, %%mm1 \n\t"\
  1746. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
  1747. "psraw $5, %%mm3 \n\t"\
  1748. "packuswb %%mm3, %%mm0 \n\t"\
  1749. OP_MMX2(%%mm0, (%1), %%mm4, q)\
  1750. \
  1751. "add %3, %0 \n\t"\
  1752. "add %4, %1 \n\t"\
  1753. "decl %2 \n\t"\
  1754. " jnz 1b \n\t"\
  1755. : "+a"(src), "+c"(dst), "+m"(h)\
  1756. : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  1757. : "memory"\
  1758. );\
  1759. }\
  1760. \
  1761. static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1762. int i;\
  1763. int16_t temp[8];\
  1764. /* quick HACK, XXX FIXME MUST be optimized */\
  1765. for(i=0; i<h; i++)\
  1766. {\
  1767. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  1768. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  1769. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  1770. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  1771. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  1772. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
  1773. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
  1774. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
  1775. asm volatile(\
  1776. "movq (%0), %%mm0 \n\t"\
  1777. "movq 8(%0), %%mm1 \n\t"\
  1778. "paddw %2, %%mm0 \n\t"\
  1779. "paddw %2, %%mm1 \n\t"\
  1780. "psraw $5, %%mm0 \n\t"\
  1781. "psraw $5, %%mm1 \n\t"\
  1782. "packuswb %%mm1, %%mm0 \n\t"\
  1783. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  1784. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  1785. :"memory"\
  1786. );\
  1787. dst+=dstStride;\
  1788. src+=srcStride;\
  1789. }\
  1790. }
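/*
 * QPEL_BASE expands to the horizontal lowpass filters: the *_mmx2 versions do
 * the whole 8- or 16-pixel row with pshufw shuffles, while the *_3dnow
 * versions compute the filter taps in scalar C into a temp row and only use
 * MMX to add ROUNDER, shift right by 5 and pack the result. The last few taps
 * of each row fold indices past src[16] / src[8] back inside the block.
 */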
  1791. #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
  1792. \
  1793. static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  1794. uint64_t temp[17*4];\
  1795. uint64_t *temp_ptr= temp;\
  1796. int count= 17;\
  1797. \
  1798. /*FIXME unroll */\
  1799. asm volatile(\
  1800. "pxor %%mm7, %%mm7 \n\t"\
  1801. "1: \n\t"\
  1802. "movq (%0), %%mm0 \n\t"\
  1803. "movq (%0), %%mm1 \n\t"\
  1804. "movq 8(%0), %%mm2 \n\t"\
  1805. "movq 8(%0), %%mm3 \n\t"\
  1806. "punpcklbw %%mm7, %%mm0 \n\t"\
  1807. "punpckhbw %%mm7, %%mm1 \n\t"\
  1808. "punpcklbw %%mm7, %%mm2 \n\t"\
  1809. "punpckhbw %%mm7, %%mm3 \n\t"\
  1810. "movq %%mm0, (%1) \n\t"\
  1811. "movq %%mm1, 17*8(%1) \n\t"\
  1812. "movq %%mm2, 2*17*8(%1) \n\t"\
  1813. "movq %%mm3, 3*17*8(%1) \n\t"\
  1814. "add $8, %1 \n\t"\
  1815. "add %3, %0 \n\t"\
  1816. "decl %2 \n\t"\
  1817. " jnz 1b \n\t"\
  1818. : "+r" (src), "+r" (temp_ptr), "+r"(count)\
  1819. : "r" ((long)srcStride)\
  1820. : "memory"\
  1821. );\
  1822. \
  1823. temp_ptr= temp;\
  1824. count=4;\
  1825. \
  1826. /*FIXME reorder for speed */\
  1827. asm volatile(\
  1828. /*"pxor %%mm7, %%mm7 \n\t"*/\
  1829. "1: \n\t"\
  1830. "movq (%0), %%mm0 \n\t"\
  1831. "movq 8(%0), %%mm1 \n\t"\
  1832. "movq 16(%0), %%mm2 \n\t"\
  1833. "movq 24(%0), %%mm3 \n\t"\
  1834. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
  1835. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
  1836. "add %4, %1 \n\t"\
  1837. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
  1838. \
  1839. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
  1840. "add %4, %1 \n\t"\
  1841. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
  1842. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
  1843. "add %4, %1 \n\t"\
  1844. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
  1845. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
  1846. "add %4, %1 \n\t"\
  1847. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
  1848. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
  1849. "add %4, %1 \n\t"\
  1850. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
  1851. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
  1852. "add %4, %1 \n\t"\
  1853. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
  1854. \
  1855. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
  1856. "add %4, %1 \n\t" \
  1857. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
  1858. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
  1859. \
  1860. "add $136, %0 \n\t"\
  1861. "add %6, %1 \n\t"\
  1862. "decl %2 \n\t"\
  1863. " jnz 1b \n\t"\
  1864. \
  1865. : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
  1866. : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
  1867. :"memory"\
  1868. );\
  1869. }\
  1870. \
  1871. static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  1872. uint64_t temp[9*2];\
  1873. uint64_t *temp_ptr= temp;\
  1874. int count= 9;\
  1875. \
  1876. /*FIXME unroll */\
  1877. asm volatile(\
  1878. "pxor %%mm7, %%mm7 \n\t"\
  1879. "1: \n\t"\
  1880. "movq (%0), %%mm0 \n\t"\
  1881. "movq (%0), %%mm1 \n\t"\
  1882. "punpcklbw %%mm7, %%mm0 \n\t"\
  1883. "punpckhbw %%mm7, %%mm1 \n\t"\
  1884. "movq %%mm0, (%1) \n\t"\
  1885. "movq %%mm1, 9*8(%1) \n\t"\
  1886. "add $8, %1 \n\t"\
  1887. "add %3, %0 \n\t"\
  1888. "decl %2 \n\t"\
  1889. " jnz 1b \n\t"\
  1890. : "+r" (src), "+r" (temp_ptr), "+r"(count)\
  1891. : "r" ((long)srcStride)\
  1892. : "memory"\
  1893. );\
  1894. \
  1895. temp_ptr= temp;\
  1896. count=2;\
  1897. \
  1898. /*FIXME reorder for speed */\
  1899. asm volatile(\
  1900. /*"pxor %%mm7, %%mm7 \n\t"*/\
  1901. "1: \n\t"\
  1902. "movq (%0), %%mm0 \n\t"\
  1903. "movq 8(%0), %%mm1 \n\t"\
  1904. "movq 16(%0), %%mm2 \n\t"\
  1905. "movq 24(%0), %%mm3 \n\t"\
  1906. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
  1907. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
  1908. "add %4, %1 \n\t"\
  1909. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
  1910. \
  1911. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
  1912. "add %4, %1 \n\t"\
  1913. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
  1914. \
  1915. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
  1916. "add %4, %1 \n\t"\
  1917. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
  1918. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
  1919. \
  1920. "add $72, %0 \n\t"\
  1921. "add %6, %1 \n\t"\
  1922. "decl %2 \n\t"\
  1923. " jnz 1b \n\t"\
  1924. \
  1925. : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
  1926. : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
  1927. : "memory"\
  1928. );\
  1929. }\
  1930. \
  1931. static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
  1932. OPNAME ## pixels8_mmx(dst, src, stride, 8);\
  1933. }\
  1934. \
  1935. static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1936. uint64_t temp[8];\
  1937. uint8_t * const half= (uint8_t*)temp;\
  1938. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
  1939. OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
  1940. }\
  1941. \
  1942. static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1943. OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
  1944. }\
  1945. \
  1946. static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1947. uint64_t temp[8];\
  1948. uint8_t * const half= (uint8_t*)temp;\
  1949. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
  1950. OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
  1951. }\
  1952. \
  1953. static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1954. uint64_t temp[8];\
  1955. uint8_t * const half= (uint8_t*)temp;\
  1956. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
  1957. OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
  1958. }\
  1959. \
  1960. static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1961. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
  1962. }\
  1963. \
  1964. static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1965. uint64_t temp[8];\
  1966. uint8_t * const half= (uint8_t*)temp;\
  1967. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
  1968. OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
  1969. }\
  1970. static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1971. uint64_t half[8 + 9];\
  1972. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1973. uint8_t * const halfHV= ((uint8_t*)half);\
  1974. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1975. put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
  1976. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1977. OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
  1978. }\
  1979. static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1980. uint64_t half[8 + 9];\
  1981. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1982. uint8_t * const halfHV= ((uint8_t*)half);\
  1983. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1984. put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
  1985. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1986. OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
  1987. }\
  1988. static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1989. uint64_t half[8 + 9];\
  1990. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1991. uint8_t * const halfHV= ((uint8_t*)half);\
  1992. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1993. put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
  1994. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1995. OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
  1996. }\
  1997. static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1998. uint64_t half[8 + 9];\
  1999. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  2000. uint8_t * const halfHV= ((uint8_t*)half);\
  2001. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  2002. put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
  2003. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  2004. OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
  2005. }\
  2006. static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2007. uint64_t half[8 + 9];\
  2008. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  2009. uint8_t * const halfHV= ((uint8_t*)half);\
  2010. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  2011. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  2012. OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
  2013. }\
  2014. static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2015. uint64_t half[8 + 9];\
  2016. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  2017. uint8_t * const halfHV= ((uint8_t*)half);\
  2018. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  2019. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  2020. OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
  2021. }\
  2022. static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2023. uint64_t half[8 + 9];\
  2024. uint8_t * const halfH= ((uint8_t*)half);\
  2025. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  2026. put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
  2027. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  2028. }\
  2029. static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2030. uint64_t half[8 + 9];\
  2031. uint8_t * const halfH= ((uint8_t*)half);\
  2032. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  2033. put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
  2034. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  2035. }\
  2036. static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2037. uint64_t half[9];\
  2038. uint8_t * const halfH= ((uint8_t*)half);\
  2039. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  2040. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  2041. }\
  2042. static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
  2043. OPNAME ## pixels16_mmx(dst, src, stride, 16);\
  2044. }\
  2045. \
  2046. static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2047. uint64_t temp[32];\
  2048. uint8_t * const half= (uint8_t*)temp;\
  2049. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
  2050. OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
  2051. }\
  2052. \
  2053. static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2054. OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
  2055. }\
  2056. \
  2057. static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2058. uint64_t temp[32];\
  2059. uint8_t * const half= (uint8_t*)temp;\
  2060. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
  2061. OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
  2062. }\
  2063. \
  2064. static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2065. uint64_t temp[32];\
  2066. uint8_t * const half= (uint8_t*)temp;\
  2067. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
  2068. OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
  2069. }\
  2070. \
  2071. static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2072. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
  2073. }\
  2074. \
  2075. static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2076. uint64_t temp[32];\
  2077. uint8_t * const half= (uint8_t*)temp;\
  2078. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
  2079. OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
  2080. }\
  2081. static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2082. uint64_t half[16*2 + 17*2];\
  2083. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  2084. uint8_t * const halfHV= ((uint8_t*)half);\
  2085. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  2086. put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
  2087. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  2088. OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
  2089. }\
  2090. static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2091. uint64_t half[16*2 + 17*2];\
  2092. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  2093. uint8_t * const halfHV= ((uint8_t*)half);\
  2094. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  2095. put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
  2096. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  2097. OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
  2098. }\
  2099. static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2100. uint64_t half[16*2 + 17*2];\
  2101. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  2102. uint8_t * const halfHV= ((uint8_t*)half);\
  2103. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  2104. put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
  2105. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  2106. OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
  2107. }\
  2108. static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2109. uint64_t half[16*2 + 17*2];\
  2110. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  2111. uint8_t * const halfHV= ((uint8_t*)half);\
  2112. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  2113. put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
  2114. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  2115. OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
  2116. }\
  2117. static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2118. uint64_t half[16*2 + 17*2];\
  2119. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  2120. uint8_t * const halfHV= ((uint8_t*)half);\
  2121. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  2122. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  2123. OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
  2124. }\
  2125. static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2126. uint64_t half[16*2 + 17*2];\
  2127. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  2128. uint8_t * const halfHV= ((uint8_t*)half);\
  2129. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  2130. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  2131. OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
  2132. }\
  2133. static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2134. uint64_t half[17*2];\
  2135. uint8_t * const halfH= ((uint8_t*)half);\
  2136. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  2137. put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
  2138. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  2139. }\
  2140. static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2141. uint64_t half[17*2];\
  2142. uint8_t * const halfH= ((uint8_t*)half);\
  2143. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  2144. put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
  2145. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  2146. }\
  2147. static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  2148. uint64_t half[17*2];\
  2149. uint8_t * const halfH= ((uint8_t*)half);\
  2150. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  2151. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  2152. }
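/*
 * QPEL_OP builds on the above: the *_v_lowpass functions first unpack 17 (or
 * 9) source rows into a 16-bit temp buffer, then run QPEL_V_LOW down the
 * columns, producing two output rows per step. The qpelN_mcXY entry points
 * cover the sixteen quarter-pel positions, X/Y being the horizontal/vertical
 * quarter-pel offset: mc00 is a plain copy, mc20/mc02 the pure half-pel
 * filters, and the remaining cases combine or average the filtered planes
 * via the pixelsN_l2 helpers.
 */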
  2153. #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
  2154. #define AVG_3DNOW_OP(a,b,temp, size) \
  2155. "mov" #size " " #b ", " #temp " \n\t"\
  2156. "pavgusb " #temp ", " #a " \n\t"\
  2157. "mov" #size " " #a ", " #b " \n\t"
  2158. #define AVG_MMX2_OP(a,b,temp, size) \
  2159. "mov" #size " " #b ", " #temp " \n\t"\
  2160. "pavgb " #temp ", " #a " \n\t"\
  2161. "mov" #size " " #a ", " #b " \n\t"
  2162. QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
  2163. QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
  2164. QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
  2165. QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
  2166. QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
  2167. QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
  2168. QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
  2169. QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
  2170. QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
  2171. #if 0
  2172. static void just_return() { return; }
  2173. #endif
  2174. #define SET_QPEL_FUNC(postfix1, postfix2) \
  2175. c->put_ ## postfix1 = put_ ## postfix2;\
  2176. c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
  2177. c->avg_ ## postfix1 = avg_ ## postfix2;
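/* Usage sketch: SET_QPEL_FUNC(qpel_pixels_tab[0][0], qpel16_mc00_mmx2) wires
 * the put_, put_no_rnd_ and avg_ variants of one table slot at once, as done
 * in dsputil_init_mmx() below. */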
  2178. static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
  2179. long i=0;
  2180. assert(ABS(scale) < 256);
  2181. scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
  2182. asm volatile(
  2183. "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
  2184. "psrlw $15, %%mm6 \n\t" // 1w
  2185. "pxor %%mm7, %%mm7 \n\t"
  2186. "movd %4, %%mm5 \n\t"
  2187. "punpcklwd %%mm5, %%mm5 \n\t"
  2188. "punpcklwd %%mm5, %%mm5 \n\t"
  2189. "1: \n\t"
  2190. "movq (%1, %0), %%mm0 \n\t"
  2191. "movq 8(%1, %0), %%mm1 \n\t"
  2192. "pmulhw %%mm5, %%mm0 \n\t"
  2193. "pmulhw %%mm5, %%mm1 \n\t"
  2194. "paddw %%mm6, %%mm0 \n\t"
  2195. "paddw %%mm6, %%mm1 \n\t"
  2196. "psraw $1, %%mm0 \n\t"
  2197. "psraw $1, %%mm1 \n\t"
  2198. "paddw (%2, %0), %%mm0 \n\t"
  2199. "paddw 8(%2, %0), %%mm1 \n\t"
  2200. "psraw $6, %%mm0 \n\t"
  2201. "psraw $6, %%mm1 \n\t"
  2202. "pmullw (%3, %0), %%mm0 \n\t"
  2203. "pmullw 8(%3, %0), %%mm1 \n\t"
  2204. "pmaddwd %%mm0, %%mm0 \n\t"
  2205. "pmaddwd %%mm1, %%mm1 \n\t"
  2206. "paddd %%mm1, %%mm0 \n\t"
  2207. "psrld $4, %%mm0 \n\t"
  2208. "paddd %%mm0, %%mm7 \n\t"
  2209. "add $16, %0 \n\t"
  2210. "cmp $128, %0 \n\t" //FIXME optimize & bench
  2211. " jb 1b \n\t"
  2212. "movq %%mm7, %%mm6 \n\t"
  2213. "psrlq $32, %%mm7 \n\t"
  2214. "paddd %%mm6, %%mm7 \n\t"
  2215. "psrld $2, %%mm7 \n\t"
  2216. "movd %%mm7, %0 \n\t"
  2217. : "+r" (i)
  2218. : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
  2219. );
  2220. return i;
  2221. }
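/*
 * Rough scalar picture of the loop above (sketch only, ignoring 16-bit
 * saturation details): for each of the 64 coefficients it forms
 * rem[i] + ((basis[i]*scale + rounding) >> (BASIS_SHIFT - RECON_SHIFT)) via
 * pmulhw with the pre-shifted scale, shifts right by 6, multiplies by
 * weight[i] and accumulates the squares with pmaddwd, applying the >>4 and
 * final >>2 seen above; the sum is returned as a weighted squared-error
 * score.
 */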
  2222. static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
  2223. long i=0;
  2224. if(ABS(scale) < 256){
  2225. scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
  2226. asm volatile(
  2227. "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
  2228. "psrlw $15, %%mm6 \n\t" // 1w
  2229. "movd %3, %%mm5 \n\t"
  2230. "punpcklwd %%mm5, %%mm5 \n\t"
  2231. "punpcklwd %%mm5, %%mm5 \n\t"
  2232. "1: \n\t"
  2233. "movq (%1, %0), %%mm0 \n\t"
  2234. "movq 8(%1, %0), %%mm1 \n\t"
  2235. "pmulhw %%mm5, %%mm0 \n\t"
  2236. "pmulhw %%mm5, %%mm1 \n\t"
  2237. "paddw %%mm6, %%mm0 \n\t"
  2238. "paddw %%mm6, %%mm1 \n\t"
  2239. "psraw $1, %%mm0 \n\t"
  2240. "psraw $1, %%mm1 \n\t"
  2241. "paddw (%2, %0), %%mm0 \n\t"
  2242. "paddw 8(%2, %0), %%mm1 \n\t"
  2243. "movq %%mm0, (%2, %0) \n\t"
  2244. "movq %%mm1, 8(%2, %0) \n\t"
  2245. "add $16, %0 \n\t"
  2246. "cmp $128, %0 \n\t" //FIXME optimize & bench
  2247. " jb 1b \n\t"
  2248. : "+r" (i)
  2249. : "r"(basis), "r"(rem), "g"(scale)
  2250. );
  2251. }else{
  2252. for(i=0; i<8*8; i++){
  2253. rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
  2254. }
  2255. }
  2256. }
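/*
 * The MMX path matches the scalar else-branch: scale is pre-shifted by
 * 16+1-BASIS_SHIFT+RECON_SHIFT so that pmulhw (a >>16) followed by the +1
 * and >>1 reproduces (basis[i]*scale + (1<<(BASIS_SHIFT-RECON_SHIFT-1))) >>
 * (BASIS_SHIFT-RECON_SHIFT) with rounding, which is then added to rem[i].
 * The |scale| >= 256 case stays in C, presumably because the pre-shifted
 * scale would no longer fit in the 16-bit word broadcast into %%mm5.
 */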
  2257. #include "h264dsp_mmx.c"
  2258. /* external functions, from idct_mmx.c */
  2259. void ff_mmx_idct(DCTELEM *block);
  2260. void ff_mmxext_idct(DCTELEM *block);
  2261. void ff_vp3_idct_sse2(int16_t *input_data);
  2262. void ff_vp3_idct_mmx(int16_t *data);
  2263. void ff_vp3_dsp_init_mmx(void);
2264. /* XXX: these functions should be removed as soon as all IDCTs are
2265. converted */
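/*
 * Each wrapper below runs the named IDCT in place on the coefficient block
 * and then writes it out: the *_put variants store the result clamped to
 * 0..255, the *_add variants add it to the existing pixels, and the VP3
 * variants go through put_signed_pixels_clamped_mmx, which re-biases the
 * signed transform output before clamping.
 */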
  2266. static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
  2267. {
  2268. ff_mmx_idct (block);
  2269. put_pixels_clamped_mmx(block, dest, line_size);
  2270. }
  2271. static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
  2272. {
  2273. ff_mmx_idct (block);
  2274. add_pixels_clamped_mmx(block, dest, line_size);
  2275. }
  2276. static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
  2277. {
  2278. ff_mmxext_idct (block);
  2279. put_pixels_clamped_mmx(block, dest, line_size);
  2280. }
  2281. static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
  2282. {
  2283. ff_mmxext_idct (block);
  2284. add_pixels_clamped_mmx(block, dest, line_size);
  2285. }
  2286. static void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
  2287. {
  2288. ff_vp3_idct_sse2(block);
  2289. put_signed_pixels_clamped_mmx(block, dest, line_size);
  2290. }
  2291. static void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
  2292. {
  2293. ff_vp3_idct_sse2(block);
  2294. add_pixels_clamped_mmx(block, dest, line_size);
  2295. }
  2296. static void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
  2297. {
  2298. ff_vp3_idct_mmx(block);
  2299. put_signed_pixels_clamped_mmx(block, dest, line_size);
  2300. }
  2301. static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
  2302. {
  2303. ff_vp3_idct_mmx(block);
  2304. add_pixels_clamped_mmx(block, dest, line_size);
  2305. }
  2306. #ifdef CONFIG_GPL
  2307. static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
  2308. {
  2309. ff_idct_xvid_mmx (block);
  2310. put_pixels_clamped_mmx(block, dest, line_size);
  2311. }
  2312. static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
  2313. {
  2314. ff_idct_xvid_mmx (block);
  2315. add_pixels_clamped_mmx(block, dest, line_size);
  2316. }
  2317. static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
  2318. {
  2319. ff_idct_xvid_mmx2 (block);
  2320. put_pixels_clamped_mmx(block, dest, line_size);
  2321. }
  2322. static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
  2323. {
  2324. ff_idct_xvid_mmx2 (block);
  2325. add_pixels_clamped_mmx(block, dest, line_size);
  2326. }
  2327. #endif
  2328. void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  2329. {
  2330. mm_flags = mm_support();
  2331. if (avctx->dsp_mask) {
  2332. if (avctx->dsp_mask & FF_MM_FORCE)
  2333. mm_flags |= (avctx->dsp_mask & 0xffff);
  2334. else
  2335. mm_flags &= ~(avctx->dsp_mask & 0xffff);
  2336. }
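/* dsp_mask: with FF_MM_FORCE set, the listed CPU flags are forced on even if
 * mm_support() did not report them; otherwise the listed flags are masked
 * out. */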
  2337. #if 0
  2338. av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
  2339. if (mm_flags & MM_MMX)
  2340. av_log(avctx, AV_LOG_INFO, " mmx");
  2341. if (mm_flags & MM_MMXEXT)
  2342. av_log(avctx, AV_LOG_INFO, " mmxext");
  2343. if (mm_flags & MM_3DNOW)
  2344. av_log(avctx, AV_LOG_INFO, " 3dnow");
  2345. if (mm_flags & MM_SSE)
  2346. av_log(avctx, AV_LOG_INFO, " sse");
  2347. if (mm_flags & MM_SSE2)
  2348. av_log(avctx, AV_LOG_INFO, " sse2");
  2349. av_log(avctx, AV_LOG_INFO, "\n");
  2350. #endif
  2351. if (mm_flags & MM_MMX) {
  2352. const int idct_algo= avctx->idct_algo;
  2353. #ifdef CONFIG_ENCODERS
  2354. const int dct_algo = avctx->dct_algo;
  2355. if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
  2356. if(mm_flags & MM_SSE2){
  2357. c->fdct = ff_fdct_sse2;
  2358. }else if(mm_flags & MM_MMXEXT){
  2359. c->fdct = ff_fdct_mmx2;
  2360. }else{
  2361. c->fdct = ff_fdct_mmx;
  2362. }
  2363. }
  2364. #endif //CONFIG_ENCODERS
  2365. if(avctx->lowres==0){
  2366. if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
  2367. c->idct_put= ff_simple_idct_put_mmx;
  2368. c->idct_add= ff_simple_idct_add_mmx;
  2369. c->idct = ff_simple_idct_mmx;
  2370. c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
  2371. }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
  2372. if(mm_flags & MM_MMXEXT){
  2373. c->idct_put= ff_libmpeg2mmx2_idct_put;
  2374. c->idct_add= ff_libmpeg2mmx2_idct_add;
  2375. c->idct = ff_mmxext_idct;
  2376. }else{
  2377. c->idct_put= ff_libmpeg2mmx_idct_put;
  2378. c->idct_add= ff_libmpeg2mmx_idct_add;
  2379. c->idct = ff_mmx_idct;
  2380. }
  2381. c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
  2382. }else if(idct_algo==FF_IDCT_VP3){
  2383. if(mm_flags & MM_SSE2){
  2384. c->idct_put= ff_vp3_idct_put_sse2;
  2385. c->idct_add= ff_vp3_idct_add_sse2;
  2386. c->idct = ff_vp3_idct_sse2;
  2387. c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
  2388. }else{
  2389. ff_vp3_dsp_init_mmx();
  2390. c->idct_put= ff_vp3_idct_put_mmx;
  2391. c->idct_add= ff_vp3_idct_add_mmx;
  2392. c->idct = ff_vp3_idct_mmx;
  2393. c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
  2394. }
  2395. #ifdef CONFIG_GPL
  2396. }else if(idct_algo==FF_IDCT_XVIDMMX){
  2397. if(mm_flags & MM_MMXEXT){
  2398. c->idct_put= ff_idct_xvid_mmx2_put;
  2399. c->idct_add= ff_idct_xvid_mmx2_add;
  2400. c->idct = ff_idct_xvid_mmx2;
  2401. }else{
  2402. c->idct_put= ff_idct_xvid_mmx_put;
  2403. c->idct_add= ff_idct_xvid_mmx_add;
  2404. c->idct = ff_idct_xvid_mmx;
  2405. }
  2406. #endif
  2407. }
  2408. }
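/* Each IDCT selected above also sets idct_permutation_type, so the generic
 * code can permute scan tables / coefficients into the layout that
 * particular IDCT expects. */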
  2409. #ifdef CONFIG_ENCODERS
  2410. c->get_pixels = get_pixels_mmx;
  2411. c->diff_pixels = diff_pixels_mmx;
  2412. #endif //CONFIG_ENCODERS
  2413. c->put_pixels_clamped = put_pixels_clamped_mmx;
  2414. c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
  2415. c->add_pixels_clamped = add_pixels_clamped_mmx;
  2416. c->clear_blocks = clear_blocks_mmx;
  2417. #ifdef CONFIG_ENCODERS
  2418. c->pix_sum = pix_sum16_mmx;
  2419. #endif //CONFIG_ENCODERS
  2420. c->put_pixels_tab[0][0] = put_pixels16_mmx;
  2421. c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
  2422. c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
  2423. c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
  2424. c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
  2425. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
  2426. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
  2427. c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
  2428. c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
  2429. c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
  2430. c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
  2431. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
  2432. c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
  2433. c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
  2434. c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
  2435. c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
  2436. c->put_pixels_tab[1][0] = put_pixels8_mmx;
  2437. c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
  2438. c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
  2439. c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
  2440. c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
  2441. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
  2442. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
  2443. c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
  2444. c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
  2445. c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
  2446. c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
  2447. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
  2448. c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
  2449. c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
  2450. c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
  2451. c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
  2452. c->add_bytes= add_bytes_mmx;
  2453. #ifdef CONFIG_ENCODERS
  2454. c->diff_bytes= diff_bytes_mmx;
  2455. c->hadamard8_diff[0]= hadamard8_diff16_mmx;
  2456. c->hadamard8_diff[1]= hadamard8_diff_mmx;
  2457. c->pix_norm1 = pix_norm1_mmx;
  2458. c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
  2459. c->sse[1] = sse8_mmx;
  2460. c->vsad[4]= vsad_intra16_mmx;
  2461. c->nsse[0] = nsse16_mmx;
  2462. c->nsse[1] = nsse8_mmx;
2463. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2464. c->vsad[0] = vsad16_mmx;
2467. c->try_8x8basis= try_8x8basis_mmx;
2468. }
  2469. c->add_8x8basis= add_8x8basis_mmx;
  2470. #endif //CONFIG_ENCODERS
  2471. c->h263_v_loop_filter= h263_v_loop_filter_mmx;
  2472. c->h263_h_loop_filter= h263_h_loop_filter_mmx;
  2473. c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
  2474. c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
  2475. if (mm_flags & MM_MMXEXT) {
  2476. c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
  2477. c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
  2478. c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
  2479. c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
  2480. c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
  2481. c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
  2482. c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
  2483. c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
  2484. c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
  2485. c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
  2486. #ifdef CONFIG_ENCODERS
  2487. c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
  2488. c->hadamard8_diff[1]= hadamard8_diff_mmx2;
  2489. c->vsad[4]= vsad_intra16_mmx2;
  2490. #endif //CONFIG_ENCODERS
  2491. c->h264_idct_add= ff_h264_idct_add_mmx2;
  2492. c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
  2493. c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
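/* The x2/y2/xy2 variants set inside the following CODEC_FLAG_BITEXACT guard
 * rely on pavgb rounding and are not bit-exact with the C reference, hence
 * they are skipped when bit-exact output is requested. */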
  2494. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  2495. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
  2496. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
  2497. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
  2498. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
  2499. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
  2500. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
  2501. #ifdef CONFIG_ENCODERS
  2502. c->vsad[0] = vsad16_mmx2;
  2503. #endif //CONFIG_ENCODERS
  2504. }
  2505. #if 1
  2506. SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
  2507. SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
  2508. SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
  2509. SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
  2510. SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
  2511. SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
  2512. SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
  2513. SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
  2514. SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
  2515. SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
  2516. SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
  2517. SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
  2518. SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
  2519. SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
  2520. SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
  2521. SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
  2522. SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
  2523. SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
  2524. SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
  2525. SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
  2526. SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
  2527. SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
  2528. SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
  2529. SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
  2530. SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
  2531. SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
  2532. SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
  2533. SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
  2534. SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
  2535. SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
  2536. SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
  2537. SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
  2538. #endif
  2539. //FIXME 3dnow too
  2540. #define dspfunc(PFX, IDX, NUM) \
  2541. c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
  2542. c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
  2543. c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
  2544. c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
  2545. c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
  2546. c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
  2547. c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
  2548. c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
  2549. c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
  2550. c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
  2551. c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
  2552. c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
  2553. c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
  2554. c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
  2555. c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
  2556. c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2
  2557. dspfunc(put_h264_qpel, 0, 16);
  2558. dspfunc(put_h264_qpel, 1, 8);
  2559. dspfunc(put_h264_qpel, 2, 4);
  2560. dspfunc(avg_h264_qpel, 0, 16);
  2561. dspfunc(avg_h264_qpel, 1, 8);
  2562. dspfunc(avg_h264_qpel, 2, 4);
  2563. #undef dspfunc
  2564. c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
  2565. c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
  2566. c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
  2567. c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
  2568. c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
  2569. c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
  2570. c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
  2571. c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
  2572. c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
  2573. c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
  2574. c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
  2575. c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
  2576. c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
  2577. c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
  2578. c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
  2579. c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
  2580. c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
  2581. c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
  2582. c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
  2583. c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
  2584. c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
  2585. c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
  2586. c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
  2587. c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
  2588. #ifdef CONFIG_ENCODERS
  2589. c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
  2590. #endif //CONFIG_ENCODERS
  2591. } else if (mm_flags & MM_3DNOW) {
  2592. c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
  2593. c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
  2594. c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
  2595. c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
  2596. c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
  2597. c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
  2598. c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
  2599. c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
  2600. c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
  2601. c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
  2602. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  2603. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
  2604. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
  2605. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
  2606. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
  2607. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
  2608. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
  2609. }
  2610. SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
  2611. SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
  2612. SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
  2613. SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
  2614. SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
  2615. SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
  2616. SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
  2617. SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
  2618. SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
  2619. SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
  2620. SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
  2621. SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
  2622. SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
  2623. SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
  2624. SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
  2625. SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
  2626. SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
  2627. SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
  2628. SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
  2629. SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
  2630. SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
  2631. SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
  2632. SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
  2633. SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
  2634. SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
  2635. SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
  2636. SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
  2637. SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
  2638. SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
  2639. SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
  2640. SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
  2641. SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
  2642. #define dspfunc(PFX, IDX, NUM) \
  2643. c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
  2644. c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
  2645. c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
  2646. c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
  2647. c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
  2648. c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
  2649. c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
  2650. c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
  2651. c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
  2652. c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
  2653. c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
  2654. c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
  2655. c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
  2656. c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
  2657. c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
  2658. c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow
  2659. dspfunc(put_h264_qpel, 0, 16);
  2660. dspfunc(put_h264_qpel, 1, 8);
  2661. dspfunc(put_h264_qpel, 2, 4);
  2662. dspfunc(avg_h264_qpel, 0, 16);
  2663. dspfunc(avg_h264_qpel, 1, 8);
  2664. dspfunc(avg_h264_qpel, 2, 4);
  2665. c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
  2666. c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
  2667. }
  2668. }
  2669. #ifdef CONFIG_ENCODERS
  2670. dsputil_init_pix_mmx(c, avctx);
  2671. #endif //CONFIG_ENCODERS
  2672. #if 0
  2673. // for speed testing
  2674. get_pixels = just_return;
  2675. put_pixels_clamped = just_return;
  2676. add_pixels_clamped = just_return;
  2677. pix_abs16x16 = just_return;
  2678. pix_abs16x16_x2 = just_return;
  2679. pix_abs16x16_y2 = just_return;
  2680. pix_abs16x16_xy2 = just_return;
  2681. put_pixels_tab[0] = just_return;
  2682. put_pixels_tab[1] = just_return;
  2683. put_pixels_tab[2] = just_return;
  2684. put_pixels_tab[3] = just_return;
  2685. put_no_rnd_pixels_tab[0] = just_return;
  2686. put_no_rnd_pixels_tab[1] = just_return;
  2687. put_no_rnd_pixels_tab[2] = just_return;
  2688. put_no_rnd_pixels_tab[3] = just_return;
  2689. avg_pixels_tab[0] = just_return;
  2690. avg_pixels_tab[1] = just_return;
  2691. avg_pixels_tab[2] = just_return;
  2692. avg_pixels_tab[3] = just_return;
  2693. avg_no_rnd_pixels_tab[0] = just_return;
  2694. avg_no_rnd_pixels_tab[1] = just_return;
  2695. avg_no_rnd_pixels_tab[2] = just_return;
  2696. avg_no_rnd_pixels_tab[3] = just_return;
  2697. //av_fdct = just_return;
  2698. //ff_idct = just_return;
  2699. #endif
  2700. }