You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

3838 lines
143KB

  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * This file is part of FFmpeg.
  7. *
  8. * FFmpeg is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * FFmpeg is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with FFmpeg; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. *
  22. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  23. */
  24. #include "dsputil.h"
  25. #include "dsputil_mmx.h"
  26. #include "simple_idct.h"
  27. #include "mpegvideo.h"
  28. #include "x86_cpu.h"
  29. #include "mmx.h"
  30. #include "vp3dsp_mmx.h"
  31. #include "vp3dsp_sse2.h"
  32. #include "h263.h"
  33. //#undef NDEBUG
  34. //#include <assert.h>
/* Xvid IDCT routines implemented in separate files; declared here because
   no shared header exists for them. */
extern void ff_idct_xvid_mmx(short *block);
extern void ff_idct_xvid_mmx2(short *block);

int mm_flags; /* multimedia extension flags */

/* pixel operations */
/* Packed SIMD constants referenced by the asm below.
   Naming: ff_pw_* = value replicated in each 16-bit word,
           ff_pb_* = value replicated in each byte,
           ff_pdw_* = value replicated in each 32-bit dword,
           ff_pd_*  = value replicated in each double. */
DECLARE_ALIGNED_8 (const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED_16(const uint64_t, ff_pdw_80000000[2]) =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED_8 (const uint64_t, ff_pw_3  ) = 0x0003000300030003ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_4  ) = 0x0004000400040004ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_5  ) = 0x0005000500050005ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_8  ) = 0x0008000800080008ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_16 ) = 0x0010001000100010ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_32 ) = 0x0020002000200020ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_64 ) = 0x0040004000400040ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
/* NOTE(review): ff_pw_128 and ff_pdw_80000000 are 16-byte aligned,
   presumably so SSE code can load them — confirm against users. */
DECLARE_ALIGNED_16(const uint64_t, ff_pw_128) = 0x0080008000800080ULL;

DECLARE_ALIGNED_8 (const uint64_t, ff_pb_1  ) = 0x0101010101010101ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3  ) = 0x0303030303030303ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_7  ) = 0x0707070707070707ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_A1 ) = 0xA1A1A1A1A1A1A1A1ULL;
DECLARE_ALIGNED_8 (const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED_16(const double, ff_pd_1[2]) = { 1.0, 1.0 };
DECLARE_ALIGNED_16(const double, ff_pd_2[2]) = { 2.0, 2.0 };
/* Emit an alignment directive for a jump target. */
#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)

/* Zero an MMX register. */
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

/* Load 0x0001000100010001 (one in each word) without touching memory:
   pcmpeqd sets all bits, psrlw $15 leaves bit 0 of each word. */
#define MOVQ_WONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

/* Load 0xFEFEFEFEFEFEFEFE (all-ones bytes added to themselves). */
#define MOVQ_BFE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
/* Non-PIC build: load the byte-one / word-two constants from memory. */
#define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
/* Synthesize 0x0101010101010101: words of 1, then packuswb folds each
   word down to a byte of 1. */
#define MOVQ_BONE(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

/* Synthesize 0x0002000200020002: words of 1 shifted left by one. */
#define MOVQ_WTWO(regd) \
    __asm __volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)

#endif
/* Byte-wise averaging without the MMX2 pavgb instruction.
   _NO_RND computes (a & b) + (((a ^ b) & 0xFE) >> 1)  = (a + b)     >> 1,
   the plain form computes (a | b) - (((a ^ b) & 0xFE) >> 1) = (a + b + 1) >> 1,
   i.e. round-down vs round-up averaging per byte. */
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "paddb " #regb ", " #regr " \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pand " #regfe "," #regb " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"

/* Paired variants: two independent averages interleaved so both dependency
   chains can execute in parallel.  Results land in regr and regp. */
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "pand " #regb ", " #regr " \n\t"\
    "pand " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr " \n\t"\
    "movq " #regc ", " #regp " \n\t"\
    "por " #regb ", " #regr " \n\t"\
    "por " #regd ", " #regp " \n\t"\
    "pxor " #rega ", " #regb " \n\t"\
    "pxor " #regc ", " #regd " \n\t"\
    "pand %%mm6, " #regb " \n\t"\
    "pand %%mm6, " #regd " \n\t"\
    "psrlq $1, " #regd " \n\t"\
    "psrlq $1, " #regb " \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
/***********************************/
/* MMX no rounding */
/* The dsputil_mmx_rnd.h / dsputil_mmx_avg.h templates are included several
   times with different DEF name-mangling and PAVGB implementations to
   generate the no-rounding MMX, rounding MMX, 3DNow! and MMX2 variants of
   the put/avg pixel functions. */
#define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* MMX rounding */
#define DEF(x, y) x ## _ ## y ##_mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd.h"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/***********************************/
/* 3Dnow specific */
#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */
#define DEF(x) x ## _mmx2
/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg.h"

#undef DEF
#undef PAVGB

/* Interleave the words (n=wd) or dwords (n=dq) of a and b; low half goes
   to a, high half to t.  m selects the mov size suffix (q for MMX). */
#define SBUTTERFLY(a,b,t,n,m)\
    "mov" #m " " #a ", " #t " \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\

/* Transpose a 4x4 matrix of 16-bit elements held in registers a..d,
   using t as scratch. */
#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
  180. /***********************************/
  181. /* standard MMX */
  182. #ifdef CONFIG_ENCODERS
/* Read an 8x8 block of unsigned bytes from pixels (row stride line_size)
   and store it into block[0..63] as 16-bit values, zero-extended via
   punpck with mm7 == 0.  Two input rows (32 output bytes) are written per
   iteration; REG_a counts from -128 up to 0, addressing relative to
   block + 64 so no separate loop counter is needed. */
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq (%0, %2), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "movq %%mm1, 8(%1, %%"REG_a") \n\t"
        "movq %%mm2, 16(%1, %%"REG_a") \n\t"
        "movq %%mm3, 24(%1, %%"REG_a") \n\t"
        "add %3, %0 \n\t" /* advance by 2*line_size (two rows consumed) */
        "add $32, %%"REG_a" \n\t"
        "js 1b \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}
/* Compute block[i] = s1[i] - s2[i] for an 8x8 region of bytes (both inputs
   share the same stride), storing 16-bit differences.  One row (16 output
   bytes) per iteration; REG_a runs -128..0 relative to block + 64. */
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "mov $-128, %%"REG_a" \n\t"
        ASMALIGN(4)
        "1: \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "psubw %%mm2, %%mm0 \n\t"
        "psubw %%mm3, %%mm1 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm1, 8(%2, %%"REG_a") \n\t"
        "add %3, %0 \n\t"
        "add %3, %1 \n\t"
        "add $16, %%"REG_a" \n\t"
        "jnz 1b \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}
  238. #endif //CONFIG_ENCODERS
/* Convert an 8x8 block of 16-bit coefficients to unsigned bytes with
   saturation (packuswb clamps to 0..255) and store them to pixels with
   row stride line_size.  Done as two unrolled 4-row halves; the second
   half deliberately uses an "r" operand instead of "m" (see comment). */
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;

    /* read the pixels */
    p = block;
    pix = pixels;
    /* unrolled loop */
    __asm __volatile(
        "movq %3, %%mm0 \n\t"
        "movq 8%3, %%mm1 \n\t"
        "movq 16%3, %%mm2 \n\t"
        "movq 24%3, %%mm3 \n\t"
        "movq 32%3, %%mm4 \n\t"
        "movq 40%3, %%mm5 \n\t"
        "movq 48%3, %%mm6 \n\t"
        "movq 56%3, %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
        :"memory");
    pix += line_size*4;
    p += 32;

    // if here would be an exact copy of the code above
    // compiler would generate some very strange code
    // thus using "r"
    __asm __volatile(
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
        :"memory");
}
/* Byte 0x80 replicated eight times: bias used to convert signed bytes to
   the unsigned pixel range. */
static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

/* Store an 8x8 block of 16-bit coefficients as pixels: pack each row with
   signed saturation (packsswb clamps to -128..127), then add 128 so the
   result lands in 0..255.  Uses the mmx.h register macros row by row. */
void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    int i;

    movq_m2r(*vector128, mm1);
    for (i = 0; i < 8; i++) {
        movq_m2r(*(block), mm0);
        packsswb_m2r(*(block + 4), mm0);
        block += 8;
        paddb_r2r(mm1, mm0);
        movq_r2m(mm0, *pixels);
        pixels += line_size;
    }
}
/* Add an 8x8 block of 16-bit coefficients to the existing pixels:
   pixels are widened to words (mm7 == 0), added with signed saturation
   (paddsw), then packed back with unsigned saturation (packuswb).
   Two rows are processed per loop iteration, four iterations total. */
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm __volatile(
            "movq (%2), %%mm0 \n\t"
            "movq 8(%2), %%mm1 \n\t"
            "movq 16(%2), %%mm2 \n\t"
            "movq 24(%2), %%mm3 \n\t"
            "movq %0, %%mm4 \n\t"
            "movq %1, %%mm6 \n\t"
            "movq %%mm4, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm4, %%mm0 \n\t"
            "paddsw %%mm5, %%mm1 \n\t"
            "movq %%mm6, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm6, %%mm2 \n\t"
            "paddsw %%mm5, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %0 \n\t"
            "movq %%mm2, %1 \n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}
/* Copy h rows of 4 bytes from pixels to block (shared stride line_size).
   Four rows are copied per loop iteration ("subl $4" / "jnz"), so h is
   expected to be a positive multiple of 4. */
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        ASMALIGN(3)
        "1: \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd (%1, %3), %%mm1 \n\t"
        "movd %%mm0, (%2) \n\t"
        "movd %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}
/* Copy h rows of 8 bytes from pixels to block (shared stride line_size).
   Four rows per loop iteration; h is expected to be a positive multiple
   of 4, matching put_pixels4_mmx. */
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        ASMALIGN(3)
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}
/* Copy h rows of 16 bytes from pixels to block (shared stride line_size),
   two movq per row.  Four rows per loop iteration; h is expected to be a
   positive multiple of 4. */
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm __volatile(
        "lea (%3, %3), %%"REG_a" \n\t" /* REG_a = 2*line_size */
        ASMALIGN(3)
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq 8(%1), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r" (pixels), "+r" (block)
        : "r"((long)line_size)
        : "%"REG_a, "memory"
    );
}
/* Zero six consecutive DCT blocks (6 * 128 = 768 bytes) with 32-byte
   stores per iteration; REG_a counts from -768 up to 0, addressing
   relative to blocks + 768. */
static void clear_blocks_mmx(DCTELEM *blocks)
{
    __asm __volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "mov $-128*6, %%"REG_a" \n\t"
        "1: \n\t"
        "movq %%mm7, (%0, %%"REG_a") \n\t"
        "movq %%mm7, 8(%0, %%"REG_a") \n\t"
        "movq %%mm7, 16(%0, %%"REG_a") \n\t"
        "movq %%mm7, 24(%0, %%"REG_a") \n\t"
        "add $32, %%"REG_a" \n\t"
        " js 1b \n\t"
        : : "r" (((uint8_t *)blocks)+128*6)
        : "%"REG_a
    );
}
  444. #ifdef CONFIG_ENCODERS
/* Sum all 256 bytes of a 16x16 pixel block.  Bytes are widened to words
   and accumulated in mm6 (max possible sum 16*16*255 = 65280 fits in a
   word), then the four word lanes are folded together and the result is
   masked to 16 bits. */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    long index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "pxor %%mm6, %%mm6 \n\t"
        "1: \n\t"
        "movq (%2, %1), %%mm0 \n\t"
        "movq (%2, %1), %%mm1 \n\t"
        "movq 8(%2, %1), %%mm2 \n\t"
        "movq 8(%2, %1), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "paddw %%mm1, %%mm3 \n\t"
        "paddw %%mm3, %%mm6 \n\t"
        "add %3, %1 \n\t"
        " js 1b \n\t"
        /* horizontal reduction: fold dwords, then words, into lane 0 */
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        "andl $0xFFFF, %0 \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((long)line_size)
    );

    return sum;
}
  480. #endif //CONFIG_ENCODERS
/* dst[i] += src[i] for i in [0, w), byte-wise with wraparound (paddb).
   The asm loop handles 16 bytes per iteration while i < w-15; the scalar
   tail loop finishes the remaining (up to 15) bytes. */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    asm volatile(
        "1: \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq (%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%2, %0) \n\t"
        "movq 8(%1, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "cmp %3, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
/* Core of the H.263 deblocking loop filter, shared by the vertical and
   horizontal filter functions below.  Operands:
     %0..%3 = four 8-pixel lines straddling the block edge
              (%0 and %3 outermost, %1 and %2 adjacent to the edge),
     %4     = 2*strength (filter strength, broadcast to all bytes),
     %5     = ff_pb_FC mask (0xFC bytes).
   On exit the filtered lines are left in registers:
     mm5 -> %0, mm3 -> %1, mm4 -> %2, mm6 -> %3
   (see h263_v_loop_filter_mmx stores and the "5 3 4 6" note at the
   horizontal call site); the caller is responsible for storing them. */
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7 \n\t"\
    "movq %0, %%mm0 \n\t"\
    "movq %0, %%mm1 \n\t"\
    "movq %3, %%mm2 \n\t"\
    "movq %3, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm0 \n\t"\
    "punpckhbw %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "psubw %%mm2, %%mm0 \n\t"\
    "psubw %%mm3, %%mm1 \n\t"\
    "movq %1, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "movq %2, %%mm5 \n\t"\
    "punpcklbw %%mm7, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm7, %%mm5 \n\t"\
    "psubw %%mm2, %%mm4 \n\t"\
    "psubw %%mm3, %%mm5 \n\t"\
    "psllw $2, %%mm4 \n\t"\
    "psllw $2, %%mm5 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pcmpgtw %%mm4, %%mm6 \n\t"\
    "pcmpgtw %%mm5, %%mm7 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "pxor %%mm7, %%mm5 \n\t"\
    "psubw %%mm6, %%mm4 \n\t"\
    "psubw %%mm7, %%mm5 \n\t"\
    "psrlw $3, %%mm4 \n\t"\
    "psrlw $3, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm4 \n\t"\
    "packsswb %%mm7, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "movd %4, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "punpcklbw %%mm2, %%mm2 \n\t"\
    "psubusb %%mm4, %%mm2 \n\t"\
    "movq %%mm2, %%mm3 \n\t"\
    "psubusb %%mm4, %%mm3 \n\t"\
    "psubb %%mm3, %%mm2 \n\t"\
    "movq %1, %%mm3 \n\t"\
    "movq %2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm3 \n\t"\
    "psubusb %%mm2, %%mm4 \n\t"\
    "pxor %%mm6, %%mm3 \n\t"\
    "pxor %%mm6, %%mm4 \n\t"\
    "paddusb %%mm2, %%mm2 \n\t"\
    "packsswb %%mm1, %%mm0 \n\t"\
    "pcmpgtb %%mm0, %%mm7 \n\t"\
    "pxor %%mm7, %%mm0 \n\t"\
    "psubb %%mm7, %%mm0 \n\t"\
    "movq %%mm0, %%mm1 \n\t"\
    "psubusb %%mm2, %%mm0 \n\t"\
    "psubb %%mm0, %%mm1 \n\t"\
    "pand %5, %%mm1 \n\t"\
    "psrlw $2, %%mm1 \n\t"\
    "pxor %%mm7, %%mm1 \n\t"\
    "psubb %%mm7, %%mm1 \n\t"\
    "movq %0, %%mm5 \n\t"\
    "movq %3, %%mm6 \n\t"\
    "psubb %%mm1, %%mm5 \n\t"\
    "paddb %%mm1, %%mm6 \n\t"
/* H.263 loop filter across a horizontal block edge: filters the four
   8-pixel rows src-2*stride .. src+1*stride in place, with strength
   derived from qscale via the ff_h263_loop_filter_strength table.
   Compiled to nothing unless an H.263-family codec is enabled. */
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    asm volatile(

        H263_LOOP_FILTER

        /* store the filtered rows left in mm3/mm4/mm5/mm6 */
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %0 \n\t"
        "movq %%mm6, %3 \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}
/* Transpose a 4x4 block of bytes: dst[j][i] = src[i][j], with independent
   strides for source and destination.  Built from byte/word unpacks; each
   output row is written as one 32-bit store. */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
        "movd %4, %%mm0 \n\t"
        "movd %5, %%mm1 \n\t"
        "movd %6, %%mm2 \n\t"
        "movd %7, %%mm3 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm2, %%mm0 \n\t"
        "punpckhwd %%mm2, %%mm1 \n\t"
        "movd %%mm0, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, %1 \n\t"
        "movd %%mm1, %2 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, %3 \n\t"
        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
          "=m" (*(uint32_t*)(dst + 3*dst_stride))
        : "m" (*(uint32_t*)(src + 0*src_stride)),
          "m" (*(uint32_t*)(src + 1*src_stride)),
          "m" (*(uint32_t*)(src + 2*src_stride)),
          "m" (*(uint32_t*)(src + 3*src_stride))
    );
}
/* H.263 loop filter across a vertical block edge.  The four columns
   around the edge (src-2 .. src+1, 8 rows) are transposed into the
   8-byte-stride temp buffer, filtered with the shared H263_LOOP_FILTER
   core, and the filtered registers are transposed back to memory via
   punpck + movd stores. */
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(ENABLE_ANY_H263) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    uint64_t temp[4] __attribute__ ((aligned(8)));
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp , src , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    asm volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    /* transpose the filtered lines (mm5,mm3,mm4,mm6) back into columns */
    asm volatile(
        "movq %%mm5, %%mm1 \n\t"
        "movq %%mm4, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm5 \n\t"
        "punpcklbw %%mm6, %%mm4 \n\t"
        "punpckhbw %%mm3, %%mm1 \n\t"
        "punpckhbw %%mm6, %%mm0 \n\t"
        "movq %%mm5, %%mm3 \n\t"
        "movq %%mm1, %%mm6 \n\t"
        "punpcklwd %%mm4, %%mm5 \n\t"
        "punpcklwd %%mm0, %%mm1 \n\t"
        "punpckhwd %%mm4, %%mm3 \n\t"
        "punpckhwd %%mm0, %%mm6 \n\t"
        "movd %%mm5, (%0) \n\t"
        "punpckhdq %%mm5, %%mm5 \n\t"
        "movd %%mm5, (%0,%2) \n\t"
        "movd %%mm3, (%0,%2,2) \n\t"
        "punpckhdq %%mm3, %%mm3 \n\t"
        "movd %%mm3, (%0,%3) \n\t"
        "movd %%mm1, (%1) \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, (%1,%2) \n\t"
        "movd %%mm6, (%1,%2,2) \n\t"
        "punpckhdq %%mm6, %%mm6 \n\t"
        "movd %%mm6, (%1,%3) \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((long) stride ),
           "r" ((long)(3*stride))
    );
    }
}
  664. #ifdef CONFIG_ENCODERS
/* Sum of squared pixel values over a 16x16 block (row count fixed at 16
   by the "movl $16" loop counter).  Squares are formed with pmaddwd and
   accumulated as dwords in mm7, then the two dword lanes are folded. */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
  asm volatile (
      "movl $16,%%ecx\n"
      "pxor %%mm0,%%mm0\n"
      "pxor %%mm7,%%mm7\n"
      "1:\n"
      "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
      "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */

      "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */

      "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
      "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */

      "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
      "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
      "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */

      "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
      "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */

      "pmaddwd %%mm3,%%mm3\n"
      "pmaddwd %%mm4,%%mm4\n"

      "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                          pix2^2+pix3^2+pix6^2+pix7^2) */
      "paddd %%mm3,%%mm4\n"
      "paddd %%mm2,%%mm7\n"

      "add %2, %0\n"
      "paddd %%mm4,%%mm7\n"
      "dec %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n" /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%1\n"
      : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
    return tmp;
}
/* Sum of squared differences (SSE) between two 8-pixel-wide blocks over
   h rows.  Two rows are handled per iteration ("shr $1" on the counter),
   so h is expected to be even.  Absolute differences are built from two
   saturating subtractions ORed together, then squared with pmaddwd.
   The first (unused) parameter matches the me_cmp function signature. */
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "shr $1,%%ecx\n"
      "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
      "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
      "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
      "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /* OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
      "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n" /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
/* Sum of squared differences between two 16-pixel-wide blocks over h
   rows, one row (16 bytes) per iteration.  Same abs-diff-then-pmaddwd
   technique as sse8_mmx; the unused first parameter matches the me_cmp
   function signature. */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
      "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
      "1:\n"
      "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
      "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
      "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
      "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /* OR the results to get absolute difference */
      "movq %%mm1,%%mm5\n"
      "movq %%mm3,%%mm6\n"
      "psubusb %%mm2,%%mm1\n"
      "psubusb %%mm4,%%mm3\n"
      "psubusb %%mm5,%%mm2\n"
      "psubusb %%mm6,%%mm4\n"

      "por %%mm1,%%mm2\n"
      "por %%mm3,%%mm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movq %%mm2,%%mm1\n"
      "movq %%mm4,%%mm3\n"

      "punpckhbw %%mm0,%%mm2\n"
      "punpckhbw %%mm0,%%mm4\n"
      "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%mm2,%%mm2\n"
      "pmaddwd %%mm4,%%mm4\n"
      "pmaddwd %%mm1,%%mm1\n"
      "pmaddwd %%mm3,%%mm3\n"

      "add %3,%0\n"
      "add %3,%1\n"

      "paddd %%mm2,%%mm1\n"
      "paddd %%mm4,%%mm3\n"
      "paddd %%mm1,%%mm7\n"
      "paddd %%mm3,%%mm7\n"

      "decl %%ecx\n"
      "jnz 1b\n"

      "movq %%mm7,%%mm1\n"
      "psrlq $32, %%mm7\n" /* shift hi dword to lo */
      "paddd %%mm7,%%mm1\n"
      "movd %%mm1,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
/* SSE2 version of the 16-wide sum of squared differences: two full rows
   per iteration ("shr $1" on h, so h is expected to be even), unaligned
   128-bit loads (movdqu), and a two-step psrldq horizontal reduction at
   the end.  Note h is clobbered (it is a "+r" operand used as the loop
   counter).  The unused first parameter matches the me_cmp signature. */
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
  asm volatile (
      "shr $1,%2\n"
      "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
      "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
      "1:\n"
      "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
      "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
      "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
      "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */

      /* todo: mm1-mm2, mm3-mm4 */
      /* algo: subtract mm1 from mm2 with saturation and vice versa */
      /* OR the results to get absolute difference */
      "movdqa %%xmm1,%%xmm5\n"
      "movdqa %%xmm3,%%xmm6\n"
      "psubusb %%xmm2,%%xmm1\n"
      "psubusb %%xmm4,%%xmm3\n"
      "psubusb %%xmm5,%%xmm2\n"
      "psubusb %%xmm6,%%xmm4\n"

      "por %%xmm1,%%xmm2\n"
      "por %%xmm3,%%xmm4\n"

      /* now convert to 16-bit vectors so we can square them */
      "movdqa %%xmm2,%%xmm1\n"
      "movdqa %%xmm4,%%xmm3\n"

      "punpckhbw %%xmm0,%%xmm2\n"
      "punpckhbw %%xmm0,%%xmm4\n"
      "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
      "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */

      "pmaddwd %%xmm2,%%xmm2\n"
      "pmaddwd %%xmm4,%%xmm4\n"
      "pmaddwd %%xmm1,%%xmm1\n"
      "pmaddwd %%xmm3,%%xmm3\n"

      "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
      "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */

      "paddd %%xmm2,%%xmm1\n"
      "paddd %%xmm4,%%xmm3\n"
      "paddd %%xmm1,%%xmm7\n"
      "paddd %%xmm3,%%xmm7\n"

      "decl %2\n"
      "jnz 1b\n"

      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movdqa %%xmm7,%%xmm1\n"
      "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
      "paddd %%xmm1,%%xmm7\n"
      "movd %%xmm7,%3\n"
      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
      : "r" ((long)line_size));
    return tmp;
}
/**
 * Measure high-frequency "noise" of an 8-pixel-wide block.
 * Builds per-row horizontal first differences (via byte shifts of the
 * 8-byte row), then accumulates the absolute value of the change of
 * that gradient between vertically adjacent rows (a second-order
 * difference) into a single scalar.
 * Rows are processed two at a time; h must be even and >= 4
 * (the loop counter is loaded with h-2 and decremented by 2).
 * NOTE(review): uses MMX registers and does not execute emms itself —
 * presumably the caller/framework handles that; confirm.
 */
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"             /* mm7 = 0, for byte->word unpack */
        "pxor %%mm6,%%mm6\n"             /* mm6 = running sum of |diffs| */
        /* row 0: horizontal gradient into mm0 (low words) / mm2 (high words) */
        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "add %2,%0\n"
        /* row 1: gradient into mm4/mm5, subtract from row 0's gradient */
        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        /* abs via sign-mask trick: x = (x ^ sign) - sign */
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"           /* accumulate into mm6 */
        "add %2,%0\n"
        /* main loop: two rows per iteration, reusing the previous row's
         * gradient still held in mm4/mm5 (then mm0/mm2). */
        "1:\n"
        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"
        "add %2,%0\n"
        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"
        /* horizontal sum of the four words in mm6 into a dword */
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"
        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        : "%ecx");
    return tmp;
}
/**
 * High-frequency noise measure for a 16-pixel-wide block.
 * Same second-order-difference accumulation as hf_noise8_mmx, but the
 * horizontal gradient is formed by loading the row at offset 0 and at
 * offset 1 (unaligned) instead of shifting within one qword; this
 * covers pixels 0..8.  The remaining columns are handled by the
 * trailing hf_noise8_mmx(pix+8,...) call.
 * h must be even and >= 4 (loop counter is h-2, stepped by 2).
 */
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;             /* keep original pointer; asm advances pix1 */
    asm volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"             /* zero, for unpacking */
        "pxor %%mm6,%%mm6\n"             /* accumulator */
        /* row 0 gradient: pix[x+1] - pix[x] via the offset-1 load */
        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "add %2,%0\n"
        /* row 1 gradient, then vertical difference of gradients */
        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        /* abs: x = (x ^ sign) - sign */
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2,%0\n"
        /* main loop: two rows per iteration, previous gradient kept in
         * mm4/mm5 then mm0/mm2 alternately. */
        "1:\n"
        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"
        "add %2,%0\n"
        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"
        /* horizontal reduction of mm6 to a scalar */
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"
        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        : "%ecx");
    /* right half (columns 8..15) handled by the 8-wide version */
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
  1069. static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  1070. MpegEncContext *c = p;
  1071. int score1, score2;
  1072. if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
  1073. else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
  1074. score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
  1075. if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
  1076. else return score1 + FFABS(score2)*8;
  1077. }
  1078. static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  1079. MpegEncContext *c = p;
  1080. int score1= sse8_mmx(c, pix1, pix2, line_size, h);
  1081. int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
  1082. if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
  1083. else return score1 + FFABS(score2)*8;
  1084. }
/**
 * Intra vertical SAD over a 16-pixel-wide block: sum of absolute
 * differences between each row and the row above it.
 * pix must be 8-byte aligned, line_size a multiple of 8, h even.
 * Returns the low 16 bits of the (word-saturating) accumulator.
 */
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* One row step: load the current 16 pixels, compute |cur - prev| with
 * the psubusb-both-ways + por trick, widen to words and add into mm6.
 * The current row bytes are left in out0/out1 for the next step. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* SAD accumulator */
      "pxor %%mm7,%%mm7\n"          /* zero for unpacking */
      "movq (%0),%%mm0\n"           /* preload first row into mm0/mm1 */
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "subl $2, %%ecx\n"
      "jnz 1b\n"
      /* horizontal add of the four word lanes of mm6 */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%1\n"
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0xFFFF;      /* only the low word holds the sum */
}
#undef SUM
/**
 * Intra vertical SAD, MMX2 version: uses psadbw to compute the per-row
 * sum of absolute byte differences directly, avoiding the unpack/abs
 * sequence of the plain MMX version.
 * Same alignment/parity requirements as vsad_intra16_mmx.
 */
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* One row step: load current row into out0/out1, psadbw it against the
 * previous row (in0/in1) and accumulate the result into mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), " #out0 "\n"\
      "movq 8(%0), " #out1 "\n"\
      "add %2,%0\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %3,%%ecx\n"
      "pxor %%mm6,%%mm6\n"          /* accumulator */
      "pxor %%mm7,%%mm7\n"
      "movq (%0),%%mm0\n"           /* preload first row */
      "movq 8(%0),%%mm1\n"
      "add %2,%0\n"
      "subl $2, %%ecx\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "subl $2, %%ecx\n"
      "jnz 1b\n"
      "movd %%mm6,%1\n"             /* psadbw already produced a scalar sum */
      : "+r" (pix), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM
/**
 * Inter vertical SAD over a 16-wide block: for each row, forms the
 * signed residual pix1-pix2, biases it by 0x80 into unsigned range, and
 * sums the absolute differences of consecutive rows' residuals.
 * pix1/pix2 must be 8-byte aligned, line_size a multiple of 8, h even.
 * Returns the low 15 bits of the accumulator.
 */
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

/* One row step: residual = (pix1 - pix2) ^ 0x80 (mm7 holds 0x8080...),
 * then |residual - prev_residual| via psubusb both ways + por, widened
 * and accumulated into mm6; current residual left in out0/out1. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0),%%mm2\n"\
      "movq (%1)," #out0 "\n"\
      "movq 8(%0),%%mm3\n"\
      "movq 8(%1)," #out1 "\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb " #out0 ", %%mm2\n"\
      "psubb " #out1 ", %%mm3\n"\
      "pxor %%mm7, %%mm2\n"\
      "pxor %%mm7, %%mm3\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      /* build mm7 = 0x8080808080808080 (sign-flip bias) */
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      /* preload first row's biased residual into mm0/mm1 */
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "subl $2, %%ecx\n"
      "jnz 1b\n"
      /* horizontal add of the four word lanes */
      "movq %%mm6,%%mm0\n"
      "psrlq $32, %%mm6\n"
      "paddw %%mm6,%%mm0\n"
      "movq %%mm0,%%mm6\n"
      "psrlq $16, %%mm0\n"
      "paddw %%mm6,%%mm0\n"
      "movd %%mm0,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM
/**
 * Inter vertical SAD, MMX2 version: biased residuals as in vsad16_mmx,
 * but the row-to-row absolute difference is computed with psadbw.
 * Same alignment/parity requirements; returns the full sum.
 */
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

/* One row step: biased residual into out0/out1, psadbw against the
 * previous residual (in0/in1), accumulate into mm6. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0)," #out0 "\n"\
      "movq (%1),%%mm2\n"\
      "movq 8(%0)," #out1 "\n"\
      "movq 8(%1),%%mm3\n"\
      "add %3,%0\n"\
      "add %3,%1\n"\
      "psubb %%mm2, " #out0 "\n"\
      "psubb %%mm3, " #out1 "\n"\
      "pxor %%mm7, " #out0 "\n"\
      "pxor %%mm7, " #out1 "\n"\
      "psadbw " #out0 ", " #in0 "\n"\
      "psadbw " #out1 ", " #in1 "\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"

  asm volatile (
      "movl %4,%%ecx\n"
      "pxor %%mm6,%%mm6\n"
      /* mm7 = 0x8080808080808080 bias */
      "pcmpeqw %%mm7,%%mm7\n"
      "psllw $15, %%mm7\n"
      "packsswb %%mm7, %%mm7\n"
      /* preload first row's biased residual */
      "movq (%0),%%mm0\n"
      "movq (%1),%%mm2\n"
      "movq 8(%0),%%mm1\n"
      "movq 8(%1),%%mm3\n"
      "add %3,%0\n"
      "add %3,%1\n"
      "subl $2, %%ecx\n"
      "psubb %%mm2, %%mm0\n"
      "psubb %%mm3, %%mm1\n"
      "pxor %%mm7, %%mm0\n"
      "pxor %%mm7, %%mm1\n"
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "1:\n"
      SUM(%%mm4, %%mm5, %%mm0, %%mm1)
      SUM(%%mm0, %%mm1, %%mm4, %%mm5)
      "subl $2, %%ecx\n"
      "jnz 1b\n"
      "movd %%mm6,%2\n"
      : "+r" (pix1), "+r" (pix2), "=r"(tmp)
      : "r" ((long)line_size) , "m" (h)
      : "%ecx");
    return tmp;
}
#undef SUM
/**
 * dst[i] = src1[i] - src2[i] (byte-wise, wrapping) for i in [0,w).
 * The MMX loop handles 16 bytes per iteration up to w-15; the scalar
 * tail loop finishes the remainder.  Buffers must not overlap in a way
 * that the 16-byte stores would clobber unread input.
 * NOTE(review): for w < 16 the asm loop still runs once, reading/writing
 * 16 bytes — callers presumably guarantee padded buffers; confirm.
 */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;
    asm volatile(
        "1: \n\t"
        "movq (%2, %0), %%mm0 \n\t"
        "movq (%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%3, %0) \n\t"
        "movq 8(%2, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%3, %0) \n\t"
        "add $16, %0 \n\t"
        "cmp %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
    );
    /* scalar tail for the last (w & 15) + possibly overlapping bytes */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/**
 * HuffYUV median-prediction residual: dst[i] = src2[i] - median(L, T,
 * L+T-LT), where L is the left neighbour, T the top neighbour and LT
 * the top-left neighbour.  The median of three is computed with the
 * MMX2 pminub/pmaxub min/max network.  Element 0 is redone in scalar
 * code below because the vector loop reads src1[-1]/src2[-1].
 * Updates *left and *left_top for the next call.
 */
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    long i=0;
    uint8_t l, lt;

    asm volatile(
        "1: \n\t"
        "movq -1(%1, %0), %%mm0 \n\t" // LT
        "movq (%1, %0), %%mm1 \n\t" // T
        "movq -1(%2, %0), %%mm2 \n\t" // L
        "movq (%2, %0), %%mm3 \n\t" // X
        "movq %%mm2, %%mm4 \n\t" // L
        "psubb %%mm0, %%mm2 \n\t"
        "paddb %%mm1, %%mm2 \n\t" // L + T - LT
        "movq %%mm4, %%mm5 \n\t" // L
        "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
        "pminub %%mm5, %%mm1 \n\t" // min(T, L)
        "pminub %%mm2, %%mm4 \n\t"
        "pmaxub %%mm1, %%mm4 \n\t" // median clamped into [min,max]
        "psubb %%mm4, %%mm3 \n\t" // dst - pred
        "movq %%mm3, (%3, %0) \n\t"
        "add $8, %0 \n\t"
        "cmp %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
    );

    l= *left;
    lt= *left_top;

    /* element 0 uses the carried-over left/left_top, not src[-1] */
    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left = src2[w-1];
}
/* Load one row of p1 and p2 (d/q granularity m), widen bytes to words
 * via the self-unpack trick, and leave p1-p2 (as words, times 256 —
 * both operands are unpacked into the high byte) in register a. */
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a" \n\t"\
    "mov"#m" "#p2", "#t" \n\t"\
    "punpcklbw "#a", "#t" \n\t"\
    "punpcklbw "#a", "#a" \n\t"\
    "psubw "#t", "#a" \n\t"

/* Compute the pixel differences of 8 rows into registers mm0..mm7.
 * Only 8 vector registers are available, so mm0 is spilled to `temp`
 * while the last row is processed, then reloaded. */
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1 \n\t"\
        "add %4, %2 \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0 \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0 \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

/* 4x8 (MMX, movd loads) and 8x8 (SSE2, movq loads) instantiations */
#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
#ifdef ARCH_X86_64
/* 8x8 word transpose of xmm0..xmm7 built from SBUTTERFLY interleaves.
 * On x86-64 the extra register xmm8 serves as scratch, so no memory
 * spill is needed.  The result is left permuted: 01234567 -> 05736421
 * (callers account for this ordering). */
// permutes 01234567 -> 05736421
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
    SBUTTERFLY(a,b,%%xmm8,wd,dqa)\
    SBUTTERFLY(c,d,b,wd,dqa)\
    SBUTTERFLY(e,f,d,wd,dqa)\
    SBUTTERFLY(g,h,f,wd,dqa)\
    SBUTTERFLY(a,c,h,dq,dqa)\
    SBUTTERFLY(%%xmm8,b,c,dq,dqa)\
    SBUTTERFLY(e,g,b,dq,dqa)\
    SBUTTERFLY(d,f,g,dq,dqa)\
    SBUTTERFLY(a,e,f,qdq,dqa)\
    SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\
    SBUTTERFLY(h,b,d,qdq,dqa)\
    SBUTTERFLY(c,g,b,qdq,dqa)\
    "movdqa %%xmm8, "#g" \n\t"
#else
/* x86-32 variant: only 8 xmm registers, so the scratch value is
 * spilled to memory operand t (and t+16) between butterfly stages. */
#define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\
    "movdqa "#h", "#t" \n\t"\
    SBUTTERFLY(a,b,h,wd,dqa)\
    "movdqa "#h", 16"#t" \n\t"\
    "movdqa "#t", "#h" \n\t"\
    SBUTTERFLY(c,d,b,wd,dqa)\
    SBUTTERFLY(e,f,d,wd,dqa)\
    SBUTTERFLY(g,h,f,wd,dqa)\
    SBUTTERFLY(a,c,h,dq,dqa)\
    "movdqa "#h", "#t" \n\t"\
    "movdqa 16"#t", "#h" \n\t"\
    SBUTTERFLY(h,b,c,dq,dqa)\
    SBUTTERFLY(e,g,b,dq,dqa)\
    SBUTTERFLY(d,f,g,dq,dqa)\
    SBUTTERFLY(a,e,f,qdq,dqa)\
    SBUTTERFLY(h,d,e,qdq,dqa)\
    "movdqa "#h", 16"#t" \n\t"\
    "movdqa "#t", "#h" \n\t"\
    SBUTTERFLY(h,b,d,qdq,dqa)\
    SBUTTERFLY(c,g,b,qdq,dqa)\
    "movdqa 16"#t", "#g" \n\t"
#endif
/* Two parallel butterfly stages: (a,b) -> (a+b, b-a) for both pairs.
 * Note b ends up holding b-a (computed as 2b - (a+b)). */
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 " \n\t"\
    "paddw " #b2 ", " #a2 " \n\t"\
    "paddw " #b1 ", " #b1 " \n\t"\
    "paddw " #b2 ", " #b2 " \n\t"\
    "psubw " #a1 ", " #b1 " \n\t"\
    "psubw " #a2 ", " #b2 " \n\t"

/* 8-point Hadamard transform across registers m0..m7 (three butterfly
 * rounds with strides 1, 2 and 4). */
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m1, m2, m3)\
    LBUTTERFLY2(m4, m5, m6, m7)\
    LBUTTERFLY2(m0, m2, m1, m3)\
    LBUTTERFLY2(m4, m6, m5, m7)\
    LBUTTERFLY2(m0, m4, m1, m5)\
    LBUTTERFLY2(m2, m6, m3, m7)

/* Hadamard over the standard MMX register set */
#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
/* Word-wise absolute value of a, clobbering scratch z.
 * MMX: sign-mask trick a = (a ^ sign) - sign. */
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "pcmpgtw " #a ", " #z " \n\t"\
    "pxor " #z ", " #a " \n\t"\
    "psubw " #z ", " #a " \n\t"

/* MMX2: a = max(a, -a) via pmaxsw. */
#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "psubw " #a ", " #z " \n\t"\
    "pmaxsw " #z ", " #a " \n\t"

/* SSSE3: single-instruction pabsw; z unused. */
#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a " \n\t"

/* |a| added (with unsigned saturation) into sum; z is scratch. */
#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum " \n\t"

/* Sum |xmm0..xmm7| into xmm0 when two spare registers (xmm8/xmm9)
 * are available — x86-64 or SSSE3 (where MMABS needs no scratch). */
#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0 \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
/* x86-32 SSE2: spill xmm7 to memory (%1) so it can serve as scratch. */
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1) \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2 \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0 \n\t"
#endif
/* Load/store 4 consecutive qwords at byte offset o from/to the buffer
 * in operand %1. */
#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1), "#a" \n\t"\
    "movq "#o"+8(%1), "#b" \n\t"\
    "movq "#o"+16(%1), "#c" \n\t"\
    "movq "#o"+24(%1), "#d" \n\t"

#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1) \n\t"\
    "movq "#b", "#o"+8(%1) \n\t"\
    "movq "#c", "#o"+16(%1) \n\t"\
    "movq "#d", "#o"+24(%1) \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
/* Horizontal sum of the word lanes of a into GPR dst, clobbering t.
 * MMX version: shift-and-add reduction. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movq "#a", "#t" \n\t"\
    "psrlq $16, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"

/* MMX2 version: pshufw-based reduction. */
#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshufw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"

/* SSE2 version: movhlps + pshuflw reduction over 8 word lanes. */
#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"
/* SATD (sum of absolute transformed differences) of one 8x8 block,
 * MMX flavour parameterized on the MMABS/HSUM implementation in use.
 * Works on two 4x8 halves: each half is Hadamard-transformed
 * vertically, transposed in 4x4 tiles through the temp buffer, then the
 * horizontal transform is applied and |coeffs| are summed.
 * Expands to hadamard8_diff_##cpu plus a 16-wide wrapper. */
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_8(uint64_t, temp[13]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    asm volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1) \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7 \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7 \n\t"\
        "movq %%mm0, %%mm6 \n\t"\
\
        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1) \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2 \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0 \n\t"\
        "movq %%mm0, 64(%1) \n\t"\
\
        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1) \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2 \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0 \n\t"\
        "paddusw %%mm1, %%mm0 \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
/* SSE2 flavour of the 8x8 SATD: the whole block fits in xmm0..xmm7,
 * so it is one transform, one transpose, one transform, then |sum|.
 * The second HADAMARD8's register order compensates for the
 * permutation TRANSPOSE8 leaves behind. */
#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED_16(uint64_t, temp[4]);\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    asm volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
/* Instantiate the hadamard8_diff functions for each CPU level by
 * binding MMABS/HSUM/MMABS_SUM_8x8 to the matching implementation
 * before expanding the template, then unbinding. */
#define MMABS(a,z) MMABS_MMX(a,z)
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
HADAMARD8_DIFF_MMX(mmx)
#undef MMABS
#undef HSUM

#define MMABS(a,z) MMABS_MMX2(a,z)
#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
HADAMARD8_DIFF_MMX(mmx2)
HADAMARD8_DIFF_SSE2(sse2)
#undef MMABS
#undef MMABS_SUM_8x8
#undef HSUM

#ifdef HAVE_SSSE3
#define MMABS(a,z) MMABS_SSSE3(a,z)
#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
HADAMARD8_DIFF_SSE2(ssse3)
#undef MMABS
#undef MMABS_SUM_8x8
#endif
/* Accumulate |coeffs| of four rows of a DCT block (at byte offset o)
 * into registers mm##0/mm##1 using the currently-bound MMABS. */
#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)

/* Sum of |coeffs| of a full 8x8 DCTELEM block, MMX (qword) version. */
#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0 \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0 \n\t"\
    HSUM(%%mm0, %%mm1, %0)

/* SSE2 version: 16 bytes per load, so two DCT_SAD4 groups suffice. */
#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0 \n\t"\
    "pxor %%xmm1, %%xmm1 \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0 \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

/* Template: sum_abs_dctelem_##cpu(block) returns sum of |block[i]|
 * (low 16 bits — the accumulation saturates at 0xFFFF). */
#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    asm volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

/* Bind DCT_SAD/HSUM/MMABS per CPU level and instantiate. */
#define DCT_SAD DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z) MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z) MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#ifdef HAVE_SSSE3
#define MMABS(a,z) MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD
/**
 * Sum of squared differences between an int8 array and an int16 array:
 * sum((pix2[i] - pix1[i])^2) for i in [0,size).
 * size must be a positive multiple of 8 (the loop consumes 8 elements
 * per iteration, counting i down from size to 0).
 */
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    long i=size;
    asm volatile(
        "pxor %%mm4, %%mm4 \n"           /* dword accumulator */
        "1: \n"
        "sub $8, %0 \n"                  /* also sets flags for jg below */
        "movq (%2,%0), %%mm2 \n"         /* 8 int8 values */
        "movq (%3,%0,2), %%mm0 \n"       /* 8 int16 values */
        "movq 8(%3,%0,2), %%mm1 \n"
        /* sign-extend bytes to words: unpack duplicates each byte into
         * the high half, psraw $8 then yields the signed value; mm3's
         * previous contents are fully shifted out, so no init needed. */
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"        /* squares, pair-summed to dwords */
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"                       /* MMX ops don't touch EFLAGS */
        /* fold the two dword lanes */
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}
#endif //CONFIG_ENCODERS

/* For 8-bit put, "no rounding" put is identical to the normal one. */
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
#define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)

/* One output row of the MPEG-4 qpel vertical lowpass filter:
 * (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5, packed to bytes and written
 * with OP (put or avg).  x1..x4 are the symmetric tap sums built from
 * the m3..m6 registers and the in0/in1/in2/in7 memory rows. */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
    "movq "#in7", " #m3 " \n\t" /* d */\
    "movq "#in0", %%mm5 \n\t" /* D */\
    "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
    "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5 \n\t" /* C */\
    "movq "#in2", %%mm6 \n\t" /* B */\
    "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
    "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
    "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5 \n\t"\
    "packuswb %%mm5, %%mm5 \n\t"\
    OP(%%mm5, out, %%mm7, d)
  1733. #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
  1734. static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1735. uint64_t temp;\
  1736. \
  1737. asm volatile(\
  1738. "pxor %%mm7, %%mm7 \n\t"\
  1739. "1: \n\t"\
  1740. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  1741. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  1742. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  1743. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  1744. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  1745. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  1746. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  1747. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  1748. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  1749. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  1750. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  1751. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  1752. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  1753. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  1754. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  1755. "paddw %%mm3, %%mm5 \n\t" /* b */\
  1756. "paddw %%mm2, %%mm6 \n\t" /* c */\
  1757. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  1758. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  1759. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  1760. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  1761. "paddw %%mm4, %%mm0 \n\t" /* a */\
  1762. "paddw %%mm1, %%mm5 \n\t" /* d */\
  1763. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  1764. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  1765. "paddw %6, %%mm6 \n\t"\
  1766. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  1767. "psraw $5, %%mm0 \n\t"\
  1768. "movq %%mm0, %5 \n\t"\
  1769. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  1770. \
  1771. "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
  1772. "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
  1773. "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
  1774. "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
  1775. "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
  1776. "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
  1777. "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
  1778. "paddw %%mm0, %%mm2 \n\t" /* b */\
  1779. "paddw %%mm5, %%mm3 \n\t" /* c */\
  1780. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  1781. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  1782. "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
  1783. "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
  1784. "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
  1785. "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
  1786. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  1787. "paddw %%mm2, %%mm1 \n\t" /* a */\
  1788. "paddw %%mm6, %%mm4 \n\t" /* d */\
  1789. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  1790. "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
  1791. "paddw %6, %%mm1 \n\t"\
  1792. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
  1793. "psraw $5, %%mm3 \n\t"\
  1794. "movq %5, %%mm1 \n\t"\
  1795. "packuswb %%mm3, %%mm1 \n\t"\
  1796. OP_MMX2(%%mm1, (%1),%%mm4, q)\
  1797. /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
  1798. \
  1799. "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
  1800. "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
  1801. "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
  1802. "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
  1803. "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
  1804. "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
  1805. "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
  1806. "paddw %%mm1, %%mm5 \n\t" /* b */\
  1807. "paddw %%mm4, %%mm0 \n\t" /* c */\
  1808. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  1809. "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
  1810. "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
  1811. "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
  1812. "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
  1813. "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
  1814. "paddw %%mm3, %%mm2 \n\t" /* d */\
  1815. "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
  1816. "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
  1817. "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
  1818. "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
  1819. "paddw %%mm2, %%mm6 \n\t" /* a */\
  1820. "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
  1821. "paddw %6, %%mm0 \n\t"\
  1822. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  1823. "psraw $5, %%mm0 \n\t"\
  1824. /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
  1825. \
  1826. "paddw %%mm5, %%mm3 \n\t" /* a */\
  1827. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
  1828. "paddw %%mm4, %%mm6 \n\t" /* b */\
  1829. "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
  1830. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
  1831. "paddw %%mm1, %%mm4 \n\t" /* c */\
  1832. "paddw %%mm2, %%mm5 \n\t" /* d */\
  1833. "paddw %%mm6, %%mm6 \n\t" /* 2b */\
  1834. "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
  1835. "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
  1836. "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
  1837. "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
  1838. "paddw %6, %%mm4 \n\t"\
  1839. "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
  1840. "psraw $5, %%mm4 \n\t"\
  1841. "packuswb %%mm4, %%mm0 \n\t"\
  1842. OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
  1843. \
  1844. "add %3, %0 \n\t"\
  1845. "add %4, %1 \n\t"\
  1846. "decl %2 \n\t"\
  1847. " jnz 1b \n\t"\
  1848. : "+a"(src), "+c"(dst), "+m"(h)\
  1849. : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  1850. : "memory"\
  1851. );\
  1852. }\
  1853. \
  1854. static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1855. int i;\
  1856. int16_t temp[16];\
  1857. /* quick HACK, XXX FIXME MUST be optimized */\
  1858. for(i=0; i<h; i++)\
  1859. {\
  1860. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  1861. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  1862. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  1863. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  1864. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  1865. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
  1866. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
  1867. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
  1868. temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
  1869. temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
  1870. temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
  1871. temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
  1872. temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
  1873. temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
  1874. temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
  1875. temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
  1876. asm volatile(\
  1877. "movq (%0), %%mm0 \n\t"\
  1878. "movq 8(%0), %%mm1 \n\t"\
  1879. "paddw %2, %%mm0 \n\t"\
  1880. "paddw %2, %%mm1 \n\t"\
  1881. "psraw $5, %%mm0 \n\t"\
  1882. "psraw $5, %%mm1 \n\t"\
  1883. "packuswb %%mm1, %%mm0 \n\t"\
  1884. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  1885. "movq 16(%0), %%mm0 \n\t"\
  1886. "movq 24(%0), %%mm1 \n\t"\
  1887. "paddw %2, %%mm0 \n\t"\
  1888. "paddw %2, %%mm1 \n\t"\
  1889. "psraw $5, %%mm0 \n\t"\
  1890. "psraw $5, %%mm1 \n\t"\
  1891. "packuswb %%mm1, %%mm0 \n\t"\
  1892. OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
  1893. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  1894. : "memory"\
  1895. );\
  1896. dst+=dstStride;\
  1897. src+=srcStride;\
  1898. }\
  1899. }\
  1900. \
  1901. static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1902. uint64_t temp;\
  1903. \
  1904. asm volatile(\
  1905. "pxor %%mm7, %%mm7 \n\t"\
  1906. "1: \n\t"\
  1907. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  1908. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  1909. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  1910. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  1911. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  1912. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  1913. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  1914. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  1915. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  1916. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  1917. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  1918. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  1919. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  1920. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  1921. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  1922. "paddw %%mm3, %%mm5 \n\t" /* b */\
  1923. "paddw %%mm2, %%mm6 \n\t" /* c */\
  1924. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  1925. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  1926. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  1927. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  1928. "paddw %%mm4, %%mm0 \n\t" /* a */\
  1929. "paddw %%mm1, %%mm5 \n\t" /* d */\
  1930. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  1931. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  1932. "paddw %6, %%mm6 \n\t"\
  1933. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  1934. "psraw $5, %%mm0 \n\t"\
  1935. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  1936. \
  1937. "movd 5(%0), %%mm5 \n\t" /* FGHI */\
  1938. "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
  1939. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
  1940. "paddw %%mm5, %%mm1 \n\t" /* a */\
  1941. "paddw %%mm6, %%mm2 \n\t" /* b */\
  1942. "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
  1943. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
  1944. "paddw %%mm6, %%mm3 \n\t" /* c */\
  1945. "paddw %%mm5, %%mm4 \n\t" /* d */\
  1946. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  1947. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  1948. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  1949. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  1950. "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
  1951. "paddw %6, %%mm1 \n\t"\
  1952. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
  1953. "psraw $5, %%mm3 \n\t"\
  1954. "packuswb %%mm3, %%mm0 \n\t"\
  1955. OP_MMX2(%%mm0, (%1), %%mm4, q)\
  1956. \
  1957. "add %3, %0 \n\t"\
  1958. "add %4, %1 \n\t"\
  1959. "decl %2 \n\t"\
  1960. " jnz 1b \n\t"\
  1961. : "+a"(src), "+c"(dst), "+m"(h)\
  1962. : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  1963. : "memory"\
  1964. );\
  1965. }\
  1966. \
  1967. static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1968. int i;\
  1969. int16_t temp[8];\
  1970. /* quick HACK, XXX FIXME MUST be optimized */\
  1971. for(i=0; i<h; i++)\
  1972. {\
  1973. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  1974. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  1975. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  1976. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  1977. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  1978. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
  1979. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
  1980. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
  1981. asm volatile(\
  1982. "movq (%0), %%mm0 \n\t"\
  1983. "movq 8(%0), %%mm1 \n\t"\
  1984. "paddw %2, %%mm0 \n\t"\
  1985. "paddw %2, %%mm1 \n\t"\
  1986. "psraw $5, %%mm0 \n\t"\
  1987. "psraw $5, %%mm1 \n\t"\
  1988. "packuswb %%mm1, %%mm0 \n\t"\
  1989. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  1990. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  1991. :"memory"\
  1992. );\
  1993. dst+=dstStride;\
  1994. src+=srcStride;\
  1995. }\
  1996. }
/*
 * QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) expands into the complete family of
 * MPEG-4 quarter-pel motion-compensation functions (8x8 and 16x16 blocks,
 * all sub-pel positions "mcXY": X = horizontal quarter-pel phase 0..3,
 * Y = vertical phase 0..3) for one CPU flavor.
 *   OPNAME  - generated-name prefix: put_ / avg_ / put_no_rnd_
 *   ROUNDER - 16-bit splat constant added before the final >>5
 *             (ff_pw_16; ff_pw_15 for the no-rounding variants)
 *   RND     - name infix used to select the matching rounded/no-rnd helpers
 *   OP      - store macro: PUT_OP (plain store) or AVG_*_OP (average with dst)
 *   MMX     - CPU-flavor suffix: mmx2 or 3dnow
 */
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
/* 16-wide vertical 6-tap lowpass ((1,-6,20,20,-6,1)/32 down each column), */\
/* reading 17 source rows to produce 16 output rows. */\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];/* 17 rows x four 4-pixel stripes, unpacked to words */\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /* pass 1: unpack each of the 17 byte rows to 16-bit words; the four */\
    /* 4-pixel stripes of a row are stored 17*8 bytes apart so pass 2 can */\
    /* walk one stripe contiguously. */\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq (%0), %%mm1 \n\t"\
        "movq 8(%0), %%mm2 \n\t"\
        "movq 8(%0), %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "movq %%mm0, (%1) \n\t"\
        "movq %%mm1, 17*8(%1) \n\t"\
        "movq %%mm2, 2*17*8(%1) \n\t"\
        "movq %%mm3, 3*17*8(%1) \n\t"\
        "add $8, %1 \n\t"\
        "add %3, %0 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
\
    temp_ptr= temp;\
    count=4;/* one iteration per 4-pixel stripe */\
\
    /* pass 2: run the 6-tap filter down each stripe via QPEL_V_LOW; the */\
    /* repeated offsets near the end (128, 120, 112) replicate the edge */\
    /* rows, matching the C reference's clamping at the block border. */\
    /*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7 \n\t"*/\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq 8(%0), %%mm1 \n\t"\
        "movq 16(%0), %%mm2 \n\t"\
        "movq 24(%0), %%mm3 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "add %4, %1 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "add %4, %1 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "add %4, %1 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "add %4, %1 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "add %4, %1 \n\t" \
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "add $136, %0 \n\t"\
        "add %6, %1 \n\t"/* %6 = 4-14*dstStride: rewind dst to the next stripe */\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
        :"memory"\
    );\
}\
\
/* 8-wide vertical 6-tap lowpass: same two-pass scheme as the 16-wide */\
/* version but with 9 source rows and 2 stripes. */\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*2];/* 9 rows x two 4-pixel stripes of unpacked words */\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /* pass 1: unpack 9 byte rows to words, stripes stored 9*8 bytes apart */\
    /*FIXME unroll */\
    asm volatile(\
        "pxor %%mm7, %%mm7 \n\t"\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq (%0), %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm0 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "movq %%mm0, (%1) \n\t"\
        "movq %%mm1, 9*8(%1) \n\t"\
        "add $8, %1 \n\t"\
        "add %3, %0 \n\t"\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((long)srcStride)\
        : "memory"\
    );\
\
    temp_ptr= temp;\
    count=2;/* one iteration per stripe */\
\
    /* pass 2: vertical filter with edge replication at the bottom */\
    /* (offsets 64, 56, 48 are repeated at the border). */\
    /*FIXME reorder for speed */\
    asm volatile(\
        /*"pxor %%mm7, %%mm7 \n\t"*/\
        "1: \n\t"\
        "movq (%0), %%mm0 \n\t"\
        "movq 8(%0), %%mm1 \n\t"\
        "movq 16(%0), %%mm2 \n\t"\
        "movq 24(%0), %%mm3 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1 \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "add %4, %1 \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
        \
        "add $72, %0 \n\t"\
        "add %6, %1 \n\t"/* %6 = 4-6*dstStride: rewind dst to the next stripe */\
        "decl %2 \n\t"\
        " jnz 1b \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
        : "memory"\
    );\
}\
\
/* ---- 8x8 sub-pel positions ---- */\
/* mc00: integer-pel position, plain pixel copy/average */\
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_mmx(dst, src, stride, 8);\
}\
\
/* mc10: 1/4-pel horizontal — average of src and the horizontal lowpass */\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
/* mc20: 1/2-pel horizontal — the lowpass output directly */\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
}\
\
/* mc30: 3/4-pel horizontal — average of src+1 and the lowpass */\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
}\
\
/* mc01/mc02/mc03: vertical-only counterparts of mc10/mc20/mc30 */\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
}\
/* diagonal positions: halfH = horizontal lowpass (9 rows) merged with the */\
/* shifted source, halfHV = vertical lowpass of halfH; result averages the */\
/* two intermediates. */\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
/* ---- 16x16 sub-pel positions: same structure as the 8x8 set above, */\
/* with 17-row intermediates. ---- */\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_mmx(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}
/* Store macro for "put" variants: plain mov of register a into memory b
 * (temp is unused; size selects the mov suffix, e.g. q for movq). */
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* Store macro for "avg" variants on 3DNow!: load dst into temp, average
 * (rounding up) with pavgusb, store back. */
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
/* Store macro for "avg" variants on MMX2/SSE-integer: same as the 3DNow!
 * version but using the pavgb instruction. */
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
/* Instantiate the shared horizontal-lowpass cores (QPEL_BASE) and then the
 * full qpel function families (QPEL_OP) for each of put / avg /
 * put_no_rnd on both the 3DNow! and MMX2 paths. ff_pw_15 instead of
 * ff_pw_16 gives the no-rounding behavior before the >>5. */
QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
  2377. /***********************************/
  2378. /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
/* Generate one 2-tap qpel position as a direct alias of an existing
 * half-pel routine (HPEL names the pixels## suffix, e.g. _x2_mmx2). */
#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
}
/* Generate one 2-tap qpel position as a 3-source blend: delegates to the
 * _l3_ helper with base offset S0 and the two extra tap offsets S1, S2. */
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
}
/* Expand the whole 2-tap ("fast, non-spec-compliant") qpel table for one
 * block size and CPU flavor: half-pel aliases for the axis positions,
 * function-pointer aliases where two positions coincide, two shifted
 * half-pel wrappers, and L3 blends for the remaining positions. */
#define QPEL_2TAP(OPNAME, SIZE, MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
}\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
}\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\

/* Instantiate the 2-tap qpel tables: put and avg, 16x16 and 8x8, for the
 * MMX2 and 3DNow! code paths. */
QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_, 8, mmx2)
QPEL_2TAP(avg_, 8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_, 8, 3dnow)
QPEL_2TAP(avg_, 8, 3dnow)
#if 0
/* Dead stub, compiled out. NOTE(review): presumably a leftover debugging
 * aid (e.g. for stubbing out function pointers) — consider removing. */
static void just_return() { return; }
#endif
/* Wire the put / put_no_rnd / avg variants of one qpel routine (postfix2)
 * into the corresponding DSP-context slots (postfix1) in one shot. */
#define SET_QPEL_FUNC(postfix1, postfix2) \
c->put_ ## postfix1 = put_ ## postfix2;\
c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
c->avg_ ## postfix1 = avg_ ## postfix2;
/* Global motion compensation of one 8x8 block, MMX version.
 * (ox,oy) is the start position and (dxx,dxy,dyx,dyy) are the per-pixel
 * increments of an affine transform, all with 16+shift fractional bits;
 * r is the rounding constant.  Each output pixel is a bilinear blend of
 * the four surrounding fullpel source samples.  Falls back to ff_gmc_c()
 * when the fullpel offset is not constant over the block, or when more
 * than 16 bits of subpel precision would be needed. */
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){
    const int w = 8;                 // block width is hard-coded to 8
    const int ix = ox>>(16+shift);   // integer part of the start position
    const int iy = oy>>(16+shift);
    const int oxs = ox>>4;           // positions/increments reduced to 12 fractional bits
    const int oys = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4] = {r,r,r,r};                // rounding constant, replicated
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};  // per-row x increment, replicated
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};  // per-row y increment, replicated
    const uint64_t shift2 = 2*shift; // blend multiplies two shift-scaled weights
    uint8_t edge_buf[(h+1)*stride];  // VLA used when the block overlaps the picture edge
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    if( // non-constant fullpel offset (3% of blocks)
        (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) |
         oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    // if the (w+1)x(h+1) source footprint leaves the picture, read it
    // through an emulated-edge copy instead
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
    {
        ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }

    // mm6 = (1<<shift) broadcast to all four words, mm7 = 0 (for unpacking)
    asm volatile(
        "movd %0, %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1<<shift)
    );

    for(x=0; x<w; x+=4){
        // subpel x/y positions of this group of 4 columns, pre-decremented
        // by one row step so the per-row add below yields row 0 first
        uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
                            oxs - dxys + dxxs*(x+1),
                            oxs - dxys + dxxs*(x+2),
                            oxs - dxys + dxxs*(x+3) };
        uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
                            oys - dyys + dyxs*(x+1),
                            oys - dyys + dyxs*(x+2),
                            oys - dyys + dyxs*(x+3) };

        for(y=0; y<h; y++){
            // step the subpel positions one row, store them back, and keep
            // only the fractional weights: mm4 = dx, mm5 = dy
            asm volatile(
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );

            // bilinear blend of the 4 neighbouring samples with the weights
            // dx,dy and their complements, add rounding, shift, pack, store
            asm volatile(
                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx*dy
                "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
                "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
                "movd %4, %%mm5 \n\t"
                "movd %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
                "movd %2, %%mm5 \n\t"
                "movd %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
                "paddw %5, %%mm1 \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"
                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"
                : "=m"(dst[x+y*stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride+1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4-h*stride; // back to row 0, advance to the next 4 columns
    }
}
#ifdef CONFIG_ENCODERS

/* Horizontal add of the two dwords in an MMX register: a += a>>32
 * (t is a scratch register). */
#define PHADDD(a, t)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddd "#t", "#a" \n\t"

/*
   pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/

/* The template dsputil_mmx_qns.h is included three times below, each time
 * with a different rounding-multiply primitive (PMULHRW), name suffix (DEF),
 * rounder setup (SET_RND) and scale bias (SCALE_OFFSET) — plain MMX first. */
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x " \n\t"\
    "pmulhw " #s ", "#y " \n\t"\
    "paddw " #o ", "#x " \n\t"\
    "paddw " #o ", "#y " \n\t"\
    "psraw $1, "#x " \n\t"\
    "psraw $1, "#y " \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

/* 3DNow! variant: pmulhrw rounds in hardware, so no explicit add/shift. */
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0

#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x " \n\t"\
    "pmulhrw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#ifdef HAVE_SSSE3
#undef PHADDD
/* SSSE3 variant: pmulhrsw keeps one more bit, hence SCALE_OFFSET -1. */
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x " \n\t"\
    "pmulhrsw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns.h"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3
#endif /* CONFIG_ENCODERS */
/* Generate a prefetch helper: issues one "op" prefetch per row of an
 * h-row region starting at mem, stepping by stride bytes. */
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        asm volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}
PREFETCH(prefetch_mmx2, prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH
  2599. #include "h264dsp_mmx.c"
/* AVS specific */
void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);

/* CAVS mc00 (no subpel offset) is a plain copy/average, so these wrappers
 * just reuse the generic MMX pixel routines at the matching block size. */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}
/* VC1 specific */
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx);

/* VC-1 mspel mc00 is a plain 8x8 copy; rnd is unused in this case. */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}
  2619. /* external functions, from idct_mmx.c */
  2620. void ff_mmx_idct(DCTELEM *block);
  2621. void ff_mmxext_idct(DCTELEM *block);
  2622. /* XXX: those functions should be suppressed ASAP when all IDCTs are
  2623. converted */
#ifdef CONFIG_GPL
/* Wrappers combining the libmpeg2 MMX/MMXEXT IDCTs with the MMX
 * clamped put/add pixel routines, matching the idct_put/idct_add API. */
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
/* Wrappers combining the Xvid MMX/MMX2 IDCTs with the MMX clamped
 * put/add pixel routines, matching the idct_put/idct_add API. */
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    add_pixels_clamped_mmx(block, dest, line_size);
}
/* Vorbis inverse channel coupling, 3DNow! version, 2 floats per iteration.
 * Converts (magnitude, angle) pairs back to the two channel spectra in
 * place; see the per-line comments for the branchless sign logic.
 * Assumes blocksize is a multiple of 2 — TODO confirm against callers. */
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    asm volatile("pxor %%mm7, %%mm7":);  // mm7 = 0.0f constant
    for(i=0; i<blocksize; i+=2) {
        asm volatile(
            "movq %0, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld $31, %%mm2 \n\t" // keep only the sign bit
            "pxor %%mm2, %%mm1 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "pand %%mm1, %%mm3 \n\t"
            "pandn %%mm1, %%mm4 \n\t"
            "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq %%mm3, %1 \n\t"
            "movq %%mm0, %0 \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    asm volatile("femms");  // leave 3DNow!/MMX state
}
/* Vorbis inverse channel coupling, SSE version, 4 floats per iteration.
 * Same algorithm as the 3DNow! variant; ff_pdw_80000000 supplies the
 * sign-bit mask.  Assumes blocksize is a multiple of 4 and that mag/ang
 * are 16-byte aligned (movaps) — TODO confirm against callers. */
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    asm volatile(
        "movaps %0, %%xmm5 \n\t"
        ::"m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        asm volatile(
            "movaps %0, %%xmm0 \n\t"
            "movaps %1, %%xmm1 \n\t"
            "xorps %%xmm2, %%xmm2 \n\t"
            "xorps %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps %%xmm2, %%xmm1 \n\t"
            "movaps %%xmm3, %%xmm4 \n\t"
            "andps %%xmm1, %%xmm3 \n\t"
            "andnps %%xmm1, %%xmm4 \n\t"
            "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movaps %%xmm3, %1 \n\t"
            "movaps %%xmm0, %0 \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}
  2722. #ifdef CONFIG_ENCODERS
/* Multiply len int32 samples by a Welch window w(n) = 1 - (c*n - 1)^2 and
 * write the products as doubles to w_data, SSE2 version.  Processes the
 * window symmetrically from both ends (i walks forward, j backward);
 * xmm7 holds the current window argument, xmm6 = 1.0, xmm5 = 2.0*c step. */
static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
{
    double c = 2.0 / (len-1.0);
    int n2 = len>>1;
    long i = -n2*sizeof(int32_t);   // byte offset, counts up to 0
    long j =  n2*sizeof(int32_t);   // byte offset, counts down
    asm volatile(
        "movsd %0, %%xmm7 \n\t"
        "movapd %1, %%xmm6 \n\t"
        "movapd %2, %%xmm5 \n\t"
        "movlhps %%xmm7, %%xmm7 \n\t"
        "subpd %%xmm5, %%xmm7 \n\t"
        "addsd %%xmm6, %%xmm7 \n\t"
        ::"m"(c), "m"(*ff_pd_1), "m"(*ff_pd_2)
    );
/* Main loop, parameterized on the store for the backward half: movapd when
 * len is even (aligned), movupd when odd. */
#define WELCH(MOVPD)\
    asm volatile(\
        "1: \n\t"\
        "movapd %%xmm7, %%xmm1 \n\t"\
        "mulpd %%xmm1, %%xmm1 \n\t"\
        "movapd %%xmm6, %%xmm0 \n\t"\
        "subpd %%xmm1, %%xmm0 \n\t"\
        "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
        "cvtpi2pd (%4,%0), %%xmm2 \n\t"\
        "cvtpi2pd (%5,%1), %%xmm3 \n\t"\
        "mulpd %%xmm0, %%xmm2 \n\t"\
        "mulpd %%xmm1, %%xmm3 \n\t"\
        "movapd %%xmm2, (%2,%0,2) \n\t"\
        MOVPD" %%xmm3, (%3,%1,2) \n\t"\
        "subpd %%xmm5, %%xmm7 \n\t"\
        "sub $8, %1 \n\t"\
        "add $8, %0 \n\t"\
        "jl 1b \n\t"\
        :"+&r"(i), "+&r"(j)\
        :"r"(w_data+n2), "r"(w_data+len-2-n2),\
         "r"(data+n2), "r"(data+len-2-n2)\
    );
    if(len&1)
        WELCH("movupd")
    else
        WELCH("movapd")
#undef WELCH
}
/* FLAC autocorrelation, SSE2 version: autoc[j] = sum data[i]*data[i+j]
 * for j = 0..lag, computed on Welch-windowed doubles.  Two (or three, on
 * the last iteration when lag is even) lags are accumulated per pass; the
 * data1 buffer is zero-padded by lag entries in front and one behind so
 * the vector loads never read garbage. */
static void flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                       double *autoc)
{
    double tmp[len + lag + 2];
    double *data1 = tmp + lag;
    int j;

    // align data1 to 16 bytes for movapd (doubles are 8-aligned on the stack)
    if((long)data1 & 15)
        data1++;

    apply_welch_window_sse2(data, len, data1);

    for(j=0; j<lag; j++)
        data1[j-lag]= 0.0;
    data1[len] = 0.0;

    for(j=0; j<lag; j+=2){
        long i = -len*sizeof(double);  // byte offset, counts up to 0
        if(j == lag-2) {
            // final pass: accumulate lags j, j+1 and j+2 at once
            asm volatile(
                "movsd %6, %%xmm0 \n\t"
                "movsd %6, %%xmm1 \n\t"
                "movsd %6, %%xmm2 \n\t"
                "1: \n\t"
                "movapd (%4,%0), %%xmm3 \n\t"
                "movupd -8(%5,%0), %%xmm4 \n\t"
                "movapd (%5,%0), %%xmm5 \n\t"
                "mulpd %%xmm3, %%xmm4 \n\t"
                "mulpd %%xmm3, %%xmm5 \n\t"
                "mulpd -16(%5,%0), %%xmm3 \n\t"
                "addpd %%xmm4, %%xmm1 \n\t"
                "addpd %%xmm5, %%xmm0 \n\t"
                "addpd %%xmm3, %%xmm2 \n\t"
                "add $16, %0 \n\t"
                "jl 1b \n\t"
                // horizontal sums of the three accumulators
                "movhlps %%xmm0, %%xmm3 \n\t"
                "movhlps %%xmm1, %%xmm4 \n\t"
                "movhlps %%xmm2, %%xmm5 \n\t"
                "addsd %%xmm3, %%xmm0 \n\t"
                "addsd %%xmm4, %%xmm1 \n\t"
                "addsd %%xmm5, %%xmm2 \n\t"
                "movsd %%xmm0, %1 \n\t"
                "movsd %%xmm1, %2 \n\t"
                "movsd %%xmm2, %3 \n\t"
                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]), "=m"(autoc[j+2])
                :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
            );
        } else {
            // accumulate lags j and j+1
            asm volatile(
                "movsd %5, %%xmm0 \n\t"
                "movsd %5, %%xmm1 \n\t"
                "1: \n\t"
                "movapd (%3,%0), %%xmm3 \n\t"
                "movupd -8(%4,%0), %%xmm4 \n\t"
                "mulpd %%xmm3, %%xmm4 \n\t"
                "mulpd (%4,%0), %%xmm3 \n\t"
                "addpd %%xmm4, %%xmm1 \n\t"
                "addpd %%xmm3, %%xmm0 \n\t"
                "add $16, %0 \n\t"
                "jl 1b \n\t"
                "movhlps %%xmm0, %%xmm3 \n\t"
                "movhlps %%xmm1, %%xmm4 \n\t"
                "addsd %%xmm3, %%xmm0 \n\t"
                "addsd %%xmm4, %%xmm1 \n\t"
                "movsd %%xmm0, %1 \n\t"
                "movsd %%xmm1, %2 \n\t"
                :"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
                :"r"(data1+len), "r"(data1+len-j), "m"(*ff_pd_1)
            );
        }
    }
}
  2834. #endif // CONFIG_ENCODERS
/* dst[i] *= src[i] for i in [0,len), 3DNow! version, 4 floats per pass.
 * Assumes len is a multiple of 4 — TODO confirm against callers. */
static void vector_fmul_3dnow(float *dst, const float *src, int len){
    long i = (len-4)*4;   // byte offset of the last 4-float group, counts down
    asm volatile(
        "1: \n\t"
        "movq (%1,%0), %%mm0 \n\t"
        "movq 8(%1,%0), %%mm1 \n\t"
        "pfmul (%2,%0), %%mm0 \n\t"
        "pfmul 8(%2,%0), %%mm1 \n\t"
        "movq %%mm0, (%1,%0) \n\t"
        "movq %%mm1, 8(%1,%0) \n\t"
        "sub $16, %0 \n\t"
        "jge 1b \n\t"
        "femms \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
/* dst[i] *= src[i] for i in [0,len), SSE version, 8 floats per pass.
 * Assumes len is a multiple of 8 and both pointers are 16-byte aligned
 * (movaps) — TODO confirm against callers. */
static void vector_fmul_sse(float *dst, const float *src, int len){
    long i = (len-8)*4;   // byte offset of the last 8-float group, counts down
    asm volatile(
        "1: \n\t"
        "movaps (%1,%0), %%xmm0 \n\t"
        "movaps 16(%1,%0), %%xmm1 \n\t"
        "mulps (%2,%0), %%xmm0 \n\t"
        "mulps 16(%2,%0), %%xmm1 \n\t"
        "movaps %%xmm0, (%1,%0) \n\t"
        "movaps %%xmm1, 16(%1,%0) \n\t"
        "sub $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src)
        :"memory"
    );
}
/* dst[i] = src0[i] * src1[len-1-i], 3DNow!Ext version (pswapd reverses the
 * pair order in src1).  Assumes len is a multiple of 4 — TODO confirm.
 * NOTE(review): no "memory" clobber on the main asm — presumably safe here
 * because dst is only read after the call, but worth confirming. */
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-16;    // byte offset into dst/src0, counts down
    asm volatile(
        "1: \n\t"
        "pswapd 8(%1), %%mm0 \n\t"
        "pswapd (%1), %%mm1 \n\t"
        "pfmul (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq %%mm0, (%2,%0) \n\t"
        "movq %%mm1, 8(%2,%0) \n\t"
        "add $16, %1 \n\t"
        "sub $16, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
    asm volatile("femms");
}
/* dst[i] = src0[i] * src1[len-1-i], SSE version (shufps $0x1b reverses the
 * four floats of each load).  Assumes len is a multiple of 8 and 16-byte
 * aligned pointers — TODO confirm.
 * NOTE(review): no "memory" clobber on the asm — see 3dnow2 variant. */
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    long i = len*4-32;    // byte offset into dst/src0, counts down
    asm volatile(
        "1: \n\t"
        "movaps 16(%1), %%xmm0 \n\t"
        "movaps (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps (%3,%0), %%xmm0 \n\t"
        "mulps 16(%3,%0), %%xmm1 \n\t"
        "movaps %%xmm0, (%2,%0) \n\t"
        "movaps %%xmm1, 16(%2,%0) \n\t"
        "add $32, %1 \n\t"
        "sub $32, %0 \n\t"
        "jge 1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
    );
}
/* dst[i*step] = src0[i]*src1[i] + src2[i] + src3, 3DNow! version.
 * Only the two common cases (step==2 or step==1, src3==0) are vectorized;
 * everything else falls back to the C implementation. */
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
                                      const float *src2, int src3, int len, int step){
    long i = (len-4)*4;   // byte offset of the last 4-float group, counts down
    if(step == 2 && src3 == 0){
        // interleaved output: results are scattered with movd/psrlq
        dst += (len-4)*2;
        asm volatile(
            "1: \n\t"
            "movq (%2,%0), %%mm0 \n\t"
            "movq 8(%2,%0), %%mm1 \n\t"
            "pfmul (%3,%0), %%mm0 \n\t"
            "pfmul 8(%3,%0), %%mm1 \n\t"
            "pfadd (%4,%0), %%mm0 \n\t"
            "pfadd 8(%4,%0), %%mm1 \n\t"
            "movd %%mm0, (%1) \n\t"
            "movd %%mm1, 16(%1) \n\t"
            "psrlq $32, %%mm0 \n\t"
            "psrlq $32, %%mm1 \n\t"
            "movd %%mm0, 8(%1) \n\t"
            "movd %%mm1, 24(%1) \n\t"
            "sub $32, %1 \n\t"
            "sub $16, %0 \n\t"
            "jge 1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        // contiguous output
        asm volatile(
            "1: \n\t"
            "movq (%2,%0), %%mm0 \n\t"
            "movq 8(%2,%0), %%mm1 \n\t"
            "pfmul (%3,%0), %%mm0 \n\t"
            "pfmul 8(%3,%0), %%mm1 \n\t"
            "pfadd (%4,%0), %%mm0 \n\t"
            "pfadd 8(%4,%0), %%mm1 \n\t"
            "movq %%mm0, (%1,%0) \n\t"
            "movq %%mm1, 8(%1,%0) \n\t"
            "sub $16, %0 \n\t"
            "jge 1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
    asm volatile("femms");  // executed on all paths; harmless after the C fallback
}
/* dst[i*step] = src0[i]*src1[i] + src2[i] + src3, SSE version.
 * Only the two common cases (step==2 or step==1, src3==0) are vectorized;
 * everything else falls back to the C implementation.  Assumes 16-byte
 * aligned inputs and len a multiple of 8 — TODO confirm against callers. */
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
                                    const float *src2, int src3, int len, int step){
    long i = (len-8)*4;   // byte offset of the last 8-float group, counts down
    if(step == 2 && src3 == 0){
        // interleaved output: each lane is extracted and stored separately
        dst += (len-8)*2;
        asm volatile(
            "1: \n\t"
            "movaps (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps (%3,%0), %%xmm0 \n\t"
            "mulps 16(%3,%0), %%xmm1 \n\t"
            "addps (%4,%0), %%xmm0 \n\t"
            "addps 16(%4,%0), %%xmm1 \n\t"
            "movss %%xmm0, (%1) \n\t"
            "movss %%xmm1, 32(%1) \n\t"
            "movhlps %%xmm0, %%xmm2 \n\t"
            "movhlps %%xmm1, %%xmm3 \n\t"
            "movss %%xmm2, 16(%1) \n\t"
            "movss %%xmm3, 48(%1) \n\t"
            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
            "movss %%xmm0, 8(%1) \n\t"
            "movss %%xmm1, 40(%1) \n\t"
            "movhlps %%xmm0, %%xmm2 \n\t"
            "movhlps %%xmm1, %%xmm3 \n\t"
            "movss %%xmm2, 24(%1) \n\t"
            "movss %%xmm3, 56(%1) \n\t"
            "sub $64, %1 \n\t"
            "sub $32, %0 \n\t"
            "jge 1b \n\t"
            :"+r"(i), "+r"(dst)
            :"r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else if(step == 1 && src3 == 0){
        // contiguous output
        asm volatile(
            "1: \n\t"
            "movaps (%2,%0), %%xmm0 \n\t"
            "movaps 16(%2,%0), %%xmm1 \n\t"
            "mulps (%3,%0), %%xmm0 \n\t"
            "mulps 16(%3,%0), %%xmm1 \n\t"
            "addps (%4,%0), %%xmm0 \n\t"
            "addps 16(%4,%0), %%xmm1 \n\t"
            "movaps %%xmm0, (%1,%0) \n\t"
            "movaps %%xmm1, 16(%1,%0) \n\t"
            "sub $32, %0 \n\t"
            "jge 1b \n\t"
            :"+r"(i)
            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
            :"memory"
        );
    }
    else
        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
}
/* Convert len floats to signed 16-bit with saturation, 3DNow! version,
 * 4 samples per iteration.  Assumes len is a multiple of 4 — TODO confirm. */
static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
    // not bit-exact: pf2id uses different rounding than C and SSE
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "pf2id %1, %%mm0 \n\t"
            "pf2id %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // saturate 4 int32 to 4 int16
            "movq %%mm0, %0 \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("femms");  // leave 3DNow!/MMX state
}
/* Convert len floats to signed 16-bit with saturation, SSE version,
 * 4 samples per iteration (cvtps2pi targets MMX registers, hence the
 * final emms).  Assumes len is a multiple of 4 — TODO confirm. */
static void float_to_int16_sse(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i+=4) {
        asm volatile(
            "cvtps2pi %1, %%mm0 \n\t"
            "cvtps2pi %2, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // saturate 4 int32 to 4 int16
            "movq %%mm0, %0 \n\t"
            :"=m"(dst[i])
            :"m"(src[i]), "m"(src[i+2])
        );
    }
    asm volatile("emms");  // leave MMX state
}
  3041. extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
  3042. extern void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width);
  3043. extern void ff_snow_vertical_compose97i_sse2(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
  3044. extern void ff_snow_vertical_compose97i_mmx(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width);
  3045. extern void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
  3046. int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
  3047. extern void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
  3048. int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
  3049. void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  3050. {
  3051. mm_flags = mm_support();
  3052. if (avctx->dsp_mask) {
  3053. if (avctx->dsp_mask & FF_MM_FORCE)
  3054. mm_flags |= (avctx->dsp_mask & 0xffff);
  3055. else
  3056. mm_flags &= ~(avctx->dsp_mask & 0xffff);
  3057. }
  3058. #if 0
  3059. av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
  3060. if (mm_flags & MM_MMX)
  3061. av_log(avctx, AV_LOG_INFO, " mmx");
  3062. if (mm_flags & MM_MMXEXT)
  3063. av_log(avctx, AV_LOG_INFO, " mmxext");
  3064. if (mm_flags & MM_3DNOW)
  3065. av_log(avctx, AV_LOG_INFO, " 3dnow");
  3066. if (mm_flags & MM_SSE)
  3067. av_log(avctx, AV_LOG_INFO, " sse");
  3068. if (mm_flags & MM_SSE2)
  3069. av_log(avctx, AV_LOG_INFO, " sse2");
  3070. av_log(avctx, AV_LOG_INFO, "\n");
  3071. #endif
  3072. if (mm_flags & MM_MMX) {
  3073. const int idct_algo= avctx->idct_algo;
  3074. #ifdef CONFIG_ENCODERS
  3075. const int dct_algo = avctx->dct_algo;
  3076. if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
  3077. if(mm_flags & MM_SSE2){
  3078. c->fdct = ff_fdct_sse2;
  3079. }else if(mm_flags & MM_MMXEXT){
  3080. c->fdct = ff_fdct_mmx2;
  3081. }else{
  3082. c->fdct = ff_fdct_mmx;
  3083. }
  3084. }
  3085. #endif //CONFIG_ENCODERS
  3086. if(avctx->lowres==0){
  3087. if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
  3088. c->idct_put= ff_simple_idct_put_mmx;
  3089. c->idct_add= ff_simple_idct_add_mmx;
  3090. c->idct = ff_simple_idct_mmx;
  3091. c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
  3092. #ifdef CONFIG_GPL
  3093. }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
  3094. if(mm_flags & MM_MMXEXT){
  3095. c->idct_put= ff_libmpeg2mmx2_idct_put;
  3096. c->idct_add= ff_libmpeg2mmx2_idct_add;
  3097. c->idct = ff_mmxext_idct;
  3098. }else{
  3099. c->idct_put= ff_libmpeg2mmx_idct_put;
  3100. c->idct_add= ff_libmpeg2mmx_idct_add;
  3101. c->idct = ff_mmx_idct;
  3102. }
  3103. c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
  3104. #endif
  3105. }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) &&
  3106. idct_algo==FF_IDCT_VP3 &&
  3107. avctx->codec->id!=CODEC_ID_THEORA &&
  3108. !(avctx->flags & CODEC_FLAG_BITEXACT)){
  3109. if(mm_flags & MM_SSE2){
  3110. c->idct_put= ff_vp3_idct_put_sse2;
  3111. c->idct_add= ff_vp3_idct_add_sse2;
  3112. c->idct = ff_vp3_idct_sse2;
  3113. c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
  3114. }else{
  3115. ff_vp3_dsp_init_mmx();
  3116. c->idct_put= ff_vp3_idct_put_mmx;
  3117. c->idct_add= ff_vp3_idct_add_mmx;
  3118. c->idct = ff_vp3_idct_mmx;
  3119. c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
  3120. }
  3121. }else if(idct_algo==FF_IDCT_CAVS){
  3122. c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
  3123. }else if(idct_algo==FF_IDCT_XVIDMMX){
  3124. if(mm_flags & MM_MMXEXT){
  3125. c->idct_put= ff_idct_xvid_mmx2_put;
  3126. c->idct_add= ff_idct_xvid_mmx2_add;
  3127. c->idct = ff_idct_xvid_mmx2;
  3128. }else{
  3129. c->idct_put= ff_idct_xvid_mmx_put;
  3130. c->idct_add= ff_idct_xvid_mmx_add;
  3131. c->idct = ff_idct_xvid_mmx;
  3132. }
  3133. }
  3134. }
  3135. #ifdef CONFIG_ENCODERS
  3136. c->get_pixels = get_pixels_mmx;
  3137. c->diff_pixels = diff_pixels_mmx;
  3138. #endif //CONFIG_ENCODERS
  3139. c->put_pixels_clamped = put_pixels_clamped_mmx;
  3140. c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
  3141. c->add_pixels_clamped = add_pixels_clamped_mmx;
  3142. c->clear_blocks = clear_blocks_mmx;
  3143. #ifdef CONFIG_ENCODERS
  3144. c->pix_sum = pix_sum16_mmx;
  3145. #endif //CONFIG_ENCODERS
  3146. c->put_pixels_tab[0][0] = put_pixels16_mmx;
  3147. c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
  3148. c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
  3149. c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
  3150. c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
  3151. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
  3152. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
  3153. c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
  3154. c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
  3155. c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
  3156. c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
  3157. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
  3158. c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
  3159. c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
  3160. c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
  3161. c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
  3162. c->put_pixels_tab[1][0] = put_pixels8_mmx;
  3163. c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
  3164. c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
  3165. c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
  3166. c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
  3167. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
  3168. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
  3169. c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
  3170. c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
  3171. c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
  3172. c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
  3173. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
  3174. c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
  3175. c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
  3176. c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
  3177. c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
  3178. c->gmc= gmc_mmx;
  3179. c->add_bytes= add_bytes_mmx;
  3180. #ifdef CONFIG_ENCODERS
  3181. c->diff_bytes= diff_bytes_mmx;
  3182. c->sum_abs_dctelem= sum_abs_dctelem_mmx;
  3183. c->hadamard8_diff[0]= hadamard8_diff16_mmx;
  3184. c->hadamard8_diff[1]= hadamard8_diff_mmx;
  3185. c->pix_norm1 = pix_norm1_mmx;
  3186. c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
  3187. c->sse[1] = sse8_mmx;
  3188. c->vsad[4]= vsad_intra16_mmx;
  3189. c->nsse[0] = nsse16_mmx;
  3190. c->nsse[1] = nsse8_mmx;
  3191. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  3192. c->vsad[0] = vsad16_mmx;
  3193. }
  3194. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  3195. c->try_8x8basis= try_8x8basis_mmx;
  3196. }
  3197. c->add_8x8basis= add_8x8basis_mmx;
  3198. c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
  3199. #endif //CONFIG_ENCODERS
  3200. if (ENABLE_ANY_H263) {
  3201. c->h263_v_loop_filter= h263_v_loop_filter_mmx;
  3202. c->h263_h_loop_filter= h263_h_loop_filter_mmx;
  3203. }
  3204. c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
  3205. c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
  3206. c->h264_idct_dc_add=
  3207. c->h264_idct_add= ff_h264_idct_add_mmx;
  3208. c->h264_idct8_dc_add=
  3209. c->h264_idct8_add= ff_h264_idct8_add_mmx;
  3210. if (mm_flags & MM_MMXEXT) {
  3211. c->prefetch = prefetch_mmx2;
  3212. c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
  3213. c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
  3214. c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
  3215. c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
  3216. c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
  3217. c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
  3218. c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
  3219. c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
  3220. c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
  3221. c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
  3222. #ifdef CONFIG_ENCODERS
  3223. c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
  3224. c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
  3225. c->hadamard8_diff[1]= hadamard8_diff_mmx2;
  3226. c->vsad[4]= vsad_intra16_mmx2;
  3227. #endif //CONFIG_ENCODERS
  3228. c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
  3229. c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;
  3230. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  3231. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
  3232. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
  3233. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
  3234. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
  3235. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
  3236. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
  3237. #ifdef CONFIG_ENCODERS
  3238. c->vsad[0] = vsad16_mmx2;
  3239. #endif //CONFIG_ENCODERS
  3240. }
  3241. #if 1
  3242. SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
  3243. SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
  3244. SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
  3245. SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
  3246. SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
  3247. SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
  3248. SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
  3249. SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
  3250. SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
  3251. SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
  3252. SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
  3253. SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
  3254. SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
  3255. SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
  3256. SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
  3257. SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
  3258. SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
  3259. SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
  3260. SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
  3261. SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
  3262. SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
  3263. SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
  3264. SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
  3265. SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
  3266. SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
  3267. SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
  3268. SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
  3269. SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
  3270. SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
  3271. SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
  3272. SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
  3273. SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
  3274. #endif
  3275. //FIXME 3dnow too
  3276. #define dspfunc(PFX, IDX, NUM) \
  3277. c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
  3278. c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
  3279. c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
  3280. c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
  3281. c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
  3282. c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
  3283. c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
  3284. c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
  3285. c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
  3286. c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
  3287. c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
  3288. c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
  3289. c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
  3290. c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
  3291. c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
  3292. c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2
  3293. dspfunc(put_h264_qpel, 0, 16);
  3294. dspfunc(put_h264_qpel, 1, 8);
  3295. dspfunc(put_h264_qpel, 2, 4);
  3296. dspfunc(avg_h264_qpel, 0, 16);
  3297. dspfunc(avg_h264_qpel, 1, 8);
  3298. dspfunc(avg_h264_qpel, 2, 4);
  3299. dspfunc(put_2tap_qpel, 0, 16);
  3300. dspfunc(put_2tap_qpel, 1, 8);
  3301. dspfunc(avg_2tap_qpel, 0, 16);
  3302. dspfunc(avg_2tap_qpel, 1, 8);
  3303. #undef dspfunc
  3304. c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
  3305. c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
  3306. c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
  3307. c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2;
  3308. c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
  3309. c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
  3310. c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
  3311. c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
  3312. c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
  3313. c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
  3314. c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
  3315. c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
  3316. c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
  3317. c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
  3318. c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
  3319. c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
  3320. c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
  3321. c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
  3322. c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
  3323. c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
  3324. c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
  3325. c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
  3326. c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
  3327. c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
  3328. c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
  3329. c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
  3330. c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
  3331. if (ENABLE_CAVS_DECODER)
  3332. ff_cavsdsp_init_mmx2(c, avctx);
  3333. if (ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER)
  3334. ff_vc1dsp_init_mmx(c, avctx);
  3335. #ifdef CONFIG_ENCODERS
  3336. c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
  3337. #endif //CONFIG_ENCODERS
  3338. } else if (mm_flags & MM_3DNOW) {
  3339. c->prefetch = prefetch_3dnow;
  3340. c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
  3341. c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
  3342. c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
  3343. c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
  3344. c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
  3345. c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
  3346. c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
  3347. c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
  3348. c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
  3349. c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
  3350. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  3351. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
  3352. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
  3353. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
  3354. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
  3355. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
  3356. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
  3357. }
  3358. SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
  3359. SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
  3360. SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
  3361. SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
  3362. SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
  3363. SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
  3364. SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
  3365. SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
  3366. SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
  3367. SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
  3368. SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
  3369. SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
  3370. SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
  3371. SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
  3372. SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
  3373. SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
  3374. SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
  3375. SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
  3376. SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
  3377. SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
  3378. SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
  3379. SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
  3380. SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
  3381. SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
  3382. SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
  3383. SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
  3384. SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
  3385. SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
  3386. SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
  3387. SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
  3388. SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
  3389. SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
  3390. #define dspfunc(PFX, IDX, NUM) \
  3391. c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
  3392. c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
  3393. c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
  3394. c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
  3395. c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
  3396. c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
  3397. c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
  3398. c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
  3399. c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
  3400. c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
  3401. c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
  3402. c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
  3403. c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
  3404. c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
  3405. c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
  3406. c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow
  3407. dspfunc(put_h264_qpel, 0, 16);
  3408. dspfunc(put_h264_qpel, 1, 8);
  3409. dspfunc(put_h264_qpel, 2, 4);
  3410. dspfunc(avg_h264_qpel, 0, 16);
  3411. dspfunc(avg_h264_qpel, 1, 8);
  3412. dspfunc(avg_h264_qpel, 2, 4);
  3413. dspfunc(put_2tap_qpel, 0, 16);
  3414. dspfunc(put_2tap_qpel, 1, 8);
  3415. dspfunc(avg_2tap_qpel, 0, 16);
  3416. dspfunc(avg_2tap_qpel, 1, 8);
  3417. c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
  3418. c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
  3419. }
  3420. #ifdef CONFIG_ENCODERS
  3421. if(mm_flags & MM_SSE2){
  3422. c->sum_abs_dctelem= sum_abs_dctelem_sse2;
  3423. c->hadamard8_diff[0]= hadamard8_diff16_sse2;
  3424. c->hadamard8_diff[1]= hadamard8_diff_sse2;
  3425. c->flac_compute_autocorr = flac_compute_autocorr_sse2;
  3426. }
  3427. #ifdef HAVE_SSSE3
  3428. if(mm_flags & MM_SSSE3){
  3429. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  3430. c->try_8x8basis= try_8x8basis_ssse3;
  3431. }
  3432. c->add_8x8basis= add_8x8basis_ssse3;
  3433. c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
  3434. c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
  3435. c->hadamard8_diff[1]= hadamard8_diff_ssse3;
  3436. }
  3437. #endif
  3438. #endif
  3439. #ifdef CONFIG_SNOW_DECODER
  3440. if(mm_flags & MM_SSE2 & 0){
  3441. c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
  3442. #ifdef HAVE_7REGS
  3443. c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
  3444. #endif
  3445. c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
  3446. }
  3447. else{
  3448. if(mm_flags & MM_MMXEXT){
  3449. c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
  3450. #ifdef HAVE_7REGS
  3451. c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
  3452. #endif
  3453. }
  3454. c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
  3455. }
  3456. #endif
  3457. if(mm_flags & MM_3DNOW){
  3458. #ifdef CONFIG_ENCODERS
  3459. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  3460. c->try_8x8basis= try_8x8basis_3dnow;
  3461. }
  3462. c->add_8x8basis= add_8x8basis_3dnow;
  3463. #endif //CONFIG_ENCODERS
  3464. c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
  3465. c->vector_fmul = vector_fmul_3dnow;
  3466. if(!(avctx->flags & CODEC_FLAG_BITEXACT))
  3467. c->float_to_int16 = float_to_int16_3dnow;
  3468. }
  3469. if(mm_flags & MM_3DNOWEXT)
  3470. c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
  3471. if(mm_flags & MM_SSE){
  3472. c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
  3473. c->vector_fmul = vector_fmul_sse;
  3474. c->float_to_int16 = float_to_int16_sse;
  3475. c->vector_fmul_reverse = vector_fmul_reverse_sse;
  3476. c->vector_fmul_add_add = vector_fmul_add_add_sse;
  3477. }
  3478. if(mm_flags & MM_3DNOW)
  3479. c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
  3480. }
  3481. #ifdef CONFIG_ENCODERS
  3482. dsputil_init_pix_mmx(c, avctx);
  3483. #endif //CONFIG_ENCODERS
  3484. #if 0
  3485. // for speed testing
  3486. get_pixels = just_return;
  3487. put_pixels_clamped = just_return;
  3488. add_pixels_clamped = just_return;
  3489. pix_abs16x16 = just_return;
  3490. pix_abs16x16_x2 = just_return;
  3491. pix_abs16x16_y2 = just_return;
  3492. pix_abs16x16_xy2 = just_return;
  3493. put_pixels_tab[0] = just_return;
  3494. put_pixels_tab[1] = just_return;
  3495. put_pixels_tab[2] = just_return;
  3496. put_pixels_tab[3] = just_return;
  3497. put_no_rnd_pixels_tab[0] = just_return;
  3498. put_no_rnd_pixels_tab[1] = just_return;
  3499. put_no_rnd_pixels_tab[2] = just_return;
  3500. put_no_rnd_pixels_tab[3] = just_return;
  3501. avg_pixels_tab[0] = just_return;
  3502. avg_pixels_tab[1] = just_return;
  3503. avg_pixels_tab[2] = just_return;
  3504. avg_pixels_tab[3] = just_return;
  3505. avg_no_rnd_pixels_tab[0] = just_return;
  3506. avg_no_rnd_pixels_tab[1] = just_return;
  3507. avg_no_rnd_pixels_tab[2] = just_return;
  3508. avg_no_rnd_pixels_tab[3] = just_return;
  3509. //av_fdct = just_return;
  3510. //ff_idct = just_return;
  3511. #endif
  3512. }