  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Fabrice Bellard.
  4. * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  19. *
  20. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  21. */
  22. #include "../dsputil.h"
  23. #include "../simple_idct.h"
  24. #include "mmx.h"
  25. //#undef NDEBUG
  26. //#include <assert.h>
  27. extern const uint8_t ff_h263_loop_filter_strength[32];
  28. int mm_flags; /* multimedia extension flags */
  29. /* pixel operations */
  30. static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
  31. static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
  32. static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
  33. static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
  34. static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
  35. static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
  36. static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
  37. static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
  38. #define JUMPALIGN() __asm __volatile (".balign 8"::)
  39. #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
  40. #define MOVQ_WONE(regd) \
  41. __asm __volatile ( \
  42. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  43. "psrlw $15, %%" #regd ::)
  44. #define MOVQ_BFE(regd) \
  45. __asm __volatile ( \
  46. "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
  47. "paddb %%" #regd ", %%" #regd " \n\t" ::)
  48. #ifndef PIC
  49. #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
  50. #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
  51. #else
  52. // for a shared library it is better to generate the constants in registers this way
  53. // instead of loading them from memory; pcmpeqd sets the register to -1
  54. #define MOVQ_BONE(regd) \
  55. __asm __volatile ( \
  56. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  57. "psrlw $15, %%" #regd " \n\t" \
  58. "packuswb %%" #regd ", %%" #regd " \n\t" ::)
  59. #define MOVQ_WTWO(regd) \
  60. __asm __volatile ( \
  61. "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
  62. "psrlw $15, %%" #regd " \n\t" \
  63. "psllw $1, %%" #regd " \n\t"::)
  64. #endif
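// Illustrative note (added, not from the original source): the PIC variants
// build the constants entirely in registers instead of loading them from
// memory. For MOVQ_BONE the sequence is:
//   pcmpeqd  regd, regd   ->  0xFFFFFFFFFFFFFFFF
//   psrlw    $15,  regd   ->  0x0001000100010001
//   packuswb regd, regd   ->  0x0101010101010101   (== mm_bone)
// MOVQ_WTWO replaces the packuswb with psllw $1, giving 0x0002000200020002.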
  65. // using regr as temporary and for the output result
  66. // first argument is unmodified and second is trashed
  67. // regfe is supposed to contain 0xfefefefefefefefe
  68. #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
  69. "movq " #rega ", " #regr " \n\t"\
  70. "pand " #regb ", " #regr " \n\t"\
  71. "pxor " #rega ", " #regb " \n\t"\
  72. "pand " #regfe "," #regb " \n\t"\
  73. "psrlq $1, " #regb " \n\t"\
  74. "paddb " #regb ", " #regr " \n\t"
  75. #define PAVGB_MMX(rega, regb, regr, regfe) \
  76. "movq " #rega ", " #regr " \n\t"\
  77. "por " #regb ", " #regr " \n\t"\
  78. "pxor " #rega ", " #regb " \n\t"\
  79. "pand " #regfe "," #regb " \n\t"\
  80. "psrlq $1, " #regb " \n\t"\
  81. "psubb " #regb ", " #regr " \n\t"
  82. // mm6 is supposed to contain 0xfefefefefefefefe
  83. #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
  84. "movq " #rega ", " #regr " \n\t"\
  85. "movq " #regc ", " #regp " \n\t"\
  86. "pand " #regb ", " #regr " \n\t"\
  87. "pand " #regd ", " #regp " \n\t"\
  88. "pxor " #rega ", " #regb " \n\t"\
  89. "pxor " #regc ", " #regd " \n\t"\
  90. "pand %%mm6, " #regb " \n\t"\
  91. "pand %%mm6, " #regd " \n\t"\
  92. "psrlq $1, " #regb " \n\t"\
  93. "psrlq $1, " #regd " \n\t"\
  94. "paddb " #regb ", " #regr " \n\t"\
  95. "paddb " #regd ", " #regp " \n\t"
  96. #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
  97. "movq " #rega ", " #regr " \n\t"\
  98. "movq " #regc ", " #regp " \n\t"\
  99. "por " #regb ", " #regr " \n\t"\
  100. "por " #regd ", " #regp " \n\t"\
  101. "pxor " #rega ", " #regb " \n\t"\
  102. "pxor " #regc ", " #regd " \n\t"\
  103. "pand %%mm6, " #regb " \n\t"\
  104. "pand %%mm6, " #regd " \n\t"\
  105. "psrlq $1, " #regd " \n\t"\
  106. "psrlq $1, " #regb " \n\t"\
  107. "psubb " #regb ", " #regr " \n\t"\
  108. "psubb " #regd ", " #regp " \n\t"
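/* Illustrative note (added, not from the original source): the PAVGB macros
 * rely on the usual carry-free averaging identities for unsigned bytes:
 *   truncating: (a + b) >> 1     == (a & b) + (((a ^ b) & 0xFE) >> 1)
 *   rounding:   (a + b + 1) >> 1 == (a | b) - (((a ^ b) & 0xFE) >> 1)
 * A scalar sketch of the rounding form (hypothetical helper, for reference only):
 *
 *   static inline uint8_t avg_round_u8(uint8_t a, uint8_t b)
 *   {
 *       return (a | b) - (((a ^ b) & 0xFE) >> 1);
 *   }
 *
 * The MMX code applies the same identity to 8 bytes at once; the 0xFE mask
 * (held in regfe / mm6) keeps psrlq from shifting bits across byte lanes.
 */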
  109. /***********************************/
  110. /* MMX no rounding */
  111. #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
  112. #define SET_RND MOVQ_WONE
  113. #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
  114. #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
  115. #include "dsputil_mmx_rnd.h"
  116. #undef DEF
  117. #undef SET_RND
  118. #undef PAVGBP
  119. #undef PAVGB
  120. /***********************************/
  121. /* MMX rounding */
  122. #define DEF(x, y) x ## _ ## y ##_mmx
  123. #define SET_RND MOVQ_WTWO
  124. #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
  125. #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
  126. #include "dsputil_mmx_rnd.h"
  127. #undef DEF
  128. #undef SET_RND
  129. #undef PAVGBP
  130. #undef PAVGB
  131. /***********************************/
  132. /* 3Dnow specific */
  133. #define DEF(x) x ## _3dnow
  134. /* on Athlon, PAVGUSB is preferred */
  135. #define PAVGB "pavgusb"
  136. #include "dsputil_mmx_avg.h"
  137. #undef DEF
  138. #undef PAVGB
  139. /***********************************/
  140. /* MMX2 specific */
  141. #define DEF(x) x ## _mmx2
  142. /* Introduced only in MMX2 set */
  143. #define PAVGB "pavgb"
  144. #include "dsputil_mmx_avg.h"
  145. #undef DEF
  146. #undef PAVGB
  147. /***********************************/
  148. /* standard MMX */
  149. #ifdef CONFIG_ENCODERS
  150. static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
  151. {
  152. asm volatile(
  153. "movl $-128, %%eax \n\t"
  154. "pxor %%mm7, %%mm7 \n\t"
  155. ".balign 16 \n\t"
  156. "1: \n\t"
  157. "movq (%0), %%mm0 \n\t"
  158. "movq (%0, %2), %%mm2 \n\t"
  159. "movq %%mm0, %%mm1 \n\t"
  160. "movq %%mm2, %%mm3 \n\t"
  161. "punpcklbw %%mm7, %%mm0 \n\t"
  162. "punpckhbw %%mm7, %%mm1 \n\t"
  163. "punpcklbw %%mm7, %%mm2 \n\t"
  164. "punpckhbw %%mm7, %%mm3 \n\t"
  165. "movq %%mm0, (%1, %%eax)\n\t"
  166. "movq %%mm1, 8(%1, %%eax)\n\t"
  167. "movq %%mm2, 16(%1, %%eax)\n\t"
  168. "movq %%mm3, 24(%1, %%eax)\n\t"
  169. "addl %3, %0 \n\t"
  170. "addl $32, %%eax \n\t"
  171. "js 1b \n\t"
  172. : "+r" (pixels)
  173. : "r" (block+64), "r" (line_size), "r" (line_size*2)
  174. : "%eax"
  175. );
  176. }
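/* Scalar sketch of get_pixels_mmx (added for illustration only): widen an
 * 8x8 block of bytes into 16-bit DCT coefficients.
 *
 *   for (i = 0; i < 8; i++)
 *       for (j = 0; j < 8; j++)
 *           block[i*8 + j] = pixels[i*line_size + j];
 */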
  177. static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
  178. {
  179. asm volatile(
  180. "pxor %%mm7, %%mm7 \n\t"
  181. "movl $-128, %%eax \n\t"
  182. ".balign 16 \n\t"
  183. "1: \n\t"
  184. "movq (%0), %%mm0 \n\t"
  185. "movq (%1), %%mm2 \n\t"
  186. "movq %%mm0, %%mm1 \n\t"
  187. "movq %%mm2, %%mm3 \n\t"
  188. "punpcklbw %%mm7, %%mm0 \n\t"
  189. "punpckhbw %%mm7, %%mm1 \n\t"
  190. "punpcklbw %%mm7, %%mm2 \n\t"
  191. "punpckhbw %%mm7, %%mm3 \n\t"
  192. "psubw %%mm2, %%mm0 \n\t"
  193. "psubw %%mm3, %%mm1 \n\t"
  194. "movq %%mm0, (%2, %%eax)\n\t"
  195. "movq %%mm1, 8(%2, %%eax)\n\t"
  196. "addl %3, %0 \n\t"
  197. "addl %3, %1 \n\t"
  198. "addl $16, %%eax \n\t"
  199. "jnz 1b \n\t"
  200. : "+r" (s1), "+r" (s2)
  201. : "r" (block+64), "r" (stride)
  202. : "%eax"
  203. );
  204. }
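/* Scalar sketch of diff_pixels_mmx (added for illustration only): store the
 * byte-wise difference of two 8x8 blocks as 16-bit values.
 *
 *   for (i = 0; i < 8; i++)
 *       for (j = 0; j < 8; j++)
 *           block[i*8 + j] = s1[i*stride + j] - s2[i*stride + j];
 */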
  205. #endif //CONFIG_ENCODERS
  206. void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
  207. {
  208. const DCTELEM *p;
  209. uint8_t *pix;
  210. /* read the pixels */
  211. p = block;
  212. pix = pixels;
  213. /* unrolled loop */
  214. __asm __volatile(
  215. "movq %3, %%mm0\n\t"
  216. "movq 8%3, %%mm1\n\t"
  217. "movq 16%3, %%mm2\n\t"
  218. "movq 24%3, %%mm3\n\t"
  219. "movq 32%3, %%mm4\n\t"
  220. "movq 40%3, %%mm5\n\t"
  221. "movq 48%3, %%mm6\n\t"
  222. "movq 56%3, %%mm7\n\t"
  223. "packuswb %%mm1, %%mm0\n\t"
  224. "packuswb %%mm3, %%mm2\n\t"
  225. "packuswb %%mm5, %%mm4\n\t"
  226. "packuswb %%mm7, %%mm6\n\t"
  227. "movq %%mm0, (%0)\n\t"
  228. "movq %%mm2, (%0, %1)\n\t"
  229. "movq %%mm4, (%0, %1, 2)\n\t"
  230. "movq %%mm6, (%0, %2)\n\t"
  231. ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p)
  232. :"memory");
  233. pix += line_size*4;
  234. p += 32;
  235. // if this were an exact copy of the code above, the compiler
  236. // would generate some very strange code, so the block pointer
  237. // is passed through an "r" constraint here instead of "m"
  238. __asm __volatile(
  239. "movq (%3), %%mm0\n\t"
  240. "movq 8(%3), %%mm1\n\t"
  241. "movq 16(%3), %%mm2\n\t"
  242. "movq 24(%3), %%mm3\n\t"
  243. "movq 32(%3), %%mm4\n\t"
  244. "movq 40(%3), %%mm5\n\t"
  245. "movq 48(%3), %%mm6\n\t"
  246. "movq 56(%3), %%mm7\n\t"
  247. "packuswb %%mm1, %%mm0\n\t"
  248. "packuswb %%mm3, %%mm2\n\t"
  249. "packuswb %%mm5, %%mm4\n\t"
  250. "packuswb %%mm7, %%mm6\n\t"
  251. "movq %%mm0, (%0)\n\t"
  252. "movq %%mm2, (%0, %1)\n\t"
  253. "movq %%mm4, (%0, %1, 2)\n\t"
  254. "movq %%mm6, (%0, %2)\n\t"
  255. ::"r" (pix), "r" (line_size), "r" (line_size*3), "r"(p)
  256. :"memory");
  257. }
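/* Scalar sketch of put_pixels_clamped_mmx (added for illustration only),
 * assuming a hypothetical clamp_u8() that saturates to 0..255 as packuswb does:
 *
 *   for (i = 0; i < 8; i++)
 *       for (j = 0; j < 8; j++)
 *           pixels[i*line_size + j] = clamp_u8(block[i*8 + j]);
 */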
  258. static unsigned char __align8 vector128[8] =
  259. { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
  260. void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
  261. {
  262. int i;
  263. movq_m2r(*vector128, mm1);
  264. for (i = 0; i < 8; i++) {
  265. movq_m2r(*(block), mm0);
  266. packsswb_m2r(*(block + 4), mm0);
  267. block += 8;
  268. paddb_r2r(mm1, mm0);
  269. movq_r2m(mm0, *pixels);
  270. pixels += line_size;
  271. }
  272. }
  273. void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
  274. {
  275. const DCTELEM *p;
  276. uint8_t *pix;
  277. int i;
  278. /* read the pixels */
  279. p = block;
  280. pix = pixels;
  281. MOVQ_ZERO(mm7);
  282. i = 4;
  283. do {
  284. __asm __volatile(
  285. "movq (%2), %%mm0\n\t"
  286. "movq 8(%2), %%mm1\n\t"
  287. "movq 16(%2), %%mm2\n\t"
  288. "movq 24(%2), %%mm3\n\t"
  289. "movq %0, %%mm4\n\t"
  290. "movq %1, %%mm6\n\t"
  291. "movq %%mm4, %%mm5\n\t"
  292. "punpcklbw %%mm7, %%mm4\n\t"
  293. "punpckhbw %%mm7, %%mm5\n\t"
  294. "paddsw %%mm4, %%mm0\n\t"
  295. "paddsw %%mm5, %%mm1\n\t"
  296. "movq %%mm6, %%mm5\n\t"
  297. "punpcklbw %%mm7, %%mm6\n\t"
  298. "punpckhbw %%mm7, %%mm5\n\t"
  299. "paddsw %%mm6, %%mm2\n\t"
  300. "paddsw %%mm5, %%mm3\n\t"
  301. "packuswb %%mm1, %%mm0\n\t"
  302. "packuswb %%mm3, %%mm2\n\t"
  303. "movq %%mm0, %0\n\t"
  304. "movq %%mm2, %1\n\t"
  305. :"+m"(*pix), "+m"(*(pix+line_size))
  306. :"r"(p)
  307. :"memory");
  308. pix += line_size*2;
  309. p += 16;
  310. } while (--i);
  311. }
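/* Scalar sketch of add_pixels_clamped_mmx (added for illustration only),
 * again assuming a hypothetical clamp_u8() helper:
 *
 *   for (i = 0; i < 8; i++)
 *       for (j = 0; j < 8; j++)
 *           pixels[i*line_size + j] = clamp_u8(pixels[i*line_size + j] + block[i*8 + j]);
 */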
  312. static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  313. {
  314. __asm __volatile(
  315. "lea (%3, %3), %%eax \n\t"
  316. ".balign 8 \n\t"
  317. "1: \n\t"
  318. "movq (%1), %%mm0 \n\t"
  319. "movq (%1, %3), %%mm1 \n\t"
  320. "movq %%mm0, (%2) \n\t"
  321. "movq %%mm1, (%2, %3) \n\t"
  322. "addl %%eax, %1 \n\t"
  323. "addl %%eax, %2 \n\t"
  324. "movq (%1), %%mm0 \n\t"
  325. "movq (%1, %3), %%mm1 \n\t"
  326. "movq %%mm0, (%2) \n\t"
  327. "movq %%mm1, (%2, %3) \n\t"
  328. "addl %%eax, %1 \n\t"
  329. "addl %%eax, %2 \n\t"
  330. "subl $4, %0 \n\t"
  331. "jnz 1b \n\t"
  332. : "+g"(h), "+r" (pixels), "+r" (block)
  333. : "r"(line_size)
  334. : "%eax", "memory"
  335. );
  336. }
  337. static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  338. {
  339. __asm __volatile(
  340. "lea (%3, %3), %%eax \n\t"
  341. ".balign 8 \n\t"
  342. "1: \n\t"
  343. "movq (%1), %%mm0 \n\t"
  344. "movq 8(%1), %%mm4 \n\t"
  345. "movq (%1, %3), %%mm1 \n\t"
  346. "movq 8(%1, %3), %%mm5 \n\t"
  347. "movq %%mm0, (%2) \n\t"
  348. "movq %%mm4, 8(%2) \n\t"
  349. "movq %%mm1, (%2, %3) \n\t"
  350. "movq %%mm5, 8(%2, %3) \n\t"
  351. "addl %%eax, %1 \n\t"
  352. "addl %%eax, %2 \n\t"
  353. "movq (%1), %%mm0 \n\t"
  354. "movq 8(%1), %%mm4 \n\t"
  355. "movq (%1, %3), %%mm1 \n\t"
  356. "movq 8(%1, %3), %%mm5 \n\t"
  357. "movq %%mm0, (%2) \n\t"
  358. "movq %%mm4, 8(%2) \n\t"
  359. "movq %%mm1, (%2, %3) \n\t"
  360. "movq %%mm5, 8(%2, %3) \n\t"
  361. "addl %%eax, %1 \n\t"
  362. "addl %%eax, %2 \n\t"
  363. "subl $4, %0 \n\t"
  364. "jnz 1b \n\t"
  365. : "+g"(h), "+r" (pixels), "+r" (block)
  366. : "r"(line_size)
  367. : "%eax", "memory"
  368. );
  369. }
  370. static void clear_blocks_mmx(DCTELEM *blocks)
  371. {
  372. __asm __volatile(
  373. "pxor %%mm7, %%mm7 \n\t"
  374. "movl $-128*6, %%eax \n\t"
  375. "1: \n\t"
  376. "movq %%mm7, (%0, %%eax) \n\t"
  377. "movq %%mm7, 8(%0, %%eax) \n\t"
  378. "movq %%mm7, 16(%0, %%eax) \n\t"
  379. "movq %%mm7, 24(%0, %%eax) \n\t"
  380. "addl $32, %%eax \n\t"
  381. " js 1b \n\t"
  382. : : "r" (((int)blocks)+128*6)
  383. : "%eax"
  384. );
  385. }
  386. #ifdef CONFIG_ENCODERS
  387. static int pix_sum16_mmx(uint8_t * pix, int line_size){
  388. const int h=16;
  389. int sum;
  390. int index= -line_size*h;
  391. __asm __volatile(
  392. "pxor %%mm7, %%mm7 \n\t"
  393. "pxor %%mm6, %%mm6 \n\t"
  394. "1: \n\t"
  395. "movq (%2, %1), %%mm0 \n\t"
  396. "movq (%2, %1), %%mm1 \n\t"
  397. "movq 8(%2, %1), %%mm2 \n\t"
  398. "movq 8(%2, %1), %%mm3 \n\t"
  399. "punpcklbw %%mm7, %%mm0 \n\t"
  400. "punpckhbw %%mm7, %%mm1 \n\t"
  401. "punpcklbw %%mm7, %%mm2 \n\t"
  402. "punpckhbw %%mm7, %%mm3 \n\t"
  403. "paddw %%mm0, %%mm1 \n\t"
  404. "paddw %%mm2, %%mm3 \n\t"
  405. "paddw %%mm1, %%mm3 \n\t"
  406. "paddw %%mm3, %%mm6 \n\t"
  407. "addl %3, %1 \n\t"
  408. " js 1b \n\t"
  409. "movq %%mm6, %%mm5 \n\t"
  410. "psrlq $32, %%mm6 \n\t"
  411. "paddw %%mm5, %%mm6 \n\t"
  412. "movq %%mm6, %%mm5 \n\t"
  413. "psrlq $16, %%mm6 \n\t"
  414. "paddw %%mm5, %%mm6 \n\t"
  415. "movd %%mm6, %0 \n\t"
  416. "andl $0xFFFF, %0 \n\t"
  417. : "=&r" (sum), "+r" (index)
  418. : "r" (pix - index), "r" (line_size)
  419. );
  420. return sum;
  421. }
  422. #endif //CONFIG_ENCODERS
  423. static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
  424. int i=0;
  425. asm volatile(
  426. "1: \n\t"
  427. "movq (%1, %0), %%mm0 \n\t"
  428. "movq (%2, %0), %%mm1 \n\t"
  429. "paddb %%mm0, %%mm1 \n\t"
  430. "movq %%mm1, (%2, %0) \n\t"
  431. "movq 8(%1, %0), %%mm0 \n\t"
  432. "movq 8(%2, %0), %%mm1 \n\t"
  433. "paddb %%mm0, %%mm1 \n\t"
  434. "movq %%mm1, 8(%2, %0) \n\t"
  435. "addl $16, %0 \n\t"
  436. "cmpl %3, %0 \n\t"
  437. " jb 1b \n\t"
  438. : "+r" (i)
  439. : "r"(src), "r"(dst), "r"(w-15)
  440. );
  441. for(; i<w; i++)
  442. dst[i+0] += src[i+0];
  443. }
  444. #define H263_LOOP_FILTER \
  445. "pxor %%mm7, %%mm7 \n\t"\
  446. "movq %0, %%mm0 \n\t"\
  447. "movq %0, %%mm1 \n\t"\
  448. "movq %3, %%mm2 \n\t"\
  449. "movq %3, %%mm3 \n\t"\
  450. "punpcklbw %%mm7, %%mm0 \n\t"\
  451. "punpckhbw %%mm7, %%mm1 \n\t"\
  452. "punpcklbw %%mm7, %%mm2 \n\t"\
  453. "punpckhbw %%mm7, %%mm3 \n\t"\
  454. "psubw %%mm2, %%mm0 \n\t"\
  455. "psubw %%mm3, %%mm1 \n\t"\
  456. "movq %1, %%mm2 \n\t"\
  457. "movq %1, %%mm3 \n\t"\
  458. "movq %2, %%mm4 \n\t"\
  459. "movq %2, %%mm5 \n\t"\
  460. "punpcklbw %%mm7, %%mm2 \n\t"\
  461. "punpckhbw %%mm7, %%mm3 \n\t"\
  462. "punpcklbw %%mm7, %%mm4 \n\t"\
  463. "punpckhbw %%mm7, %%mm5 \n\t"\
  464. "psubw %%mm2, %%mm4 \n\t"\
  465. "psubw %%mm3, %%mm5 \n\t"\
  466. "psllw $2, %%mm4 \n\t"\
  467. "psllw $2, %%mm5 \n\t"\
  468. "paddw %%mm0, %%mm4 \n\t"\
  469. "paddw %%mm1, %%mm5 \n\t"\
  470. "pxor %%mm6, %%mm6 \n\t"\
  471. "pcmpgtw %%mm4, %%mm6 \n\t"\
  472. "pcmpgtw %%mm5, %%mm7 \n\t"\
  473. "pxor %%mm6, %%mm4 \n\t"\
  474. "pxor %%mm7, %%mm5 \n\t"\
  475. "psubw %%mm6, %%mm4 \n\t"\
  476. "psubw %%mm7, %%mm5 \n\t"\
  477. "psrlw $3, %%mm4 \n\t"\
  478. "psrlw $3, %%mm5 \n\t"\
  479. "packuswb %%mm5, %%mm4 \n\t"\
  480. "packsswb %%mm7, %%mm6 \n\t"\
  481. "pxor %%mm7, %%mm7 \n\t"\
  482. "movd %4, %%mm2 \n\t"\
  483. "punpcklbw %%mm2, %%mm2 \n\t"\
  484. "punpcklbw %%mm2, %%mm2 \n\t"\
  485. "punpcklbw %%mm2, %%mm2 \n\t"\
  486. "psubusb %%mm4, %%mm2 \n\t"\
  487. "movq %%mm2, %%mm3 \n\t"\
  488. "psubusb %%mm4, %%mm3 \n\t"\
  489. "psubb %%mm3, %%mm2 \n\t"\
  490. "movq %1, %%mm3 \n\t"\
  491. "movq %2, %%mm4 \n\t"\
  492. "pxor %%mm6, %%mm3 \n\t"\
  493. "pxor %%mm6, %%mm4 \n\t"\
  494. "paddusb %%mm2, %%mm3 \n\t"\
  495. "psubusb %%mm2, %%mm4 \n\t"\
  496. "pxor %%mm6, %%mm3 \n\t"\
  497. "pxor %%mm6, %%mm4 \n\t"\
  498. "paddusb %%mm2, %%mm2 \n\t"\
  499. "packsswb %%mm1, %%mm0 \n\t"\
  500. "pcmpgtb %%mm0, %%mm7 \n\t"\
  501. "pxor %%mm7, %%mm0 \n\t"\
  502. "psubb %%mm7, %%mm0 \n\t"\
  503. "movq %%mm0, %%mm1 \n\t"\
  504. "psubusb %%mm2, %%mm0 \n\t"\
  505. "psubb %%mm0, %%mm1 \n\t"\
  506. "pand %5, %%mm1 \n\t"\
  507. "psrlw $2, %%mm1 \n\t"\
  508. "pxor %%mm7, %%mm1 \n\t"\
  509. "psubb %%mm7, %%mm1 \n\t"\
  510. "movq %0, %%mm5 \n\t"\
  511. "movq %3, %%mm6 \n\t"\
  512. "psubb %%mm1, %%mm5 \n\t"\
  513. "paddb %%mm1, %%mm6 \n\t"
  514. static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
  515. const int strength= ff_h263_loop_filter_strength[qscale];
  516. asm volatile(
  517. H263_LOOP_FILTER
  518. "movq %%mm3, %1 \n\t"
  519. "movq %%mm4, %2 \n\t"
  520. "movq %%mm5, %0 \n\t"
  521. "movq %%mm6, %3 \n\t"
  522. : "+m" (*(uint64_t*)(src - 2*stride)),
  523. "+m" (*(uint64_t*)(src - 1*stride)),
  524. "+m" (*(uint64_t*)(src + 0*stride)),
  525. "+m" (*(uint64_t*)(src + 1*stride))
  526. : "g" (2*strength), "m"(ff_pb_FC)
  527. );
  528. }
  529. static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
  530. asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
  531. "movd %4, %%mm0 \n\t"
  532. "movd %5, %%mm1 \n\t"
  533. "movd %6, %%mm2 \n\t"
  534. "movd %7, %%mm3 \n\t"
  535. "punpcklbw %%mm1, %%mm0 \n\t"
  536. "punpcklbw %%mm3, %%mm2 \n\t"
  537. "movq %%mm0, %%mm1 \n\t"
  538. "punpcklwd %%mm2, %%mm0 \n\t"
  539. "punpckhwd %%mm2, %%mm1 \n\t"
  540. "movd %%mm0, %0 \n\t"
  541. "punpckhdq %%mm0, %%mm0 \n\t"
  542. "movd %%mm0, %1 \n\t"
  543. "movd %%mm1, %2 \n\t"
  544. "punpckhdq %%mm1, %%mm1 \n\t"
  545. "movd %%mm1, %3 \n\t"
  546. : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
  547. "=m" (*(uint32_t*)(dst + 1*dst_stride)),
  548. "=m" (*(uint32_t*)(dst + 2*dst_stride)),
  549. "=m" (*(uint32_t*)(dst + 3*dst_stride))
  550. : "m" (*(uint32_t*)(src + 0*src_stride)),
  551. "m" (*(uint32_t*)(src + 1*src_stride)),
  552. "m" (*(uint32_t*)(src + 2*src_stride)),
  553. "m" (*(uint32_t*)(src + 3*src_stride))
  554. );
  555. }
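/* Scalar sketch of transpose4x4 (added for illustration only):
 *
 *   for (y = 0; y < 4; y++)
 *       for (x = 0; x < 4; x++)
 *           dst[x*dst_stride + y] = src[y*src_stride + x];
 */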
  556. static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
  557. const int strength= ff_h263_loop_filter_strength[qscale];
  558. uint64_t temp[4] __attribute__ ((aligned(8)));
  559. uint8_t *btemp= (uint8_t*)temp;
  560. src -= 2;
  561. transpose4x4(btemp , src , 8, stride);
  562. transpose4x4(btemp+4, src + 4*stride, 8, stride);
  563. asm volatile(
  564. H263_LOOP_FILTER // 5 3 4 6
  565. : "+m" (temp[0]),
  566. "+m" (temp[1]),
  567. "+m" (temp[2]),
  568. "+m" (temp[3])
  569. : "g" (2*strength), "m"(ff_pb_FC)
  570. );
  571. asm volatile(
  572. "movq %%mm5, %%mm1 \n\t"
  573. "movq %%mm4, %%mm0 \n\t"
  574. "punpcklbw %%mm3, %%mm5 \n\t"
  575. "punpcklbw %%mm6, %%mm4 \n\t"
  576. "punpckhbw %%mm3, %%mm1 \n\t"
  577. "punpckhbw %%mm6, %%mm0 \n\t"
  578. "movq %%mm5, %%mm3 \n\t"
  579. "movq %%mm1, %%mm6 \n\t"
  580. "punpcklwd %%mm4, %%mm5 \n\t"
  581. "punpcklwd %%mm0, %%mm1 \n\t"
  582. "punpckhwd %%mm4, %%mm3 \n\t"
  583. "punpckhwd %%mm0, %%mm6 \n\t"
  584. "movd %%mm5, %0 \n\t"
  585. "punpckhdq %%mm5, %%mm5 \n\t"
  586. "movd %%mm5, %1 \n\t"
  587. "movd %%mm3, %2 \n\t"
  588. "punpckhdq %%mm3, %%mm3 \n\t"
  589. "movd %%mm3, %3 \n\t"
  590. "movd %%mm1, %4 \n\t"
  591. "punpckhdq %%mm1, %%mm1 \n\t"
  592. "movd %%mm1, %5 \n\t"
  593. "movd %%mm6, %6 \n\t"
  594. "punpckhdq %%mm6, %%mm6 \n\t"
  595. "movd %%mm6, %7 \n\t"
  596. : "=m" (*(uint32_t*)(src + 0*stride)),
  597. "=m" (*(uint32_t*)(src + 1*stride)),
  598. "=m" (*(uint32_t*)(src + 2*stride)),
  599. "=m" (*(uint32_t*)(src + 3*stride)),
  600. "=m" (*(uint32_t*)(src + 4*stride)),
  601. "=m" (*(uint32_t*)(src + 5*stride)),
  602. "=m" (*(uint32_t*)(src + 6*stride)),
  603. "=m" (*(uint32_t*)(src + 7*stride))
  604. );
  605. }
  606. #ifdef CONFIG_ENCODERS
  607. static int pix_norm1_mmx(uint8_t *pix, int line_size) {
  608. int tmp;
  609. asm volatile (
  610. "movl $16,%%ecx\n"
  611. "pxor %%mm0,%%mm0\n"
  612. "pxor %%mm7,%%mm7\n"
  613. "1:\n"
  614. "movq (%0),%%mm2\n" /* mm2 = pix[0-7] */
  615. "movq 8(%0),%%mm3\n" /* mm3 = pix[8-15] */
  616. "movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[0-7] */
  617. "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
  618. "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
  619. "movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[8-15] */
  620. "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
  621. "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
  622. "pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix4^2+pix5^2,pix6^2+pix7^2) */
  623. "pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix0^2+pix1^2,pix2^2+pix3^2) */
  624. "pmaddwd %%mm3,%%mm3\n"
  625. "pmaddwd %%mm4,%%mm4\n"
  626. "paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
  627. pix2^2+pix3^2+pix6^2+pix7^2) */
  628. "paddd %%mm3,%%mm4\n"
  629. "paddd %%mm2,%%mm7\n"
  630. "addl %2, %0\n"
  631. "paddd %%mm4,%%mm7\n"
  632. "dec %%ecx\n"
  633. "jnz 1b\n"
  634. "movq %%mm7,%%mm1\n"
  635. "psrlq $32, %%mm7\n" /* shift hi dword to lo */
  636. "paddd %%mm7,%%mm1\n"
  637. "movd %%mm1,%1\n"
  638. : "+r" (pix), "=r"(tmp) : "r" (line_size) : "%ecx" );
  639. return tmp;
  640. }
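/* Scalar sketch of pix_norm1_mmx (added for illustration only): the
 * pmaddwd/paddd chain accumulates the sum of squares of a 16x16 block.
 *
 *   tmp = 0;
 *   for (i = 0; i < 16; i++, pix += line_size)
 *       for (j = 0; j < 16; j++)
 *           tmp += pix[j] * pix[j];
 */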
  641. static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  642. int tmp;
  643. asm volatile (
  644. "movl %4,%%ecx\n"
  645. "pxor %%mm0,%%mm0\n" /* mm0 = 0 */
  646. "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
  647. "1:\n"
  648. "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
  649. "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
  650. "movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
  651. "movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
  652. /* todo: mm1-mm2, mm3-mm4 */
  653. /* algo: subtract mm1 from mm2 with saturation and vice versa */
  654. /* OR the results to get absolute difference */
  655. "movq %%mm1,%%mm5\n"
  656. "movq %%mm3,%%mm6\n"
  657. "psubusb %%mm2,%%mm1\n"
  658. "psubusb %%mm4,%%mm3\n"
  659. "psubusb %%mm5,%%mm2\n"
  660. "psubusb %%mm6,%%mm4\n"
  661. "por %%mm1,%%mm2\n"
  662. "por %%mm3,%%mm4\n"
  663. /* now convert to 16-bit vectors so we can square them */
  664. "movq %%mm2,%%mm1\n"
  665. "movq %%mm4,%%mm3\n"
  666. "punpckhbw %%mm0,%%mm2\n"
  667. "punpckhbw %%mm0,%%mm4\n"
  668. "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
  669. "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
  670. "pmaddwd %%mm2,%%mm2\n"
  671. "pmaddwd %%mm4,%%mm4\n"
  672. "pmaddwd %%mm1,%%mm1\n"
  673. "pmaddwd %%mm3,%%mm3\n"
  674. "addl %3,%0\n"
  675. "addl %3,%1\n"
  676. "paddd %%mm2,%%mm1\n"
  677. "paddd %%mm4,%%mm3\n"
  678. "paddd %%mm1,%%mm7\n"
  679. "paddd %%mm3,%%mm7\n"
  680. "decl %%ecx\n"
  681. "jnz 1b\n"
  682. "movq %%mm7,%%mm1\n"
  683. "psrlq $32, %%mm7\n" /* shift hi dword to lo */
  684. "paddd %%mm7,%%mm1\n"
  685. "movd %%mm1,%2\n"
  686. : "+r" (pix1), "+r" (pix2), "=r"(tmp)
  687. : "r" (line_size) , "m" (h)
  688. : "%ecx");
  689. return tmp;
  690. }
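/* Scalar sketch of sse16_mmx (added for illustration only). Because
 * |a - b| * |a - b| == (a - b) * (a - b), the saturated-subtract/por trick
 * above yields the same sum of squared differences as:
 *
 *   tmp = 0;
 *   for (y = 0; y < h; y++, pix1 += line_size, pix2 += line_size)
 *       for (x = 0; x < 16; x++) {
 *           int d = pix1[x] - pix2[x];
 *           tmp += d * d;
 *       }
 */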
  691. static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
  692. int tmp;
  693. assert( (((int)pix) & 7) == 0);
  694. assert((line_size &7) ==0);
  695. #define SUM(in0, in1, out0, out1) \
  696. "movq (%0), %%mm2\n"\
  697. "movq 8(%0), %%mm3\n"\
  698. "addl %2,%0\n"\
  699. "movq %%mm2, " #out0 "\n"\
  700. "movq %%mm3, " #out1 "\n"\
  701. "psubusb " #in0 ", %%mm2\n"\
  702. "psubusb " #in1 ", %%mm3\n"\
  703. "psubusb " #out0 ", " #in0 "\n"\
  704. "psubusb " #out1 ", " #in1 "\n"\
  705. "por %%mm2, " #in0 "\n"\
  706. "por %%mm3, " #in1 "\n"\
  707. "movq " #in0 ", %%mm2\n"\
  708. "movq " #in1 ", %%mm3\n"\
  709. "punpcklbw %%mm7, " #in0 "\n"\
  710. "punpcklbw %%mm7, " #in1 "\n"\
  711. "punpckhbw %%mm7, %%mm2\n"\
  712. "punpckhbw %%mm7, %%mm3\n"\
  713. "paddw " #in1 ", " #in0 "\n"\
  714. "paddw %%mm3, %%mm2\n"\
  715. "paddw %%mm2, " #in0 "\n"\
  716. "paddw " #in0 ", %%mm6\n"
  717. asm volatile (
  718. "movl %3,%%ecx\n"
  719. "pxor %%mm6,%%mm6\n"
  720. "pxor %%mm7,%%mm7\n"
  721. "movq (%0),%%mm0\n"
  722. "movq 8(%0),%%mm1\n"
  723. "addl %2,%0\n"
  724. "subl $2, %%ecx\n"
  725. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  726. "1:\n"
  727. SUM(%%mm4, %%mm5, %%mm0, %%mm1)
  728. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  729. "subl $2, %%ecx\n"
  730. "jnz 1b\n"
  731. "movq %%mm6,%%mm0\n"
  732. "psrlq $32, %%mm6\n"
  733. "paddw %%mm6,%%mm0\n"
  734. "movq %%mm0,%%mm6\n"
  735. "psrlq $16, %%mm0\n"
  736. "paddw %%mm6,%%mm0\n"
  737. "movd %%mm0,%1\n"
  738. : "+r" (pix), "=r"(tmp)
  739. : "r" (line_size) , "m" (h)
  740. : "%ecx");
  741. return tmp & 0xFFFF;
  742. }
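/* Scalar sketch of vsad_intra16_mmx (added for illustration only): sum of
 * absolute differences between vertically adjacent rows of a 16-wide block,
 * using the standard abs():
 *
 *   tmp = 0;
 *   for (y = 1; y < h; y++)
 *       for (x = 0; x < 16; x++)
 *           tmp += abs(pix[y*line_size + x] - pix[(y-1)*line_size + x]);
 *   return tmp & 0xFFFF;
 */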
  743. #undef SUM
  744. static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
  745. int tmp;
  746. assert( (((int)pix) & 7) == 0);
  747. assert((line_size &7) ==0);
  748. #define SUM(in0, in1, out0, out1) \
  749. "movq (%0), " #out0 "\n"\
  750. "movq 8(%0), " #out1 "\n"\
  751. "addl %2,%0\n"\
  752. "psadbw " #out0 ", " #in0 "\n"\
  753. "psadbw " #out1 ", " #in1 "\n"\
  754. "paddw " #in1 ", " #in0 "\n"\
  755. "paddw " #in0 ", %%mm6\n"
  756. asm volatile (
  757. "movl %3,%%ecx\n"
  758. "pxor %%mm6,%%mm6\n"
  759. "pxor %%mm7,%%mm7\n"
  760. "movq (%0),%%mm0\n"
  761. "movq 8(%0),%%mm1\n"
  762. "addl %2,%0\n"
  763. "subl $2, %%ecx\n"
  764. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  765. "1:\n"
  766. SUM(%%mm4, %%mm5, %%mm0, %%mm1)
  767. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  768. "subl $2, %%ecx\n"
  769. "jnz 1b\n"
  770. "movd %%mm6,%1\n"
  771. : "+r" (pix), "=r"(tmp)
  772. : "r" (line_size) , "m" (h)
  773. : "%ecx");
  774. return tmp;
  775. }
  776. #undef SUM
  777. static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  778. int tmp;
  779. assert( (((int)pix1) & 7) == 0);
  780. assert( (((int)pix2) & 7) == 0);
  781. assert((line_size &7) ==0);
  782. #define SUM(in0, in1, out0, out1) \
  783. "movq (%0),%%mm2\n"\
  784. "movq (%1)," #out0 "\n"\
  785. "movq 8(%0),%%mm3\n"\
  786. "movq 8(%1)," #out1 "\n"\
  787. "addl %3,%0\n"\
  788. "addl %3,%1\n"\
  789. "psubb " #out0 ", %%mm2\n"\
  790. "psubb " #out1 ", %%mm3\n"\
  791. "pxor %%mm7, %%mm2\n"\
  792. "pxor %%mm7, %%mm3\n"\
  793. "movq %%mm2, " #out0 "\n"\
  794. "movq %%mm3, " #out1 "\n"\
  795. "psubusb " #in0 ", %%mm2\n"\
  796. "psubusb " #in1 ", %%mm3\n"\
  797. "psubusb " #out0 ", " #in0 "\n"\
  798. "psubusb " #out1 ", " #in1 "\n"\
  799. "por %%mm2, " #in0 "\n"\
  800. "por %%mm3, " #in1 "\n"\
  801. "movq " #in0 ", %%mm2\n"\
  802. "movq " #in1 ", %%mm3\n"\
  803. "punpcklbw %%mm7, " #in0 "\n"\
  804. "punpcklbw %%mm7, " #in1 "\n"\
  805. "punpckhbw %%mm7, %%mm2\n"\
  806. "punpckhbw %%mm7, %%mm3\n"\
  807. "paddw " #in1 ", " #in0 "\n"\
  808. "paddw %%mm3, %%mm2\n"\
  809. "paddw %%mm2, " #in0 "\n"\
  810. "paddw " #in0 ", %%mm6\n"
  811. asm volatile (
  812. "movl %4,%%ecx\n"
  813. "pxor %%mm6,%%mm6\n"
  814. "pcmpeqw %%mm7,%%mm7\n"
  815. "psllw $15, %%mm7\n"
  816. "packsswb %%mm7, %%mm7\n"
  817. "movq (%0),%%mm0\n"
  818. "movq (%1),%%mm2\n"
  819. "movq 8(%0),%%mm1\n"
  820. "movq 8(%1),%%mm3\n"
  821. "addl %3,%0\n"
  822. "addl %3,%1\n"
  823. "subl $2, %%ecx\n"
  824. "psubb %%mm2, %%mm0\n"
  825. "psubb %%mm3, %%mm1\n"
  826. "pxor %%mm7, %%mm0\n"
  827. "pxor %%mm7, %%mm1\n"
  828. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  829. "1:\n"
  830. SUM(%%mm4, %%mm5, %%mm0, %%mm1)
  831. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  832. "subl $2, %%ecx\n"
  833. "jnz 1b\n"
  834. "movq %%mm6,%%mm0\n"
  835. "psrlq $32, %%mm6\n"
  836. "paddw %%mm6,%%mm0\n"
  837. "movq %%mm0,%%mm6\n"
  838. "psrlq $16, %%mm0\n"
  839. "paddw %%mm6,%%mm0\n"
  840. "movd %%mm0,%2\n"
  841. : "+r" (pix1), "+r" (pix2), "=r"(tmp)
  842. : "r" (line_size) , "m" (h)
  843. : "%ecx");
  844. return tmp & 0x7FFF;
  845. }
  846. #undef SUM
  847. static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
  848. int tmp;
  849. assert( (((int)pix1) & 7) == 0);
  850. assert( (((int)pix2) & 7) == 0);
  851. assert((line_size &7) ==0);
  852. #define SUM(in0, in1, out0, out1) \
  853. "movq (%0)," #out0 "\n"\
  854. "movq (%1),%%mm2\n"\
  855. "movq 8(%0)," #out1 "\n"\
  856. "movq 8(%1),%%mm3\n"\
  857. "addl %3,%0\n"\
  858. "addl %3,%1\n"\
  859. "psubb %%mm2, " #out0 "\n"\
  860. "psubb %%mm3, " #out1 "\n"\
  861. "pxor %%mm7, " #out0 "\n"\
  862. "pxor %%mm7, " #out1 "\n"\
  863. "psadbw " #out0 ", " #in0 "\n"\
  864. "psadbw " #out1 ", " #in1 "\n"\
  865. "paddw " #in1 ", " #in0 "\n"\
  866. "paddw " #in0 ", %%mm6\n"
  867. asm volatile (
  868. "movl %4,%%ecx\n"
  869. "pxor %%mm6,%%mm6\n"
  870. "pcmpeqw %%mm7,%%mm7\n"
  871. "psllw $15, %%mm7\n"
  872. "packsswb %%mm7, %%mm7\n"
  873. "movq (%0),%%mm0\n"
  874. "movq (%1),%%mm2\n"
  875. "movq 8(%0),%%mm1\n"
  876. "movq 8(%1),%%mm3\n"
  877. "addl %3,%0\n"
  878. "addl %3,%1\n"
  879. "subl $2, %%ecx\n"
  880. "psubb %%mm2, %%mm0\n"
  881. "psubb %%mm3, %%mm1\n"
  882. "pxor %%mm7, %%mm0\n"
  883. "pxor %%mm7, %%mm1\n"
  884. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  885. "1:\n"
  886. SUM(%%mm4, %%mm5, %%mm0, %%mm1)
  887. SUM(%%mm0, %%mm1, %%mm4, %%mm5)
  888. "subl $2, %%ecx\n"
  889. "jnz 1b\n"
  890. "movd %%mm6,%2\n"
  891. : "+r" (pix1), "+r" (pix2), "=r"(tmp)
  892. : "r" (line_size) , "m" (h)
  893. : "%ecx");
  894. return tmp;
  895. }
  896. #undef SUM
  897. static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
  898. int i=0;
  899. asm volatile(
  900. "1: \n\t"
  901. "movq (%2, %0), %%mm0 \n\t"
  902. "movq (%1, %0), %%mm1 \n\t"
  903. "psubb %%mm0, %%mm1 \n\t"
  904. "movq %%mm1, (%3, %0) \n\t"
  905. "movq 8(%2, %0), %%mm0 \n\t"
  906. "movq 8(%1, %0), %%mm1 \n\t"
  907. "psubb %%mm0, %%mm1 \n\t"
  908. "movq %%mm1, 8(%3, %0) \n\t"
  909. "addl $16, %0 \n\t"
  910. "cmpl %4, %0 \n\t"
  911. " jb 1b \n\t"
  912. : "+r" (i)
  913. : "r"(src1), "r"(src2), "r"(dst), "r"(w-15)
  914. );
  915. for(; i<w; i++)
  916. dst[i+0] = src1[i+0]-src2[i+0];
  917. }
  918. static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
  919. int i=0;
  920. uint8_t l, lt;
  921. asm volatile(
  922. "1: \n\t"
  923. "movq -1(%1, %0), %%mm0 \n\t" // LT
  924. "movq (%1, %0), %%mm1 \n\t" // T
  925. "movq -1(%2, %0), %%mm2 \n\t" // L
  926. "movq (%2, %0), %%mm3 \n\t" // X
  927. "movq %%mm2, %%mm4 \n\t" // L
  928. "psubb %%mm0, %%mm2 \n\t"
  929. "paddb %%mm1, %%mm2 \n\t" // L + T - LT
  930. "movq %%mm4, %%mm5 \n\t" // L
  931. "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
  932. "pminub %%mm5, %%mm1 \n\t" // min(T, L)
  933. "pminub %%mm2, %%mm4 \n\t"
  934. "pmaxub %%mm1, %%mm4 \n\t"
  935. "psubb %%mm4, %%mm3 \n\t" // dst - pred
  936. "movq %%mm3, (%3, %0) \n\t"
  937. "addl $8, %0 \n\t"
  938. "cmpl %4, %0 \n\t"
  939. " jb 1b \n\t"
  940. : "+r" (i)
  941. : "r"(src1), "r"(src2), "r"(dst), "r"(w)
  942. );
  943. l= *left;
  944. lt= *left_top;
  945. dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
  946. *left_top= src1[w-1];
  947. *left = src2[w-1];
  948. }
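/* Scalar sketch of the median prediction above (added for illustration only),
 * using the same mid_pred() that the pmaxub/pminub sequence emulates per byte:
 *
 *   for (i = 1; i < w; i++) {
 *       const int L = src2[i-1], T = src1[i], LT = src1[i-1];
 *       dst[i] = src2[i] - mid_pred(L, T, L + T - LT);
 *   }
 *
 * i == 0 is handled separately with *left / *left_top, as in the C code at the
 * end of the function.
 */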
  949. #define LBUTTERFLY2(a1,b1,a2,b2)\
  950. "paddw " #b1 ", " #a1 " \n\t"\
  951. "paddw " #b2 ", " #a2 " \n\t"\
  952. "paddw " #b1 ", " #b1 " \n\t"\
  953. "paddw " #b2 ", " #b2 " \n\t"\
  954. "psubw " #a1 ", " #b1 " \n\t"\
  955. "psubw " #a2 ", " #b2 " \n\t"
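// Illustrative note (added, not from the original source): per 16-bit lane,
// each LBUTTERFLY2 pair is the classic butterfly (a, b) -> (a + b, b - a),
// computed as a += b; b += b; b -= a; two such pairs run in parallel.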
  956. #define HADAMARD48\
  957. LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
  958. LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
  959. LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
  960. LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
  961. LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
  962. LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\
  963. #define MMABS(a,z)\
  964. "pxor " #z ", " #z " \n\t"\
  965. "pcmpgtw " #a ", " #z " \n\t"\
  966. "pxor " #z ", " #a " \n\t"\
  967. "psubw " #z ", " #a " \n\t"
  968. #define MMABS_SUM(a,z, sum)\
  969. "pxor " #z ", " #z " \n\t"\
  970. "pcmpgtw " #a ", " #z " \n\t"\
  971. "pxor " #z ", " #a " \n\t"\
  972. "psubw " #z ", " #a " \n\t"\
  973. "paddusw " #a ", " #sum " \n\t"
  974. #define MMABS_MMX2(a,z)\
  975. "pxor " #z ", " #z " \n\t"\
  976. "psubw " #a ", " #z " \n\t"\
  977. "pmaxsw " #z ", " #a " \n\t"
  978. #define MMABS_SUM_MMX2(a,z, sum)\
  979. "pxor " #z ", " #z " \n\t"\
  980. "psubw " #a ", " #z " \n\t"\
  981. "pmaxsw " #z ", " #a " \n\t"\
  982. "paddusw " #a ", " #sum " \n\t"
  983. #define SBUTTERFLY(a,b,t,n)\
  984. "movq " #a ", " #t " \n\t" /* abcd */\
  985. "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
  986. "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
  987. #define TRANSPOSE4(a,b,c,d,t)\
  988. SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
  989. SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
  990. SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
  991. SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
  992. #define LOAD4(o, a, b, c, d)\
  993. "movq "#o"(%1), " #a " \n\t"\
  994. "movq "#o"+16(%1), " #b " \n\t"\
  995. "movq "#o"+32(%1), " #c " \n\t"\
  996. "movq "#o"+48(%1), " #d " \n\t"
  997. #define STORE4(o, a, b, c, d)\
  998. "movq "#a", "#o"(%1) \n\t"\
  999. "movq "#b", "#o"+16(%1) \n\t"\
  1000. "movq "#c", "#o"+32(%1) \n\t"\
  1001. "movq "#d", "#o"+48(%1) \n\t"\
  1002. static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
  1003. uint64_t temp[16] __align8;
  1004. int sum=0;
  1005. assert(h==8);
  1006. diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
  1007. asm volatile(
  1008. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  1009. LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
  1010. HADAMARD48
  1011. "movq %%mm7, 112(%1) \n\t"
  1012. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  1013. STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
  1014. "movq 112(%1), %%mm7 \n\t"
  1015. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  1016. STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
  1017. LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
  1018. LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  1019. HADAMARD48
  1020. "movq %%mm7, 120(%1) \n\t"
  1021. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  1022. STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
  1023. "movq 120(%1), %%mm7 \n\t"
  1024. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  1025. "movq %%mm7, %%mm5 \n\t"//FIXME remove
  1026. "movq %%mm6, %%mm7 \n\t"
  1027. "movq %%mm0, %%mm6 \n\t"
  1028. // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
  1029. LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
  1030. // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  1031. HADAMARD48
  1032. "movq %%mm7, 64(%1) \n\t"
  1033. MMABS(%%mm0, %%mm7)
  1034. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  1035. MMABS_SUM(%%mm2, %%mm7, %%mm0)
  1036. MMABS_SUM(%%mm3, %%mm7, %%mm0)
  1037. MMABS_SUM(%%mm4, %%mm7, %%mm0)
  1038. MMABS_SUM(%%mm5, %%mm7, %%mm0)
  1039. MMABS_SUM(%%mm6, %%mm7, %%mm0)
  1040. "movq 64(%1), %%mm1 \n\t"
  1041. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  1042. "movq %%mm0, 64(%1) \n\t"
  1043. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  1044. LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
  1045. HADAMARD48
  1046. "movq %%mm7, (%1) \n\t"
  1047. MMABS(%%mm0, %%mm7)
  1048. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  1049. MMABS_SUM(%%mm2, %%mm7, %%mm0)
  1050. MMABS_SUM(%%mm3, %%mm7, %%mm0)
  1051. MMABS_SUM(%%mm4, %%mm7, %%mm0)
  1052. MMABS_SUM(%%mm5, %%mm7, %%mm0)
  1053. MMABS_SUM(%%mm6, %%mm7, %%mm0)
  1054. "movq (%1), %%mm1 \n\t"
  1055. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  1056. "movq 64(%1), %%mm1 \n\t"
  1057. MMABS_SUM(%%mm1, %%mm7, %%mm0)
  1058. "movq %%mm0, %%mm1 \n\t"
  1059. "psrlq $32, %%mm0 \n\t"
  1060. "paddusw %%mm1, %%mm0 \n\t"
  1061. "movq %%mm0, %%mm1 \n\t"
  1062. "psrlq $16, %%mm0 \n\t"
  1063. "paddusw %%mm1, %%mm0 \n\t"
  1064. "movd %%mm0, %0 \n\t"
  1065. : "=r" (sum)
  1066. : "r"(temp)
  1067. );
  1068. return sum&0xFFFF;
  1069. }
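/* Illustrative note (added, not from the original source): hadamard8_diff_mmx
 * transforms the 8x8 pixel difference with a horizontal and a vertical
 * Hadamard-style pass (HADAMARD48 plus TRANSPOSE4) and returns the sum of the
 * absolute values of the resulting coefficients, truncated to 16 bits.
 */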
  1070. static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
  1071. uint64_t temp[16] __align8;
  1072. int sum=0;
  1073. assert(h==8);
  1074. diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
  1075. asm volatile(
  1076. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  1077. LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
  1078. HADAMARD48
  1079. "movq %%mm7, 112(%1) \n\t"
  1080. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  1081. STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
  1082. "movq 112(%1), %%mm7 \n\t"
  1083. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  1084. STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
  1085. LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
  1086. LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  1087. HADAMARD48
  1088. "movq %%mm7, 120(%1) \n\t"
  1089. TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
  1090. STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
  1091. "movq 120(%1), %%mm7 \n\t"
  1092. TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
  1093. "movq %%mm7, %%mm5 \n\t"//FIXME remove
  1094. "movq %%mm6, %%mm7 \n\t"
  1095. "movq %%mm0, %%mm6 \n\t"
  1096. // STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
  1097. LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
  1098. // LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
  1099. HADAMARD48
  1100. "movq %%mm7, 64(%1) \n\t"
  1101. MMABS_MMX2(%%mm0, %%mm7)
  1102. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  1103. MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
  1104. MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
  1105. MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
  1106. MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
  1107. MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
  1108. "movq 64(%1), %%mm1 \n\t"
  1109. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  1110. "movq %%mm0, 64(%1) \n\t"
  1111. LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
  1112. LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
  1113. HADAMARD48
  1114. "movq %%mm7, (%1) \n\t"
  1115. MMABS_MMX2(%%mm0, %%mm7)
  1116. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  1117. MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
  1118. MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
  1119. MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
  1120. MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
  1121. MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
  1122. "movq (%1), %%mm1 \n\t"
  1123. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  1124. "movq 64(%1), %%mm1 \n\t"
  1125. MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
  1126. "movq %%mm0, %%mm1 \n\t"
  1127. "psrlq $32, %%mm0 \n\t"
  1128. "paddusw %%mm1, %%mm0 \n\t"
  1129. "movq %%mm0, %%mm1 \n\t"
  1130. "psrlq $16, %%mm0 \n\t"
  1131. "paddusw %%mm1, %%mm0 \n\t"
  1132. "movd %%mm0, %0 \n\t"
  1133. : "=r" (sum)
  1134. : "r"(temp)
  1135. );
  1136. return sum&0xFFFF;
  1137. }
  1138. WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
  1139. WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
  1140. #endif //CONFIG_ENCODERS
  1141. #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
  1142. #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
  1143. #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
  1144. "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
  1145. "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
  1146. "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
  1147. "movq "#in7", " #m3 " \n\t" /* d */\
  1148. "movq "#in0", %%mm5 \n\t" /* D */\
  1149. "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
  1150. "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
  1151. "movq "#in1", %%mm5 \n\t" /* C */\
  1152. "movq "#in2", %%mm6 \n\t" /* B */\
  1153. "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
  1154. "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
  1155. "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
  1156. "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
  1157. "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
  1158. "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
  1159. "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
  1160. "psraw $5, %%mm5 \n\t"\
  1161. "packuswb %%mm5, %%mm5 \n\t"\
  1162. OP(%%mm5, out, %%mm7, d)
  1163. #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
  1164. static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1165. uint64_t temp;\
  1166. \
  1167. asm volatile(\
  1168. "pxor %%mm7, %%mm7 \n\t"\
  1169. "1: \n\t"\
  1170. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  1171. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  1172. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  1173. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  1174. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  1175. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  1176. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  1177. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  1178. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  1179. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  1180. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  1181. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  1182. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  1183. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  1184. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  1185. "paddw %%mm3, %%mm5 \n\t" /* b */\
  1186. "paddw %%mm2, %%mm6 \n\t" /* c */\
  1187. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  1188. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  1189. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  1190. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  1191. "paddw %%mm4, %%mm0 \n\t" /* a */\
  1192. "paddw %%mm1, %%mm5 \n\t" /* d */\
  1193. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  1194. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  1195. "paddw %6, %%mm6 \n\t"\
  1196. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  1197. "psraw $5, %%mm0 \n\t"\
  1198. "movq %%mm0, %5 \n\t"\
  1199. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  1200. \
  1201. "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
  1202. "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
  1203. "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
  1204. "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
  1205. "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
  1206. "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
  1207. "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
  1208. "paddw %%mm0, %%mm2 \n\t" /* b */\
  1209. "paddw %%mm5, %%mm3 \n\t" /* c */\
  1210. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  1211. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  1212. "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
  1213. "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
  1214. "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
  1215. "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
  1216. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  1217. "paddw %%mm2, %%mm1 \n\t" /* a */\
  1218. "paddw %%mm6, %%mm4 \n\t" /* d */\
  1219. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  1220. "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
  1221. "paddw %6, %%mm1 \n\t"\
  1222. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
  1223. "psraw $5, %%mm3 \n\t"\
  1224. "movq %5, %%mm1 \n\t"\
  1225. "packuswb %%mm3, %%mm1 \n\t"\
  1226. OP_MMX2(%%mm1, (%1),%%mm4, q)\
  1227. /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
  1228. \
  1229. "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
  1230. "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
  1231. "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
  1232. "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
  1233. "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
  1234. "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
  1235. "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
  1236. "paddw %%mm1, %%mm5 \n\t" /* b */\
  1237. "paddw %%mm4, %%mm0 \n\t" /* c */\
  1238. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  1239. "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
  1240. "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
  1241. "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
  1242. "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
  1243. "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
  1244. "paddw %%mm3, %%mm2 \n\t" /* d */\
  1245. "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
  1246. "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
  1247. "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
  1248. "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
  1249. "paddw %%mm2, %%mm6 \n\t" /* a */\
  1250. "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
  1251. "paddw %6, %%mm0 \n\t"\
  1252. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  1253. "psraw $5, %%mm0 \n\t"\
  1254. /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
  1255. \
  1256. "paddw %%mm5, %%mm3 \n\t" /* a */\
  1257. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
  1258. "paddw %%mm4, %%mm6 \n\t" /* b */\
  1259. "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
  1260. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
  1261. "paddw %%mm1, %%mm4 \n\t" /* c */\
  1262. "paddw %%mm2, %%mm5 \n\t" /* d */\
  1263. "paddw %%mm6, %%mm6 \n\t" /* 2b */\
  1264. "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
  1265. "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
  1266. "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
  1267. "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
  1268. "paddw %6, %%mm4 \n\t"\
  1269. "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
  1270. "psraw $5, %%mm4 \n\t"\
  1271. "packuswb %%mm4, %%mm0 \n\t"\
  1272. OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
  1273. \
  1274. "addl %3, %0 \n\t"\
  1275. "addl %4, %1 \n\t"\
  1276. "decl %2 \n\t"\
  1277. " jnz 1b \n\t"\
  1278. : "+a"(src), "+c"(dst), "+m"(h)\
  1279. : "d"(srcStride), "S"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  1280. : "memory"\
  1281. );\
  1282. }\
  1283. \
  1284. static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1285. int i;\
  1286. int16_t temp[16];\
  1287. /* quick HACK, XXX FIXME MUST be optimized */\
  1288. for(i=0; i<h; i++)\
  1289. {\
  1290. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  1291. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  1292. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  1293. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  1294. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  1295. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
  1296. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
  1297. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
  1298. temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
  1299. temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
  1300. temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
  1301. temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
  1302. temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
  1303. temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
  1304. temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
  1305. temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
  1306. asm volatile(\
  1307. "movq (%0), %%mm0 \n\t"\
  1308. "movq 8(%0), %%mm1 \n\t"\
  1309. "paddw %2, %%mm0 \n\t"\
  1310. "paddw %2, %%mm1 \n\t"\
  1311. "psraw $5, %%mm0 \n\t"\
  1312. "psraw $5, %%mm1 \n\t"\
  1313. "packuswb %%mm1, %%mm0 \n\t"\
  1314. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  1315. "movq 16(%0), %%mm0 \n\t"\
  1316. "movq 24(%0), %%mm1 \n\t"\
  1317. "paddw %2, %%mm0 \n\t"\
  1318. "paddw %2, %%mm1 \n\t"\
  1319. "psraw $5, %%mm0 \n\t"\
  1320. "psraw $5, %%mm1 \n\t"\
  1321. "packuswb %%mm1, %%mm0 \n\t"\
  1322. OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
  1323. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  1324. : "memory"\
  1325. );\
  1326. dst+=dstStride;\
  1327. src+=srcStride;\
  1328. }\
  1329. }\
  1330. \
  1331. static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1332. uint64_t temp;\
  1333. \
  1334. asm volatile(\
  1335. "pxor %%mm7, %%mm7 \n\t"\
  1336. "1: \n\t"\
  1337. "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
  1338. "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
  1339. "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
  1340. "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
  1341. "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
  1342. "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
  1343. "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
  1344. "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
  1345. "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
  1346. "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
  1347. "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
  1348. "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
  1349. "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
  1350. "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
  1351. "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
  1352. "paddw %%mm3, %%mm5 \n\t" /* b */\
  1353. "paddw %%mm2, %%mm6 \n\t" /* c */\
  1354. "paddw %%mm5, %%mm5 \n\t" /* 2b */\
  1355. "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
  1356. "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
  1357. "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
  1358. "paddw %%mm4, %%mm0 \n\t" /* a */\
  1359. "paddw %%mm1, %%mm5 \n\t" /* d */\
  1360. "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
  1361. "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
  1362. "paddw %6, %%mm6 \n\t"\
  1363. "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
  1364. "psraw $5, %%mm0 \n\t"\
  1365. /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
  1366. \
  1367. "movd 5(%0), %%mm5 \n\t" /* FGHI */\
  1368. "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
  1369. "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
  1370. "paddw %%mm5, %%mm1 \n\t" /* a */\
  1371. "paddw %%mm6, %%mm2 \n\t" /* b */\
  1372. "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
  1373. "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
  1374. "paddw %%mm6, %%mm3 \n\t" /* c */\
  1375. "paddw %%mm5, %%mm4 \n\t" /* d */\
  1376. "paddw %%mm2, %%mm2 \n\t" /* 2b */\
  1377. "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
  1378. "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
  1379. "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
  1380. "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
  1381. "paddw %6, %%mm1 \n\t"\
  1382. "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
  1383. "psraw $5, %%mm3 \n\t"\
  1384. "packuswb %%mm3, %%mm0 \n\t"\
  1385. OP_MMX2(%%mm0, (%1), %%mm4, q)\
  1386. \
  1387. "addl %3, %0 \n\t"\
  1388. "addl %4, %1 \n\t"\
  1389. "decl %2 \n\t"\
  1390. " jnz 1b \n\t"\
  1391. : "+a"(src), "+c"(dst), "+m"(h)\
  1392. : "S"(srcStride), "D"(dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
  1393. : "memory"\
  1394. );\
  1395. }\
  1396. \
  1397. static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  1398. int i;\
  1399. int16_t temp[8];\
  1400. /* quick HACK, XXX FIXME MUST be optimized */\
  1401. for(i=0; i<h; i++)\
  1402. {\
  1403. temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
  1404. temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
  1405. temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
  1406. temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
  1407. temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
  1408. temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
  1409. temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
  1410. temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
  1411. asm volatile(\
  1412. "movq (%0), %%mm0 \n\t"\
  1413. "movq 8(%0), %%mm1 \n\t"\
  1414. "paddw %2, %%mm0 \n\t"\
  1415. "paddw %2, %%mm1 \n\t"\
  1416. "psraw $5, %%mm0 \n\t"\
  1417. "psraw $5, %%mm1 \n\t"\
  1418. "packuswb %%mm1, %%mm0 \n\t"\
  1419. OP_3DNOW(%%mm0, (%1), %%mm1, q)\
  1420. :: "r"(temp), "r"(dst), "m"(ROUNDER)\
  1421. :"memory"\
  1422. );\
  1423. dst+=dstStride;\
  1424. src+=srcStride;\
  1425. }\
  1426. }
  1427. #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
  1428. \
  1429. static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  1430. uint64_t temp[17*4];\
  1431. uint64_t *temp_ptr= temp;\
  1432. int count= 17;\
  1433. \
  1434. /*FIXME unroll */\
  1435. asm volatile(\
  1436. "pxor %%mm7, %%mm7 \n\t"\
  1437. "1: \n\t"\
  1438. "movq (%0), %%mm0 \n\t"\
  1439. "movq (%0), %%mm1 \n\t"\
  1440. "movq 8(%0), %%mm2 \n\t"\
  1441. "movq 8(%0), %%mm3 \n\t"\
  1442. "punpcklbw %%mm7, %%mm0 \n\t"\
  1443. "punpckhbw %%mm7, %%mm1 \n\t"\
  1444. "punpcklbw %%mm7, %%mm2 \n\t"\
  1445. "punpckhbw %%mm7, %%mm3 \n\t"\
  1446. "movq %%mm0, (%1) \n\t"\
  1447. "movq %%mm1, 17*8(%1) \n\t"\
  1448. "movq %%mm2, 2*17*8(%1) \n\t"\
  1449. "movq %%mm3, 3*17*8(%1) \n\t"\
  1450. "addl $8, %1 \n\t"\
  1451. "addl %3, %0 \n\t"\
  1452. "decl %2 \n\t"\
  1453. " jnz 1b \n\t"\
  1454. : "+r" (src), "+r" (temp_ptr), "+r"(count)\
  1455. : "r" (srcStride)\
  1456. : "memory"\
  1457. );\
  1458. \
  1459. temp_ptr= temp;\
  1460. count=4;\
  1461. \
  1462. /*FIXME reorder for speed */\
  1463. asm volatile(\
  1464. /*"pxor %%mm7, %%mm7 \n\t"*/\
  1465. "1: \n\t"\
  1466. "movq (%0), %%mm0 \n\t"\
  1467. "movq 8(%0), %%mm1 \n\t"\
  1468. "movq 16(%0), %%mm2 \n\t"\
  1469. "movq 24(%0), %%mm3 \n\t"\
  1470. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
  1471. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
  1472. "addl %4, %1 \n\t"\
  1473. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
  1474. \
  1475. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
  1476. "addl %4, %1 \n\t"\
  1477. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
  1478. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
  1479. "addl %4, %1 \n\t"\
  1480. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
  1481. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
  1482. "addl %4, %1 \n\t"\
  1483. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
  1484. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
  1485. "addl %4, %1 \n\t"\
  1486. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
  1487. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
  1488. "addl %4, %1 \n\t"\
  1489. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
  1490. \
  1491. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
  1492. "addl %4, %1 \n\t" \
  1493. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
  1494. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
  1495. \
  1496. "addl $136, %0 \n\t"\
  1497. "addl %6, %1 \n\t"\
  1498. "decl %2 \n\t"\
  1499. " jnz 1b \n\t"\
  1500. \
  1501. : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
  1502. : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*dstStride)\
  1503. :"memory"\
  1504. );\
  1505. }\
  1506. \
  1507. static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
  1508. uint64_t temp[9*4];\
  1509. uint64_t *temp_ptr= temp;\
  1510. int count= 9;\
  1511. \
  1512. /*FIXME unroll */\
  1513. asm volatile(\
  1514. "pxor %%mm7, %%mm7 \n\t"\
  1515. "1: \n\t"\
  1516. "movq (%0), %%mm0 \n\t"\
  1517. "movq (%0), %%mm1 \n\t"\
  1518. "punpcklbw %%mm7, %%mm0 \n\t"\
  1519. "punpckhbw %%mm7, %%mm1 \n\t"\
  1520. "movq %%mm0, (%1) \n\t"\
  1521. "movq %%mm1, 9*8(%1) \n\t"\
  1522. "addl $8, %1 \n\t"\
  1523. "addl %3, %0 \n\t"\
  1524. "decl %2 \n\t"\
  1525. " jnz 1b \n\t"\
  1526. : "+r" (src), "+r" (temp_ptr), "+r"(count)\
  1527. : "r" (srcStride)\
  1528. : "memory"\
  1529. );\
  1530. \
  1531. temp_ptr= temp;\
  1532. count=2;\
  1533. \
  1534. /*FIXME reorder for speed */\
  1535. asm volatile(\
  1536. /*"pxor %%mm7, %%mm7 \n\t"*/\
  1537. "1: \n\t"\
  1538. "movq (%0), %%mm0 \n\t"\
  1539. "movq 8(%0), %%mm1 \n\t"\
  1540. "movq 16(%0), %%mm2 \n\t"\
  1541. "movq 24(%0), %%mm3 \n\t"\
  1542. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
  1543. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
  1544. "addl %4, %1 \n\t"\
  1545. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
  1546. \
  1547. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
  1548. "addl %4, %1 \n\t"\
  1549. QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
  1550. \
  1551. QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
  1552. "addl %4, %1 \n\t"\
  1553. QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
  1554. QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
  1555. \
  1556. "addl $72, %0 \n\t"\
  1557. "addl %6, %1 \n\t"\
  1558. "decl %2 \n\t"\
  1559. " jnz 1b \n\t"\
  1560. \
  1561. : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
  1562. : "r"(dstStride), "r"(2*dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*dstStride)\
  1563. : "memory"\
  1564. );\
  1565. }\
  1566. \
  1567. static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
  1568. OPNAME ## pixels8_mmx(dst, src, stride, 8);\
  1569. }\
  1570. \
  1571. static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1572. uint64_t temp[8];\
  1573. uint8_t * const half= (uint8_t*)temp;\
  1574. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
  1575. OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
  1576. }\
  1577. \
  1578. static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1579. OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
  1580. }\
  1581. \
  1582. static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1583. uint64_t temp[8];\
  1584. uint8_t * const half= (uint8_t*)temp;\
  1585. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
  1586. OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
  1587. }\
  1588. \
  1589. static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1590. uint64_t temp[8];\
  1591. uint8_t * const half= (uint8_t*)temp;\
  1592. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
  1593. OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
  1594. }\
  1595. \
  1596. static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1597. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
  1598. }\
  1599. \
  1600. static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1601. uint64_t temp[8];\
  1602. uint8_t * const half= (uint8_t*)temp;\
  1603. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
  1604. OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
  1605. }\
  1606. static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1607. uint64_t half[8 + 9];\
  1608. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1609. uint8_t * const halfHV= ((uint8_t*)half);\
  1610. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1611. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1612. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1613. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1614. }\
  1615. static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1616. uint64_t half[8 + 9];\
  1617. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1618. uint8_t * const halfHV= ((uint8_t*)half);\
  1619. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1620. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1621. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1622. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1623. }\
  1624. static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1625. uint64_t half[8 + 9];\
  1626. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1627. uint8_t * const halfHV= ((uint8_t*)half);\
  1628. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1629. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1630. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1631. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1632. }\
  1633. static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1634. uint64_t half[8 + 9];\
  1635. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1636. uint8_t * const halfHV= ((uint8_t*)half);\
  1637. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1638. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1639. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1640. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1641. }\
  1642. static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1643. uint64_t half[8 + 9];\
  1644. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1645. uint8_t * const halfHV= ((uint8_t*)half);\
  1646. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1647. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1648. OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
  1649. }\
  1650. static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1651. uint64_t half[8 + 9];\
  1652. uint8_t * const halfH= ((uint8_t*)half) + 64;\
  1653. uint8_t * const halfHV= ((uint8_t*)half);\
  1654. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1655. put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  1656. OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
  1657. }\
  1658. static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1659. uint64_t half[8 + 9];\
  1660. uint8_t * const halfH= ((uint8_t*)half);\
  1661. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1662. put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
  1663. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1664. }\
  1665. static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1666. uint64_t half[8 + 9];\
  1667. uint8_t * const halfH= ((uint8_t*)half);\
  1668. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1669. put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
  1670. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1671. }\
  1672. static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1673. uint64_t half[9];\
  1674. uint8_t * const halfH= ((uint8_t*)half);\
  1675. put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
  1676. OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
  1677. }\
  1678. static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
  1679. OPNAME ## pixels16_mmx(dst, src, stride, 16);\
  1680. }\
  1681. \
  1682. static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1683. uint64_t temp[32];\
  1684. uint8_t * const half= (uint8_t*)temp;\
  1685. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
  1686. OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
  1687. }\
  1688. \
  1689. static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1690. OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
  1691. }\
  1692. \
  1693. static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1694. uint64_t temp[32];\
  1695. uint8_t * const half= (uint8_t*)temp;\
  1696. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
  1697. OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
  1698. }\
  1699. \
  1700. static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1701. uint64_t temp[32];\
  1702. uint8_t * const half= (uint8_t*)temp;\
  1703. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
  1704. OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
  1705. }\
  1706. \
  1707. static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1708. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
  1709. }\
  1710. \
  1711. static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1712. uint64_t temp[32];\
  1713. uint8_t * const half= (uint8_t*)temp;\
  1714. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
  1715. OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
  1716. }\
  1717. static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1718. uint64_t half[16*2 + 17*2];\
  1719. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1720. uint8_t * const halfHV= ((uint8_t*)half);\
  1721. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1722. put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
  1723. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1724. OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
  1725. }\
  1726. static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1727. uint64_t half[16*2 + 17*2];\
  1728. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1729. uint8_t * const halfHV= ((uint8_t*)half);\
  1730. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1731. put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
  1732. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1733. OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
  1734. }\
  1735. static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1736. uint64_t half[16*2 + 17*2];\
  1737. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1738. uint8_t * const halfHV= ((uint8_t*)half);\
  1739. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1740. put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
  1741. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1742. OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
  1743. }\
  1744. static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1745. uint64_t half[16*2 + 17*2];\
  1746. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1747. uint8_t * const halfHV= ((uint8_t*)half);\
  1748. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1749. put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
  1750. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1751. OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
  1752. }\
  1753. static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1754. uint64_t half[16*2 + 17*2];\
  1755. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1756. uint8_t * const halfHV= ((uint8_t*)half);\
  1757. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1758. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1759. OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
  1760. }\
  1761. static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1762. uint64_t half[16*2 + 17*2];\
  1763. uint8_t * const halfH= ((uint8_t*)half) + 256;\
  1764. uint8_t * const halfHV= ((uint8_t*)half);\
  1765. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1766. put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
  1767. OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
  1768. }\
  1769. static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1770. uint64_t half[17*2];\
  1771. uint8_t * const halfH= ((uint8_t*)half);\
  1772. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1773. put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
  1774. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  1775. }\
  1776. static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1777. uint64_t half[17*2];\
  1778. uint8_t * const halfH= ((uint8_t*)half);\
  1779. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1780. put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
  1781. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  1782. }\
  1783. static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
  1784. uint64_t half[17*2];\
  1785. uint8_t * const halfH= ((uint8_t*)half);\
  1786. put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
  1787. OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
  1788. }
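/* QPEL_OP generates, besides the two vertical lowpass routines above, one
 * qpelN_mc<dx><dy> function per quarter-pel position (dx, dy in 0..3) for 8x8
 * and 16x16 blocks: mc00 is a plain copy, mc20/mc02 are the pure horizontal/
 * vertical lowpass, and the remaining positions are built by combining the
 * lowpass outputs (and the source) with the pixelsN_l2 averaging helpers. */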
  1789. #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
  1790. #define AVG_3DNOW_OP(a,b,temp, size) \
  1791. "mov" #size " " #b ", " #temp " \n\t"\
  1792. "pavgusb " #temp ", " #a " \n\t"\
  1793. "mov" #size " " #a ", " #b " \n\t"
  1794. #define AVG_MMX2_OP(a,b,temp, size) \
  1795. "mov" #size " " #b ", " #temp " \n\t"\
  1796. "pavgb " #temp ", " #a " \n\t"\
  1797. "mov" #size " " #a ", " #b " \n\t"
  1798. QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
  1799. QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
  1800. QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
  1801. QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
  1802. QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
  1803. QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
  1804. QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
  1805. QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
  1806. QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
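/* put_ and avg_ are instantiated with ff_pw_16 (adds 16 before the >>5 in the
 * lowpass), put_no_rnd_ with ff_pw_15 for the no-rounding variants; the 3dnow
 * and mmx2 instantiations differ only in which averaging instruction the AVG
 * op uses. */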
  1807. #if 0
  1808. static void just_return() { return; }
  1809. #endif
  1810. #define SET_QPEL_FUNC(postfix1, postfix2) \
  1811. c->put_ ## postfix1 = put_ ## postfix2;\
  1812. c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
  1813. c->avg_ ## postfix1 = avg_ ## postfix2;
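/* SET_QPEL_FUNC fills one slot each of the put/put_no_rnd/avg tables; as the
 * calls further down show, qpel_pixels_tab[size][4*dy + dx] receives the
 * qpelN_mc<dx><dy> function for the quarter-pel offset (dx, dy). */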
  1814. static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
  1815. int i=0;
  1816. assert(ABS(scale) < 256);
  1817. scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
  1818. asm volatile(
  1819. "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
  1820. "psrlw $15, %%mm6 \n\t" // 1w
  1821. "pxor %%mm7, %%mm7 \n\t"
  1822. "movd %4, %%mm5 \n\t"
  1823. "punpcklwd %%mm5, %%mm5 \n\t"
  1824. "punpcklwd %%mm5, %%mm5 \n\t"
  1825. "1: \n\t"
  1826. "movq (%1, %0), %%mm0 \n\t"
  1827. "movq 8(%1, %0), %%mm1 \n\t"
  1828. "pmulhw %%mm5, %%mm0 \n\t"
  1829. "pmulhw %%mm5, %%mm1 \n\t"
  1830. "paddw %%mm6, %%mm0 \n\t"
  1831. "paddw %%mm6, %%mm1 \n\t"
  1832. "psraw $1, %%mm0 \n\t"
  1833. "psraw $1, %%mm1 \n\t"
  1834. "paddw (%2, %0), %%mm0 \n\t"
  1835. "paddw 8(%2, %0), %%mm1 \n\t"
  1836. "psraw $6, %%mm0 \n\t"
  1837. "psraw $6, %%mm1 \n\t"
  1838. "pmullw (%3, %0), %%mm0 \n\t"
  1839. "pmullw 8(%3, %0), %%mm1 \n\t"
  1840. "pmaddwd %%mm0, %%mm0 \n\t"
  1841. "pmaddwd %%mm1, %%mm1 \n\t"
  1842. "paddd %%mm1, %%mm0 \n\t"
  1843. "psrld $4, %%mm0 \n\t"
  1844. "paddd %%mm0, %%mm7 \n\t"
  1845. "addl $16, %0 \n\t"
  1846. "cmpl $128, %0 \n\t" //FIXME optimize & bench
  1847. " jb 1b \n\t"
  1848. "movq %%mm7, %%mm6 \n\t"
  1849. "psrlq $32, %%mm7 \n\t"
  1850. "paddd %%mm6, %%mm7 \n\t"
  1851. "psrld $2, %%mm7 \n\t"
  1852. "movd %%mm7, %0 \n\t"
  1853. : "+r" (i)
  1854. : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
  1855. );
  1856. return i;
  1857. }
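/* The loop above roughly evaluates, without modifying rem[], the weighted
 * squared error sum_i( (((rem[i] + s[i]) >> 6) * weight[i])^2 ), scaled down
 * by the trailing shifts, where s[i] is basis[i]*scale rounded exactly as in
 * the scalar fallback of add_8x8basis_mmx() below; this lets the encoder score
 * a candidate basis change before committing it with add_8x8basis(). */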
  1858. static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
  1859. int i=0;
  1860. if(ABS(scale) < 256){
  1861. scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
  1862. asm volatile(
  1863. "pcmpeqw %%mm6, %%mm6 \n\t" // -1w
  1864. "psrlw $15, %%mm6 \n\t" // 1w
  1865. "movd %3, %%mm5 \n\t"
  1866. "punpcklwd %%mm5, %%mm5 \n\t"
  1867. "punpcklwd %%mm5, %%mm5 \n\t"
  1868. "1: \n\t"
  1869. "movq (%1, %0), %%mm0 \n\t"
  1870. "movq 8(%1, %0), %%mm1 \n\t"
  1871. "pmulhw %%mm5, %%mm0 \n\t"
  1872. "pmulhw %%mm5, %%mm1 \n\t"
  1873. "paddw %%mm6, %%mm0 \n\t"
  1874. "paddw %%mm6, %%mm1 \n\t"
  1875. "psraw $1, %%mm0 \n\t"
  1876. "psraw $1, %%mm1 \n\t"
  1877. "paddw (%2, %0), %%mm0 \n\t"
  1878. "paddw 8(%2, %0), %%mm1 \n\t"
  1879. "movq %%mm0, (%2, %0) \n\t"
  1880. "movq %%mm1, 8(%2, %0) \n\t"
  1881. "addl $16, %0 \n\t"
  1882. "cmpl $128, %0 \n\t" //FIXME optimize & bench
  1883. " jb 1b \n\t"
  1884. : "+r" (i)
  1885. : "r"(basis), "r"(rem), "g"(scale)
  1886. );
  1887. }else{
  1888. for(i=0; i<8*8; i++){
  1889. rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
  1890. }
  1891. }
  1892. }
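/* Note that in both routines scale is pre-shifted by 16+1-BASIS_SHIFT+RECON_SHIFT
 * so that pmulhw (high 16 bits of the product), the +1 from %%mm6 and the
 * psraw $1 together reproduce the scalar rounding
 * (basis[i]*scale + (1<<(BASIS_SHIFT-RECON_SHIFT-1))) >> (BASIS_SHIFT-RECON_SHIFT)
 * of the C fallback above, up to the order in which the shifts truncate. */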
  1893. /* external functions, from idct_mmx.c */
  1894. void ff_mmx_idct(DCTELEM *block);
  1895. void ff_mmxext_idct(DCTELEM *block);
1896. /* XXX: these functions should be removed ASAP when all IDCTs are
1897. converted */
  1898. static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
  1899. {
  1900. ff_mmx_idct (block);
  1901. put_pixels_clamped_mmx(block, dest, line_size);
  1902. }
  1903. static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
  1904. {
  1905. ff_mmx_idct (block);
  1906. add_pixels_clamped_mmx(block, dest, line_size);
  1907. }
  1908. static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
  1909. {
  1910. ff_mmxext_idct (block);
  1911. put_pixels_clamped_mmx(block, dest, line_size);
  1912. }
  1913. static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
  1914. {
  1915. ff_mmxext_idct (block);
  1916. add_pixels_clamped_mmx(block, dest, line_size);
  1917. }
  1918. void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
  1919. {
  1920. mm_flags = mm_support();
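/* avctx->dsp_mask lets the caller override CPU detection: with FF_MM_FORCE
 * set, the masked capability bits are forced on, otherwise they are cleared. */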
  1921. if (avctx->dsp_mask) {
  1922. if (avctx->dsp_mask & FF_MM_FORCE)
  1923. mm_flags |= (avctx->dsp_mask & 0xffff);
  1924. else
  1925. mm_flags &= ~(avctx->dsp_mask & 0xffff);
  1926. }
  1927. #if 0
  1928. av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
  1929. if (mm_flags & MM_MMX)
  1930. av_log(avctx, AV_LOG_INFO, " mmx");
  1931. if (mm_flags & MM_MMXEXT)
  1932. av_log(avctx, AV_LOG_INFO, " mmxext");
  1933. if (mm_flags & MM_3DNOW)
  1934. av_log(avctx, AV_LOG_INFO, " 3dnow");
  1935. if (mm_flags & MM_SSE)
  1936. av_log(avctx, AV_LOG_INFO, " sse");
  1937. if (mm_flags & MM_SSE2)
  1938. av_log(avctx, AV_LOG_INFO, " sse2");
  1939. av_log(avctx, AV_LOG_INFO, "\n");
  1940. #endif
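/* The plain MMX versions are installed first; the MM_MMXEXT / MM_3DNOW
 * branches further down then override individual entries with faster variants
 * where available. */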
  1941. if (mm_flags & MM_MMX) {
  1942. const int idct_algo= avctx->idct_algo;
  1943. #ifdef CONFIG_ENCODERS
  1944. const int dct_algo = avctx->dct_algo;
  1945. if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
  1946. if(mm_flags & MM_SSE2){
  1947. c->fdct = ff_fdct_sse2;
  1948. }else if(mm_flags & MM_MMXEXT){
  1949. c->fdct = ff_fdct_mmx2;
  1950. }else{
  1951. c->fdct = ff_fdct_mmx;
  1952. }
  1953. }
  1954. #endif //CONFIG_ENCODERS
  1955. if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
  1956. c->idct_put= ff_simple_idct_put_mmx;
  1957. c->idct_add= ff_simple_idct_add_mmx;
  1958. c->idct = ff_simple_idct_mmx;
  1959. c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
  1960. }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
  1961. if(mm_flags & MM_MMXEXT){
  1962. c->idct_put= ff_libmpeg2mmx2_idct_put;
  1963. c->idct_add= ff_libmpeg2mmx2_idct_add;
  1964. c->idct = ff_mmxext_idct;
  1965. }else{
  1966. c->idct_put= ff_libmpeg2mmx_idct_put;
  1967. c->idct_add= ff_libmpeg2mmx_idct_add;
  1968. c->idct = ff_mmx_idct;
  1969. }
  1970. c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
  1971. }
  1972. /* VP3 optimized DSP functions */
  1973. if (mm_flags & MM_SSE2) {
  1974. c->vp3_dsp_init = vp3_dsp_init_sse2;
  1975. c->vp3_idct = vp3_idct_sse2;
  1976. } else {
  1977. c->vp3_dsp_init = vp3_dsp_init_mmx;
  1978. c->vp3_idct = vp3_idct_mmx;
  1979. }
  1980. #ifdef CONFIG_ENCODERS
  1981. c->get_pixels = get_pixels_mmx;
  1982. c->diff_pixels = diff_pixels_mmx;
  1983. #endif //CONFIG_ENCODERS
  1984. c->put_pixels_clamped = put_pixels_clamped_mmx;
  1985. c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
  1986. c->add_pixels_clamped = add_pixels_clamped_mmx;
  1987. c->clear_blocks = clear_blocks_mmx;
  1988. #ifdef CONFIG_ENCODERS
  1989. c->pix_sum = pix_sum16_mmx;
  1990. #endif //CONFIG_ENCODERS
  1991. c->put_pixels_tab[0][0] = put_pixels16_mmx;
  1992. c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
  1993. c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
  1994. c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
  1995. c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
  1996. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
  1997. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
  1998. c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
  1999. c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
  2000. c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
  2001. c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
  2002. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
  2003. c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
  2004. c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
  2005. c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
  2006. c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
  2007. c->put_pixels_tab[1][0] = put_pixels8_mmx;
  2008. c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
  2009. c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
  2010. c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
  2011. c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
  2012. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
  2013. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
  2014. c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
  2015. c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
  2016. c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
  2017. c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
  2018. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
  2019. c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
  2020. c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
  2021. c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
  2022. c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
  2023. c->add_bytes= add_bytes_mmx;
  2024. #ifdef CONFIG_ENCODERS
  2025. c->diff_bytes= diff_bytes_mmx;
  2026. c->hadamard8_diff[0]= hadamard8_diff16_mmx;
  2027. c->hadamard8_diff[1]= hadamard8_diff_mmx;
  2028. c->pix_norm1 = pix_norm1_mmx;
  2029. c->sse[0] = sse16_mmx;
  2030. c->vsad[4]= vsad_intra16_mmx;
  2031. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  2032. c->vsad[0] = vsad16_mmx;
  2033. }
  2034. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  2035. c->try_8x8basis= try_8x8basis_mmx;
  2036. }
  2037. c->add_8x8basis= add_8x8basis_mmx;
  2038. #endif //CONFIG_ENCODERS
  2039. c->h263_v_loop_filter= h263_v_loop_filter_mmx;
  2040. c->h263_h_loop_filter= h263_h_loop_filter_mmx;
  2041. if (mm_flags & MM_MMXEXT) {
  2042. c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
  2043. c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
  2044. c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
  2045. c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
  2046. c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
  2047. c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
  2048. c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
  2049. c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
  2050. c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
  2051. c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
  2052. #ifdef CONFIG_ENCODERS
  2053. c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
  2054. c->hadamard8_diff[1]= hadamard8_diff_mmx2;
  2055. c->vsad[4]= vsad_intra16_mmx2;
  2056. #endif //CONFIG_ENCODERS
  2057. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  2058. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
  2059. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
  2060. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
  2061. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
  2062. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
  2063. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
  2064. #ifdef CONFIG_ENCODERS
  2065. c->vsad[0] = vsad16_mmx2;
  2066. #endif //CONFIG_ENCODERS
  2067. }
  2068. #if 1
  2069. SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
  2070. SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
  2071. SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
  2072. SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
  2073. SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
  2074. SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
  2075. SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
  2076. SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
  2077. SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
  2078. SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
  2079. SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
  2080. SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
  2081. SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
  2082. SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
  2083. SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
  2084. SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
  2085. SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
  2086. SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
  2087. SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
  2088. SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
  2089. SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
  2090. SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
  2091. SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
  2092. SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
  2093. SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
  2094. SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
  2095. SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
  2096. SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
  2097. SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
  2098. SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
  2099. SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
  2100. SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
  2101. #endif
  2102. #ifdef CONFIG_ENCODERS
  2103. c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
  2104. #endif //CONFIG_ENCODERS
  2105. } else if (mm_flags & MM_3DNOW) {
  2106. c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
  2107. c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
  2108. c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
  2109. c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
  2110. c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
  2111. c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
  2112. c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
  2113. c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
  2114. c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
  2115. c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
  2116. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  2117. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
  2118. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
  2119. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
  2120. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
  2121. c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
  2122. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
  2123. }
  2124. SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
  2125. SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
  2126. SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
  2127. SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
  2128. SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
  2129. SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
  2130. SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
  2131. SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
  2132. SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
  2133. SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
  2134. SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
  2135. SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
  2136. SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
  2137. SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
  2138. SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
  2139. SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
  2140. SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
  2141. SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
  2142. SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
  2143. SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
  2144. SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
  2145. SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
  2146. SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
  2147. SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
  2148. SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
  2149. SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
  2150. SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
  2151. SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
  2152. SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
  2153. SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
  2154. SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
  2155. SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
  2156. }
  2157. }
  2158. #ifdef CONFIG_ENCODERS
  2159. dsputil_init_pix_mmx(c, avctx);
  2160. #endif //CONFIG_ENCODERS
  2161. #if 0
  2162. // for speed testing
  2163. get_pixels = just_return;
  2164. put_pixels_clamped = just_return;
  2165. add_pixels_clamped = just_return;
  2166. pix_abs16x16 = just_return;
  2167. pix_abs16x16_x2 = just_return;
  2168. pix_abs16x16_y2 = just_return;
  2169. pix_abs16x16_xy2 = just_return;
  2170. put_pixels_tab[0] = just_return;
  2171. put_pixels_tab[1] = just_return;
  2172. put_pixels_tab[2] = just_return;
  2173. put_pixels_tab[3] = just_return;
  2174. put_no_rnd_pixels_tab[0] = just_return;
  2175. put_no_rnd_pixels_tab[1] = just_return;
  2176. put_no_rnd_pixels_tab[2] = just_return;
  2177. put_no_rnd_pixels_tab[3] = just_return;
  2178. avg_pixels_tab[0] = just_return;
  2179. avg_pixels_tab[1] = just_return;
  2180. avg_pixels_tab[2] = just_return;
  2181. avg_pixels_tab[3] = just_return;
  2182. avg_no_rnd_pixels_tab[0] = just_return;
  2183. avg_no_rnd_pixels_tab[1] = just_return;
  2184. avg_no_rnd_pixels_tab[2] = just_return;
  2185. avg_no_rnd_pixels_tab[3] = just_return;
  2186. //av_fdct = just_return;
  2187. //ff_idct = just_return;
  2188. #endif
  2189. }