You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1272 lines
64KB

  1. /*
  2. * MMX optimized DSP utils
  3. * Copyright (c) 2000, 2001 Fabrice Bellard
  4. * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  7. *
  8. * This file is part of Libav.
  9. *
  10. * Libav is free software; you can redistribute it and/or
  11. * modify it under the terms of the GNU Lesser General Public
  12. * License as published by the Free Software Foundation; either
  13. * version 2.1 of the License, or (at your option) any later version.
  14. *
  15. * Libav is distributed in the hope that it will be useful,
  16. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. * Lesser General Public License for more details.
  19. *
  20. * You should have received a copy of the GNU Lesser General Public
  21. * License along with Libav; if not, write to the Free Software
  22. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. */
  24. #include "libavutil/attributes.h"
  25. #include "libavutil/cpu.h"
  26. #include "libavutil/x86/asm.h"
  27. #include "libavcodec/dsputil.h"
  28. #include "libavcodec/h264dsp.h"
  29. #include "libavcodec/mpegvideo.h"
  30. #include "libavcodec/simple_idct.h"
  31. #include "dsputil_mmx.h"
  32. #include "idct_xvid.h"
  33. //#undef NDEBUG
  34. //#include <assert.h>
  35. /* pixel operations */
  /* Pixel-operation constants consumed by the MMX/SSE routines below:
   * ff_pw_* are 16-bit words replicated across a 64-bit (MMX) or 128-bit
   * (SSE2, xmm_reg) register, ff_pb_* are replicated bytes, and ff_pd_*
   * are packed doubles. Names encode the replicated value in decimal
   * (e.g. ff_pw_17 holds 0x0011 == 17 in every word lane). */
  36. DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
  37. DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
  38. DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
  39. DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
  40. DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
  41. DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
  42. DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
  43. DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
  44. DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
  45. DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
  46. DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
  47. DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
  48. DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
  /* Prototypes for MMXEXT routines implemented in external assembly
   * (built under HAVE_YASM, see below):
   *  - *_pixels*_l2_*      average/copy of two source planes into dst;
   *  - *_mpeg4_qpel*_h/v_* MPEG-4 quarter-pel 8-tap lowpass filters,
   *    horizontal and vertical, in 8- and 16-pixel widths, with put/avg
   *    and rounding / no-rounding variants. */
  49. void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  50. int dstStride, int src1Stride, int h);
  51. void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
  52. uint8_t *src2, int dstStride,
  53. int src1Stride, int h);
  54. void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  55. int dstStride, int src1Stride, int h);
  56. void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  57. int dstStride, int src1Stride, int h);
  58. void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  59. int dstStride, int src1Stride, int h);
  60. void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  61. int dstStride, int src1Stride, int h);
  62. void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  63. int dstStride, int srcStride, int h);
  64. void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  65. int dstStride, int srcStride, int h);
  66. void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  67. int dstStride, int srcStride,
  68. int h);
  69. void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  70. int dstStride, int srcStride, int h);
  71. void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  72. int dstStride, int srcStride, int h);
  73. void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  74. int dstStride, int srcStride,
  75. int h);
  76. void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  77. int dstStride, int srcStride);
  78. void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  79. int dstStride, int srcStride);
  80. void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  81. int dstStride, int srcStride);
  82. void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  83. int dstStride, int srcStride);
  84. void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  85. int dstStride, int srcStride);
  86. void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
  87. int dstStride, int srcStride);
  /* For full-pel copies, the "no rounding" variant is identical to the
   * rounding one (no interpolation happens), so alias it. */
  88. #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
  89. #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
  90. #if HAVE_INLINE_ASM
  91. /***********************************/
  92. /* standard MMX */
  /* Store an 8x8 block of 16-bit coefficients as unsigned 8-bit pixels,
   * saturating each value to [0,255] via packuswb. The block is written
   * to 'pixels' four rows at a time using line_size as the row stride
   * (line_size*3 is precomputed for the fourth row's addressing).
   * Leaves MMX state dirty; caller is responsible for emms. */
  93. void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
  94. int line_size)
  95. {
  96. const int16_t *p;
  97. uint8_t *pix;
  98. /* read the pixels */
  99. p = block;
  100. pix = pixels;
  101. /* unrolled loop */
  /* First half: rows 0-3 (64 int16 coefficients -> 32 output bytes). */
  102. __asm__ volatile (
  103. "movq (%3), %%mm0 \n\t"
  104. "movq 8(%3), %%mm1 \n\t"
  105. "movq 16(%3), %%mm2 \n\t"
  106. "movq 24(%3), %%mm3 \n\t"
  107. "movq 32(%3), %%mm4 \n\t"
  108. "movq 40(%3), %%mm5 \n\t"
  109. "movq 48(%3), %%mm6 \n\t"
  110. "movq 56(%3), %%mm7 \n\t"
  111. "packuswb %%mm1, %%mm0 \n\t"
  112. "packuswb %%mm3, %%mm2 \n\t"
  113. "packuswb %%mm5, %%mm4 \n\t"
  114. "packuswb %%mm7, %%mm6 \n\t"
  115. "movq %%mm0, (%0) \n\t"
  116. "movq %%mm2, (%0, %1) \n\t"
  117. "movq %%mm4, (%0, %1, 2) \n\t"
  118. "movq %%mm6, (%0, %2) \n\t"
  119. :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
  120. "r"(p)
  121. : "memory");
  122. pix += line_size * 4;
  123. p += 32;
  124. // if here would be an exact copy of the code above
  125. // compiler would generate some very strange code
  126. // thus using "r"
  /* Second half: rows 4-7, same pattern with advanced pointers. */
  127. __asm__ volatile (
  128. "movq (%3), %%mm0 \n\t"
  129. "movq 8(%3), %%mm1 \n\t"
  130. "movq 16(%3), %%mm2 \n\t"
  131. "movq 24(%3), %%mm3 \n\t"
  132. "movq 32(%3), %%mm4 \n\t"
  133. "movq 40(%3), %%mm5 \n\t"
  134. "movq 48(%3), %%mm6 \n\t"
  135. "movq 56(%3), %%mm7 \n\t"
  136. "packuswb %%mm1, %%mm0 \n\t"
  137. "packuswb %%mm3, %%mm2 \n\t"
  138. "packuswb %%mm5, %%mm4 \n\t"
  139. "packuswb %%mm7, %%mm6 \n\t"
  140. "movq %%mm0, (%0) \n\t"
  141. "movq %%mm2, (%0, %1) \n\t"
  142. "movq %%mm4, (%0, %1, 2) \n\t"
  143. "movq %%mm6, (%0, %2) \n\t"
  144. :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
  145. : "memory");
  146. }
  /* Asm fragment: pack 4 rows of signed 16-bit coefficients starting at
   * byte offset 'off' of the block (%2), saturate with packsswb, add the
   * 0x80 bias held in mm0 (maps signed [-128,127] to unsigned [0,255]),
   * and store 4 rows at %0 using %3 = line_skip, %1 = line_skip * 3. */
  147. #define put_signed_pixels_clamped_mmx_half(off) \
  148. "movq "#off"(%2), %%mm1 \n\t" \
  149. "movq 16 + "#off"(%2), %%mm2 \n\t" \
  150. "movq 32 + "#off"(%2), %%mm3 \n\t" \
  151. "movq 48 + "#off"(%2), %%mm4 \n\t" \
  152. "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
  153. "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
  154. "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
  155. "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
  156. "paddb %%mm0, %%mm1 \n\t" \
  157. "paddb %%mm0, %%mm2 \n\t" \
  158. "paddb %%mm0, %%mm3 \n\t" \
  159. "paddb %%mm0, %%mm4 \n\t" \
  160. "movq %%mm1, (%0) \n\t" \
  161. "movq %%mm2, (%0, %3) \n\t" \
  162. "movq %%mm3, (%0, %3, 2) \n\t" \
  163. "movq %%mm4, (%0, %1) \n\t"
  /* Store an 8x8 block of signed 16-bit coefficients as pixels, clamped
   * to [-128,127] then biased by +128 (ff_pb_80). Two 4-row halves via
   * put_signed_pixels_clamped_mmx_half; line_skip3 (= line_skip * 3) is
   * computed in-asm by the lea. */
  164. void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
  165. int line_size)
  166. {
  167. x86_reg line_skip = line_size;
  168. x86_reg line_skip3;
  169. __asm__ volatile (
  170. "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
  171. "lea (%3, %3, 2), %1 \n\t"
  172. put_signed_pixels_clamped_mmx_half(0)
  173. "lea (%0, %3, 4), %0 \n\t"
  174. put_signed_pixels_clamped_mmx_half(64)
  175. : "+&r"(pixels), "=&r"(line_skip3)
  176. : "r"(block), "r"(line_skip)
  177. : "memory");
  178. }
  /* Add an 8x8 block of 16-bit residuals to the existing pixels, with
   * unsigned saturation on the result (packuswb). Processes two rows
   * per iteration, four iterations total. mm7 is kept zero for the
   * byte<->word unpacking. */
  179. void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
  180. int line_size)
  181. {
  182. const int16_t *p;
  183. uint8_t *pix;
  184. int i;
  185. /* read the pixels */
  186. p = block;
  187. pix = pixels;
  188. MOVQ_ZERO(mm7);
  189. i = 4;
  190. do {
  191. __asm__ volatile (
  192. "movq (%2), %%mm0 \n\t"
  193. "movq 8(%2), %%mm1 \n\t"
  194. "movq 16(%2), %%mm2 \n\t"
  195. "movq 24(%2), %%mm3 \n\t"
  196. "movq %0, %%mm4 \n\t"
  197. "movq %1, %%mm6 \n\t"
  198. "movq %%mm4, %%mm5 \n\t"
  199. "punpcklbw %%mm7, %%mm4 \n\t"
  200. "punpckhbw %%mm7, %%mm5 \n\t"
  201. "paddsw %%mm4, %%mm0 \n\t"
  202. "paddsw %%mm5, %%mm1 \n\t"
  203. "movq %%mm6, %%mm5 \n\t"
  204. "punpcklbw %%mm7, %%mm6 \n\t"
  205. "punpckhbw %%mm7, %%mm5 \n\t"
  206. "paddsw %%mm6, %%mm2 \n\t"
  207. "paddsw %%mm5, %%mm3 \n\t"
  208. "packuswb %%mm1, %%mm0 \n\t"
  209. "packuswb %%mm3, %%mm2 \n\t"
  210. "movq %%mm0, %0 \n\t"
  211. "movq %%mm2, %1 \n\t"
  212. : "+m"(*pix), "+m"(*(pix + line_size))
  213. : "r"(p)
  214. : "memory");
  215. pix += line_size * 2;
  216. p += 16;
  217. } while (--i);
  218. }
  /* Generate a function that zeroes n consecutive 128-byte (64-coefficient)
   * int16 blocks with MMX stores. The loop counter starts at -128*n and
   * counts up to 0, indexing from the end of the buffer. */
  219. #define CLEAR_BLOCKS(name, n) \
  220. static void name(int16_t *blocks) \
  221. { \
  222. __asm__ volatile ( \
  223. "pxor %%mm7, %%mm7 \n\t" \
  224. "mov %1, %%"REG_a" \n\t" \
  225. "1: \n\t" \
  226. "movq %%mm7, (%0, %%"REG_a") \n\t" \
  227. "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
  228. "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
  229. "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
  230. "add $32, %%"REG_a" \n\t" \
  231. "js 1b \n\t" \
  232. :: "r"(((uint8_t *)blocks) + 128 * n), \
  233. "i"(-128 * n) \
  234. : "%"REG_a \
  235. ); \
  236. }
  /* 6 blocks = one full macroblock (4 luma + 2 chroma); 1 = single block. */
  237. CLEAR_BLOCKS(clear_blocks_mmx, 6)
  238. CLEAR_BLOCKS(clear_block_mmx, 1)
  /* Zero a single 64-coefficient int16 block (128 bytes) with eight
   * 16-byte SSE stores; 'block' is assumed 16-byte aligned (movaps). */
  239. static void clear_block_sse(int16_t *block)
  240. {
  241. __asm__ volatile (
  242. "xorps %%xmm0, %%xmm0 \n"
  243. "movaps %%xmm0, (%0) \n"
  244. "movaps %%xmm0, 16(%0) \n"
  245. "movaps %%xmm0, 32(%0) \n"
  246. "movaps %%xmm0, 48(%0) \n"
  247. "movaps %%xmm0, 64(%0) \n"
  248. "movaps %%xmm0, 80(%0) \n"
  249. "movaps %%xmm0, 96(%0) \n"
  250. "movaps %%xmm0, 112(%0) \n"
  251. :: "r"(block)
  252. : "memory"
  253. );
  254. }
  /* Zero six consecutive int16 blocks (6 * 128 bytes) with aligned SSE
   * stores; same negative-index-counting-up loop as CLEAR_BLOCKS. */
  255. static void clear_blocks_sse(int16_t *blocks)
  256. {
  257. __asm__ volatile (
  258. "xorps %%xmm0, %%xmm0 \n"
  259. "mov %1, %%"REG_a" \n"
  260. "1: \n"
  261. "movaps %%xmm0, (%0, %%"REG_a") \n"
  262. "movaps %%xmm0, 16(%0, %%"REG_a") \n"
  263. "movaps %%xmm0, 32(%0, %%"REG_a") \n"
  264. "movaps %%xmm0, 48(%0, %%"REG_a") \n"
  265. "movaps %%xmm0, 64(%0, %%"REG_a") \n"
  266. "movaps %%xmm0, 80(%0, %%"REG_a") \n"
  267. "movaps %%xmm0, 96(%0, %%"REG_a") \n"
  268. "movaps %%xmm0, 112(%0, %%"REG_a") \n"
  269. "add $128, %%"REG_a" \n"
  270. "js 1b \n"
  271. :: "r"(((uint8_t *)blocks) + 128 * 6),
  272. "i"(-128 * 6)
  273. : "%"REG_a
  274. );
  275. }
  /* dst[i] += src[i] (bytewise, wrapping) for i in [0, w). The MMX loop
   * handles 16 bytes per iteration while i < w - 15; the scalar loop
   * afterwards finishes the remaining tail bytes. */
  276. static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
  277. {
  278. x86_reg i = 0;
  279. __asm__ volatile (
  280. "jmp 2f \n\t"
  281. "1: \n\t"
  282. "movq (%1, %0), %%mm0 \n\t"
  283. "movq (%2, %0), %%mm1 \n\t"
  284. "paddb %%mm0, %%mm1 \n\t"
  285. "movq %%mm1, (%2, %0) \n\t"
  286. "movq 8(%1, %0), %%mm0 \n\t"
  287. "movq 8(%2, %0), %%mm1 \n\t"
  288. "paddb %%mm0, %%mm1 \n\t"
  289. "movq %%mm1, 8(%2, %0) \n\t"
  290. "add $16, %0 \n\t"
  291. "2: \n\t"
  292. "cmp %3, %0 \n\t"
  293. "js 1b \n\t"
  294. : "+r"(i)
  295. : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
  296. );
  /* Scalar tail: up to 15 remaining bytes. */
  297. for ( ; i < w; i++)
  298. dst[i + 0] += src[i + 0];
  299. }
  300. #if HAVE_7REGS
  /* HuffYUV median prediction: for each x, predict with
   * mid_pred(left, top[x], left + top[x] - top_left), add diff[x], and
   * store to dst[x]; the running left/left_top state is carried in l/tl
   * and written back on exit. The median is computed branchlessly with
   * cmov; w2 counts from -w up to 0 as the index. Needs 7 GP registers,
   * hence the HAVE_7REGS guard. */
  301. static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
  302. const uint8_t *diff, int w,
  303. int *left, int *left_top)
  304. {
  305. x86_reg w2 = -w;
  306. x86_reg x;
  307. int l = *left & 0xff;
  308. int tl = *left_top & 0xff;
  309. int t;
  310. __asm__ volatile (
  311. "mov %7, %3 \n"
  312. "1: \n"
  313. "movzbl (%3, %4), %2 \n"
  314. "mov %2, %k3 \n"
  315. "sub %b1, %b3 \n"
  316. "add %b0, %b3 \n"
  317. "mov %2, %1 \n"
  318. "cmp %0, %2 \n"
  319. "cmovg %0, %2 \n"
  320. "cmovg %1, %0 \n"
  321. "cmp %k3, %0 \n"
  322. "cmovg %k3, %0 \n"
  323. "mov %7, %3 \n"
  324. "cmp %2, %0 \n"
  325. "cmovl %2, %0 \n"
  326. "add (%6, %4), %b0 \n"
  327. "mov %b0, (%5, %4) \n"
  328. "inc %4 \n"
  329. "jl 1b \n"
  330. : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
  331. : "r"(dst + w), "r"(diff + w), "rm"(top + w)
  332. );
  333. *left = l;
  334. *left_top = tl;
  335. }
  336. #endif
  337. /* Draw the edges of width 'w' of an image of size width, height
  338. * this MMX version can only handle w == 8 || w == 16. */
  /* Replicates the border pixels of the picture outwards: the first/last
   * pixel of each row is smeared left/right by w bytes, then (depending
   * on 'sides') whole top/bottom rows are copied up/down h times,
   * 4 rows per iteration. */
  339. static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
  340. int w, int h, int sides)
  341. {
  342. uint8_t *ptr, *last_line;
  343. int i;
  344. last_line = buf + (height - 1) * wrap;
  345. /* left and right */
  346. ptr = buf;
  347. if (w == 8) {
  /* Broadcast the first byte of the row into 8 bytes (punpck chain) and
   * store left of the row; mirror for the last byte on the right side. */
  348. __asm__ volatile (
  349. "1: \n\t"
  350. "movd (%0), %%mm0 \n\t"
  351. "punpcklbw %%mm0, %%mm0 \n\t"
  352. "punpcklwd %%mm0, %%mm0 \n\t"
  353. "punpckldq %%mm0, %%mm0 \n\t"
  354. "movq %%mm0, -8(%0) \n\t"
  355. "movq -8(%0, %2), %%mm1 \n\t"
  356. "punpckhbw %%mm1, %%mm1 \n\t"
  357. "punpckhwd %%mm1, %%mm1 \n\t"
  358. "punpckhdq %%mm1, %%mm1 \n\t"
  359. "movq %%mm1, (%0, %2) \n\t"
  360. "add %1, %0 \n\t"
  361. "cmp %3, %0 \n\t"
  362. "jb 1b \n\t"
  363. : "+r"(ptr)
  364. : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
  365. );
  366. } else {
  /* w == 16: same as above but two movq stores per side. */
  367. __asm__ volatile (
  368. "1: \n\t"
  369. "movd (%0), %%mm0 \n\t"
  370. "punpcklbw %%mm0, %%mm0 \n\t"
  371. "punpcklwd %%mm0, %%mm0 \n\t"
  372. "punpckldq %%mm0, %%mm0 \n\t"
  373. "movq %%mm0, -8(%0) \n\t"
  374. "movq %%mm0, -16(%0) \n\t"
  375. "movq -8(%0, %2), %%mm1 \n\t"
  376. "punpckhbw %%mm1, %%mm1 \n\t"
  377. "punpckhwd %%mm1, %%mm1 \n\t"
  378. "punpckhdq %%mm1, %%mm1 \n\t"
  379. "movq %%mm1, (%0, %2) \n\t"
  380. "movq %%mm1, 8(%0, %2) \n\t"
  381. "add %1, %0 \n\t"
  382. "cmp %3, %0 \n\t"
  383. "jb 1b \n\t"
  384. : "+r"(ptr)
  385. : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
  386. );
  387. }
  388. /* top and bottom (and hopefully also the corners) */
  389. if (sides & EDGE_TOP) {
  390. for (i = 0; i < h; i += 4) {
  391. ptr = buf - (i + 1) * wrap - w;
  /* Copy the first picture row (including smeared corners) into the
   * 4 rows above it. */
  392. __asm__ volatile (
  393. "1: \n\t"
  394. "movq (%1, %0), %%mm0 \n\t"
  395. "movq %%mm0, (%0) \n\t"
  396. "movq %%mm0, (%0, %2) \n\t"
  397. "movq %%mm0, (%0, %2, 2) \n\t"
  398. "movq %%mm0, (%0, %3) \n\t"
  399. "add $8, %0 \n\t"
  400. "cmp %4, %0 \n\t"
  401. "jb 1b \n\t"
  402. : "+r"(ptr)
  403. : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
  404. "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
  405. );
  406. }
  407. }
  408. if (sides & EDGE_BOTTOM) {
  409. for (i = 0; i < h; i += 4) {
  410. ptr = last_line + (i + 1) * wrap - w;
  /* Same, copying the last picture row into the 4 rows below it. */
  411. __asm__ volatile (
  412. "1: \n\t"
  413. "movq (%1, %0), %%mm0 \n\t"
  414. "movq %%mm0, (%0) \n\t"
  415. "movq %%mm0, (%0, %2) \n\t"
  416. "movq %%mm0, (%0, %2, 2) \n\t"
  417. "movq %%mm0, (%0, %3) \n\t"
  418. "add $8, %0 \n\t"
  419. "cmp %4, %0 \n\t"
  420. "jb 1b \n\t"
  421. : "+r"(ptr)
  422. : "r"((x86_reg)last_line - (x86_reg)ptr - w),
  423. "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
  424. "r"(ptr + width + 2 * w)
  425. );
  426. }
  427. }
  428. }
  429. #endif /* HAVE_INLINE_ASM */
  430. #if HAVE_YASM
  431. static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
  432. int line_size, int h)
  433. {
  434. ff_avg_pixels8_mmxext(block, pixels, line_size, h);
  435. ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
  436. }
  437. static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
  438. ptrdiff_t line_size, int h)
  439. {
  440. ff_put_pixels8_mmxext(block, pixels, line_size, h);
  441. ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
  442. }
  /* Generate the full set of MPEG-4 quarter-pel motion-compensation
   * functions (qpel8/qpel16, subpel positions mc00..mc33) for one
   * operation flavour:
   *   OPNAME  - "put_", "avg_" or "put_no_rnd_" (final store operation)
   *   ROUNDER - rounding constant name (passed for naming consistency;
   *             not referenced in these bodies)
   *   RND     - "_" or "_no_rnd_", selects the rounding variant of the
   *             lowpass/l2 helpers
   *   MMX     - instruction-set suffix ("mmxext")
   * Position mcXY means X/4-pel horizontal, Y/4-pel vertical offset:
   * mc00 is a plain copy; mcX0/mc0Y use one lowpass pass plus averaging
   * with the source; mc2Y/mcX2 use the half-pel filter directly; the
   * corner cases (mc11/13/31/33, mc21/23, mc12/32) filter horizontally
   * into a temp (halfH), optionally average with the source, then
   * filter/average vertically into the destination. */
  443. #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
  444. static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
  445. ptrdiff_t stride) \
  446. { \
  447. ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
  448. } \
  449. \
  450. static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
  451. ptrdiff_t stride) \
  452. { \
  453. uint64_t temp[8]; \
  454. uint8_t * const half = (uint8_t*)temp; \
  455. ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
  456. stride, 8); \
  457. ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
  458. stride, stride, 8); \
  459. } \
  460. \
  461. static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
  462. ptrdiff_t stride) \
  463. { \
  464. ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
  465. stride, 8); \
  466. } \
  467. \
  468. static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
  469. ptrdiff_t stride) \
  470. { \
  471. uint64_t temp[8]; \
  472. uint8_t * const half = (uint8_t*)temp; \
  473. ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
  474. stride, 8); \
  475. ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
  476. stride, 8); \
  477. } \
  478. \
  479. static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
  480. ptrdiff_t stride) \
  481. { \
  482. uint64_t temp[8]; \
  483. uint8_t * const half = (uint8_t*)temp; \
  484. ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
  485. 8, stride); \
  486. ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
  487. stride, stride, 8); \
  488. } \
  489. \
  490. static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
  491. ptrdiff_t stride) \
  492. { \
  493. ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
  494. stride, stride); \
  495. } \
  496. \
  497. static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
  498. ptrdiff_t stride) \
  499. { \
  500. uint64_t temp[8]; \
  501. uint8_t * const half = (uint8_t*)temp; \
  502. ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
  503. 8, stride); \
  504. ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
  505. stride, 8); \
  506. } \
  507. \
  508. static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
  509. ptrdiff_t stride) \
  510. { \
  511. uint64_t half[8 + 9]; \
  512. uint8_t * const halfH = ((uint8_t*)half) + 64; \
  513. uint8_t * const halfHV = ((uint8_t*)half); \
  514. ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
  515. stride, 9); \
  516. ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
  517. stride, 9); \
  518. ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  519. ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
  520. stride, 8, 8); \
  521. } \
  522. \
  523. static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
  524. ptrdiff_t stride) \
  525. { \
  526. uint64_t half[8 + 9]; \
  527. uint8_t * const halfH = ((uint8_t*)half) + 64; \
  528. uint8_t * const halfHV = ((uint8_t*)half); \
  529. ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
  530. stride, 9); \
  531. ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
  532. stride, 9); \
  533. ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  534. ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
  535. stride, 8, 8); \
  536. } \
  537. \
  538. static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
  539. ptrdiff_t stride) \
  540. { \
  541. uint64_t half[8 + 9]; \
  542. uint8_t * const halfH = ((uint8_t*)half) + 64; \
  543. uint8_t * const halfHV = ((uint8_t*)half); \
  544. ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
  545. stride, 9); \
  546. ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
  547. stride, 9); \
  548. ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  549. ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
  550. stride, 8, 8); \
  551. } \
  552. \
  553. static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
  554. ptrdiff_t stride) \
  555. { \
  556. uint64_t half[8 + 9]; \
  557. uint8_t * const halfH = ((uint8_t*)half) + 64; \
  558. uint8_t * const halfHV = ((uint8_t*)half); \
  559. ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
  560. stride, 9); \
  561. ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
  562. stride, 9); \
  563. ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  564. ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
  565. stride, 8, 8); \
  566. } \
  567. \
  568. static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
  569. ptrdiff_t stride) \
  570. { \
  571. uint64_t half[8 + 9]; \
  572. uint8_t * const halfH = ((uint8_t*)half) + 64; \
  573. uint8_t * const halfHV = ((uint8_t*)half); \
  574. ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
  575. stride, 9); \
  576. ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  577. ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
  578. stride, 8, 8); \
  579. } \
  580. \
  581. static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
  582. ptrdiff_t stride) \
  583. { \
  584. uint64_t half[8 + 9]; \
  585. uint8_t * const halfH = ((uint8_t*)half) + 64; \
  586. uint8_t * const halfHV = ((uint8_t*)half); \
  587. ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
  588. stride, 9); \
  589. ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
  590. ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
  591. stride, 8, 8); \
  592. } \
  593. \
  594. static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
  595. ptrdiff_t stride) \
  596. { \
  597. uint64_t half[8 + 9]; \
  598. uint8_t * const halfH = ((uint8_t*)half); \
  599. ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
  600. stride, 9); \
  601. ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
  602. 8, stride, 9); \
  603. ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
  604. stride, 8); \
  605. } \
  606. \
  607. static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
  608. ptrdiff_t stride) \
  609. { \
  610. uint64_t half[8 + 9]; \
  611. uint8_t * const halfH = ((uint8_t*)half); \
  612. ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
  613. stride, 9); \
  614. ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
  615. stride, 9); \
  616. ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
  617. stride, 8); \
  618. } \
  619. \
  620. static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
  621. ptrdiff_t stride) \
  622. { \
  623. uint64_t half[9]; \
  624. uint8_t * const halfH = ((uint8_t*)half); \
  625. ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
  626. stride, 9); \
  627. ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
  628. stride, 8); \
  629. } \
  630. \
  631. static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
  632. ptrdiff_t stride) \
  633. { \
  634. ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
  635. } \
  636. \
  637. static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
  638. ptrdiff_t stride) \
  639. { \
  640. uint64_t temp[32]; \
  641. uint8_t * const half = (uint8_t*)temp; \
  642. ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
  643. stride, 16); \
  644. ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
  645. stride, 16); \
  646. } \
  647. \
  648. static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
  649. ptrdiff_t stride) \
  650. { \
  651. ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
  652. stride, stride, 16);\
  653. } \
  654. \
  655. static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
  656. ptrdiff_t stride) \
  657. { \
  658. uint64_t temp[32]; \
  659. uint8_t * const half = (uint8_t*)temp; \
  660. ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
  661. stride, 16); \
  662. ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
  663. stride, stride, 16); \
  664. } \
  665. \
  666. static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
  667. ptrdiff_t stride) \
  668. { \
  669. uint64_t temp[32]; \
  670. uint8_t * const half = (uint8_t*)temp; \
  671. ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
  672. stride); \
  673. ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
  674. stride, 16); \
  675. } \
  676. \
  677. static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
  678. ptrdiff_t stride) \
  679. { \
  680. ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
  681. stride, stride); \
  682. } \
  683. \
  684. static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
  685. ptrdiff_t stride) \
  686. { \
  687. uint64_t temp[32]; \
  688. uint8_t * const half = (uint8_t*)temp; \
  689. ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
  690. stride); \
  691. ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
  692. stride, stride, 16); \
  693. } \
  694. \
  695. static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
  696. ptrdiff_t stride) \
  697. { \
  698. uint64_t half[16 * 2 + 17 * 2]; \
  699. uint8_t * const halfH = ((uint8_t*)half) + 256; \
  700. uint8_t * const halfHV = ((uint8_t*)half); \
  701. ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
  702. stride, 17); \
  703. ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
  704. stride, 17); \
  705. ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
  706. 16, 16); \
  707. ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
  708. stride, 16, 16); \
  709. } \
  710. \
  711. static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
  712. ptrdiff_t stride) \
  713. { \
  714. uint64_t half[16 * 2 + 17 * 2]; \
  715. uint8_t * const halfH = ((uint8_t*)half) + 256; \
  716. uint8_t * const halfHV = ((uint8_t*)half); \
  717. ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
  718. stride, 17); \
  719. ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
  720. stride, 17); \
  721. ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
  722. 16, 16); \
  723. ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
  724. stride, 16, 16); \
  725. } \
  726. \
  727. static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
  728. ptrdiff_t stride) \
  729. { \
  730. uint64_t half[16 * 2 + 17 * 2]; \
  731. uint8_t * const halfH = ((uint8_t*)half) + 256; \
  732. uint8_t * const halfHV = ((uint8_t*)half); \
  733. ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
  734. stride, 17); \
  735. ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
  736. stride, 17); \
  737. ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
  738. 16, 16); \
  739. ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
  740. stride, 16, 16); \
  741. } \
  742. \
  743. static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
  744. ptrdiff_t stride) \
  745. { \
  746. uint64_t half[16 * 2 + 17 * 2]; \
  747. uint8_t * const halfH = ((uint8_t*)half) + 256; \
  748. uint8_t * const halfHV = ((uint8_t*)half); \
  749. ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
  750. stride, 17); \
  751. ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
  752. stride, 17); \
  753. ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
  754. 16, 16); \
  755. ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
  756. stride, 16, 16); \
  757. } \
  758. \
  759. static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
  760. ptrdiff_t stride) \
  761. { \
  762. uint64_t half[16 * 2 + 17 * 2]; \
  763. uint8_t * const halfH = ((uint8_t*)half) + 256; \
  764. uint8_t * const halfHV = ((uint8_t*)half); \
  765. ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
  766. stride, 17); \
  767. ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
  768. 16, 16); \
  769. ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
  770. stride, 16, 16); \
  771. } \
  772. \
  773. static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
  774. ptrdiff_t stride) \
  775. { \
  776. uint64_t half[16 * 2 + 17 * 2]; \
  777. uint8_t * const halfH = ((uint8_t*)half) + 256; \
  778. uint8_t * const halfHV = ((uint8_t*)half); \
  779. ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
  780. stride, 17); \
  781. ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
  782. 16, 16); \
  783. ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
  784. stride, 16, 16); \
  785. } \
  786. \
  787. static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
  788. ptrdiff_t stride) \
  789. { \
  790. uint64_t half[17 * 2]; \
  791. uint8_t * const halfH = ((uint8_t*)half); \
  792. ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
  793. stride, 17); \
  794. ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
  795. stride, 17); \
  796. ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
  797. stride, 16); \
  798. } \
  799. \
  800. static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
  801. ptrdiff_t stride) \
  802. { \
  803. uint64_t half[17 * 2]; \
  804. uint8_t * const halfH = ((uint8_t*)half); \
  805. ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
  806. stride, 17); \
  807. ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
  808. stride, 17); \
  809. ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
  810. stride, 16); \
  811. } \
  812. \
  813. static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
  814. ptrdiff_t stride) \
  815. { \
  816. uint64_t half[17 * 2]; \
  817. uint8_t * const halfH = ((uint8_t*)half); \
  818. ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
  819. stride, 17); \
  820. ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
  821. stride, 16); \
  822. }
  /* Instantiate the three MMXEXT flavours: put (rounding), avg
   * (rounding), and put with no-rounding helpers. */
  823. QPEL_OP(put_, ff_pw_16, _, mmxext)
  824. QPEL_OP(avg_, ff_pw_16, _, mmxext)
  825. QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
  826. #endif /* HAVE_YASM */
  827. #if HAVE_INLINE_ASM
/* Global motion compensation, MMX version.
 * Produces one w x h (w == 8) destination block by bilinearly interpolating
 * source pixels along the affine trajectory defined by the start offset
 * (ox, oy) and the per-pixel increments dxx/dxy/dyx/dyy (fixed point,
 * 16 + shift fractional bits).  Inputs the MMX path cannot handle are
 * delegated to the generic C implementation ff_gmc_c(). */
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
{
    const int w = 8;
    /* full-pel part of the block's start position */
    const int ix = ox >> (16 + shift);
    const int iy = oy >> (16 + shift);
    /* sub-pel components reduced by 4 bits so they fit the packed
     * 16-bit word arithmetic used below */
    const int oxs = ox >> 4;
    const int oys = oy >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4] = { r, r, r, r };                /* rounding constant, broadcast */
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };  /* per-row x increment */
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };  /* per-row y increment */
    const uint64_t shift2 = 2 * shift;                    /* final normalization shift */
    int x, y;
    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15 ||
        (unsigned)ix >= width - w ||
        (unsigned)iy >= height - h) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;

    /* mm6 = "s" = 1 << shift broadcast into four words,
     * mm7 = 0, used to unpack bytes into words below */
    __asm__ volatile (
        "movd %0, %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        :: "r"(1<<shift)
    );

    /* process the block in vertical strips of 4 pixels */
    for (x = 0; x < w; x += 4) {
        /* packed sub-pel x/y positions of four adjacent pixels,
         * pre-decremented by one row step (dxys/dyys) so the first
         * paddw in the row loop yields the values for row 0 */
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };
        for (y = 0; y < h; y++) {
            /* advance the packed positions by one row and keep the
             * interpolation weights (top bits after >> 12) in mm4/mm5 */
            __asm__ volatile (
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
            );
            /* bilinear blend of the four neighbouring source pixels:
             * dst = (src[0,0]*(s-dx)*(s-dy) + src[1,0]*dx*(s-dy) +
             *        src[0,1]*(s-dx)*dy     + src[1,1]*dx*dy + r) >> (2*shift) */
            __asm__ volatile (
                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
                "movd %4, %%mm5 \n\t"
                "movd %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
                "movd %2, %%mm5 \n\t"
                "movd %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw %5, %%mm1 \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"
                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"
                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
            );
            src += stride;
        }
        src += 4 - h * stride;  /* rewind to the top row, 4 pixels right */
    }
}
/* Clip each element of src[] into [min, max] and store it to dst[] (SSE).
 * Processes 16 floats (64 bytes) per iteration, walking backwards from
 * byte offset (len - 16) * 4 down to 0.
 * NOTE(review): uses movaps, so dst and src must be 16-byte aligned, and
 * len is presumably a multiple of 16 — confirm against the callers. */
static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
{
    /* byte offset of the last 16-float chunk; loop counts down to 0 */
    x86_reg i = (len - 16) * 4;
    __asm__ volatile (
        "movss %3, %%xmm4 \n\t"             // xmm4 = min, broadcast below
        "movss %4, %%xmm5 \n\t"             // xmm5 = max, broadcast below
        "shufps $0, %%xmm4, %%xmm4 \n\t"
        "shufps $0, %%xmm5, %%xmm5 \n\t"
        "1: \n\t"
        "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1 \n\t"
        "movaps 32(%2, %0), %%xmm2 \n\t"
        "movaps 48(%2, %0), %%xmm3 \n\t"
        "maxps %%xmm4, %%xmm0 \n\t"         // clamp to the lower bound
        "maxps %%xmm4, %%xmm1 \n\t"
        "maxps %%xmm4, %%xmm2 \n\t"
        "maxps %%xmm4, %%xmm3 \n\t"
        "minps %%xmm5, %%xmm0 \n\t"         // clamp to the upper bound
        "minps %%xmm5, %%xmm1 \n\t"
        "minps %%xmm5, %%xmm2 \n\t"
        "minps %%xmm5, %%xmm3 \n\t"
        "movaps %%xmm0, (%1, %0) \n\t"
        "movaps %%xmm1, 16(%1, %0) \n\t"
        "movaps %%xmm2, 32(%1, %0) \n\t"
        "movaps %%xmm3, 48(%1, %0) \n\t"
        "sub $64, %0 \n\t"
        "jge 1b \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory"
    );
}
  966. #endif /* HAVE_INLINE_ASM */
/* Prototypes for implementations written in external assembly (yasm);
 * the definitions live in the .asm files of this directory. */

/* H.263 in-loop deblocking filters */
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);

/* int16 dot products, plain and fused multiply-add variants */
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

/* int16 windowing, with and without rounding */
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

/* 32-bit byte swapping of whole buffers */
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

/* HuffYUV predictors */
void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
                                          const uint8_t *diff, int w,
                                          int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                      int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                     int w, int left);

/* int32 buffer clipping */
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
                                int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
                                int32_t min, int32_t max, unsigned int len);
/* Fill all 16 entries of c->PFX_pixels_tab[IDX] with the SIZE-wide
 * quarter-pel functions for CPU.  The entry index encodes the sub-pel
 * position: entry (4 * y + x) is the _mc<x><y>_ function, x/y in 0..3.
 * PREFIX selects an optional wrapper name prefix (empty for direct use). */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
  1030. static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
  1031. int mm_flags)
  1032. {
  1033. #if HAVE_MMX_INLINE
  1034. const int high_bit_depth = avctx->bits_per_raw_sample > 8;
  1035. c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
  1036. c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
  1037. c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
  1038. if (!high_bit_depth) {
  1039. c->clear_block = clear_block_mmx;
  1040. c->clear_blocks = clear_blocks_mmx;
  1041. c->draw_edges = draw_edges_mmx;
  1042. switch (avctx->idct_algo) {
  1043. case FF_IDCT_AUTO:
  1044. case FF_IDCT_SIMPLEMMX:
  1045. c->idct_put = ff_simple_idct_put_mmx;
  1046. c->idct_add = ff_simple_idct_add_mmx;
  1047. c->idct = ff_simple_idct_mmx;
  1048. c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
  1049. break;
  1050. case FF_IDCT_XVIDMMX:
  1051. c->idct_put = ff_idct_xvid_mmx_put;
  1052. c->idct_add = ff_idct_xvid_mmx_add;
  1053. c->idct = ff_idct_xvid_mmx;
  1054. break;
  1055. }
  1056. }
  1057. c->gmc = gmc_mmx;
  1058. c->add_bytes = add_bytes_mmx;
  1059. #endif /* HAVE_MMX_INLINE */
  1060. #if HAVE_MMX_EXTERNAL
  1061. if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
  1062. c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
  1063. c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
  1064. }
  1065. c->vector_clip_int32 = ff_vector_clip_int32_mmx;
  1066. #endif /* HAVE_MMX_EXTERNAL */
  1067. }
  1068. static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
  1069. int mm_flags)
  1070. {
  1071. #if HAVE_MMXEXT_INLINE
  1072. const int high_bit_depth = avctx->bits_per_raw_sample > 8;
  1073. if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
  1074. c->idct_put = ff_idct_xvid_mmxext_put;
  1075. c->idct_add = ff_idct_xvid_mmxext_add;
  1076. c->idct = ff_idct_xvid_mmxext;
  1077. }
  1078. #endif /* HAVE_MMXEXT_INLINE */
  1079. #if HAVE_MMXEXT_EXTERNAL
  1080. SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
  1081. SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
  1082. SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
  1083. SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
  1084. SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
  1085. SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
  1086. /* slower than cmov version on AMD */
  1087. if (!(mm_flags & AV_CPU_FLAG_3DNOW))
  1088. c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
  1089. c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
  1090. c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
  1091. if (avctx->flags & CODEC_FLAG_BITEXACT) {
  1092. c->apply_window_int16 = ff_apply_window_int16_mmxext;
  1093. } else {
  1094. c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
  1095. }
  1096. #endif /* HAVE_MMXEXT_EXTERNAL */
  1097. }
  1098. static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
  1099. int mm_flags)
  1100. {
  1101. #if HAVE_SSE_INLINE
  1102. const int high_bit_depth = avctx->bits_per_raw_sample > 8;
  1103. if (!high_bit_depth) {
  1104. if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
  1105. /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
  1106. c->clear_block = clear_block_sse;
  1107. c->clear_blocks = clear_blocks_sse;
  1108. }
  1109. }
  1110. c->vector_clipf = vector_clipf_sse;
  1111. #endif /* HAVE_SSE_INLINE */
  1112. }
  1113. static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
  1114. int mm_flags)
  1115. {
  1116. #if HAVE_SSE2_INLINE
  1117. const int high_bit_depth = avctx->bits_per_raw_sample > 8;
  1118. if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
  1119. c->idct_put = ff_idct_xvid_sse2_put;
  1120. c->idct_add = ff_idct_xvid_sse2_add;
  1121. c->idct = ff_idct_xvid_sse2;
  1122. c->idct_permutation_type = FF_SSE2_IDCT_PERM;
  1123. }
  1124. #endif /* HAVE_SSE2_INLINE */
  1125. #if HAVE_SSE2_EXTERNAL
  1126. c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
  1127. c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
  1128. if (mm_flags & AV_CPU_FLAG_ATOM) {
  1129. c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
  1130. } else {
  1131. c->vector_clip_int32 = ff_vector_clip_int32_sse2;
  1132. }
  1133. if (avctx->flags & CODEC_FLAG_BITEXACT) {
  1134. c->apply_window_int16 = ff_apply_window_int16_sse2;
  1135. } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
  1136. c->apply_window_int16 = ff_apply_window_int16_round_sse2;
  1137. }
  1138. c->bswap_buf = ff_bswap32_buf_sse2;
  1139. #endif /* HAVE_SSE2_EXTERNAL */
  1140. }
  1141. static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
  1142. int mm_flags)
  1143. {
  1144. #if HAVE_SSSE3_EXTERNAL
  1145. c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
  1146. if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
  1147. c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
  1148. if (mm_flags & AV_CPU_FLAG_ATOM)
  1149. c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
  1150. else
  1151. c->apply_window_int16 = ff_apply_window_int16_ssse3;
  1152. if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
  1153. c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
  1154. c->bswap_buf = ff_bswap32_buf_ssse3;
  1155. #endif /* HAVE_SSSE3_EXTERNAL */
  1156. }
/* Install the SSE4 implementations into the DSPContext. */
static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                                      int mm_flags)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}
/* Entry point: query the CPU feature flags once and install the best
 * available x86 implementations into the DSPContext.  The per-ISA init
 * functions run in order of increasing capability and deliberately
 * overwrite function pointers set by earlier ones, so the call order
 * below is significant. */
av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

    if (mm_flags & AV_CPU_FLAG_MMX)
        dsputil_init_mmx(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmxext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);
}