/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"
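
/* Prototypes for the routines implemented in external assembly; they are
 * installed at runtime by the EXTERNAL_* checks in ff_dsputilenc_init_mmx()
 * below. */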
void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                        int stride);
void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                         int stride);
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
int ff_pix_sum16_xop(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                int line_size, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 int line_size, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);
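
/* Declares the ff_hadamard8_diff_<cpu>() and ff_hadamard8_diff16_<cpu>()
 * prototypes for one instruction-set flavor. */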
#define hadamard_func(cpu)                                                    \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
                                  uint8_t *src2, int stride, int h);          \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
                                    uint8_t *src2, int stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_YASM
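/* Noise-preserving SSE: the plain sum of squared errors plus the absolute
 * difference in high-frequency noise between the two pictures, weighted by
 * avctx->nsse_weight (8 when no context is available). */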
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int score1, score2;

    if (c)
        score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
    score2 = ff_hf_noise16_mmx(pix1, line_size, h) +
             ff_hf_noise8_mmx(pix1 + 8, line_size, h) -
             ff_hf_noise16_mmx(pix2, line_size, h) -
             ff_hf_noise8_mmx(pix2 + 8, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
    int score2 = ff_hf_noise8_mmx(pix1, line_size, h) -
                 ff_hf_noise8_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM
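/* Intra vertical SAD: sums the absolute differences between vertically
 * adjacent pixels of a 16-pixel-wide block. The SUM() macro processes two
 * rows per iteration; the psubusb/por sequence computes the bytewise
 * absolute difference (unsigned saturating subtraction in both directions,
 * then OR), which is widened to words and accumulated in mm6. */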
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq 8(%0), %%mm3\n"                       \
    "add %2,%0\n"                               \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM
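
/* Same as above, but MMXEXT's psadbw computes the bytewise absolute
 * difference and its horizontal sum in a single instruction, so no word
 * widening or final reduction is needed. */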
static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), " #out0 "\n"                    \
    "movq 8(%0), " #out1 "\n"                   \
    "add %2, %0\n"                              \
    "psadbw " #out0 ", " #in0 "\n"              \
    "psadbw " #out1 ", " #in1 "\n"              \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movd %%mm6, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM
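
/* Inter vertical SAD: operates on the per-row difference pix1 - pix2. The
 * row differences are signed bytes, so each is XORed with 0x80 (built in mm7
 * via pcmpeqw/psllw/packsswb) to bias it into unsigned range before the same
 * psubusb/por absolute-difference trick as in the intra version. */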
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq (%1), " #out0 "\n"                    \
    "movq 8(%0), %%mm3\n"                       \
    "movq 8(%1), " #out1 "\n"                   \
    "add %3, %0\n"                              \
    "add %3, %1\n"                              \
    "psubb " #out0 ", %%mm2\n"                  \
    "psubb " #out1 ", %%mm3\n"                  \
    "pxor %%mm7, %%mm2\n"                       \
    "pxor %%mm7, %%mm3\n"                       \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM
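
/* Inter version of vsad16 using psadbw, mirroring vsad_intra16_mmxext. */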
static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1)               \
    "movq (%0), " #out0 "\n"                    \
    "movq (%1), %%mm2\n"                        \
    "movq 8(%0), " #out1 "\n"                   \
    "movq 8(%1), %%mm3\n"                       \
    "add %3, %0\n"                              \
    "add %3, %1\n"                              \
    "psubb %%mm2, " #out0 "\n"                  \
    "psubb %%mm3, " #out1 "\n"                  \
    "pxor %%mm7, " #out0 "\n"                   \
    "pxor %%mm7, " #out1 "\n"                   \
    "psadbw " #out0 ", " #in0 "\n"              \
    "psadbw " #out1 ", " #in1 "\n"              \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movd %%mm6, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM
#endif /* HAVE_INLINE_ASM */
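
/* Runtime dispatch: install the fastest available implementation of each
 * DSPContext function pointer, checking the most capable instruction sets
 * last so they override the more widely supported versions. */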
av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();
    const int dct_algo = avctx->dct_algo;

    if (EXTERNAL_MMX(cpu_flags)) {
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_mmx;
        c->diff_pixels = ff_diff_pixels_mmx;
        c->pix_sum     = ff_pix_sum16_mmx;
        c->pix_norm1   = ff_pix_norm1_mmx;
    }

    if (EXTERNAL_SSE2(cpu_flags))
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_sse2;

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_mmxext;

        c->vsad[4] = vsad_intra16_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmxext;
        }
    }

    if (INLINE_SSE2(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_sse2;
    }

#if HAVE_SSSE3_INLINE
    if (INLINE_SSSE3(cpu_flags)) {
    }
#endif
#endif /* HAVE_INLINE_ASM */
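
    /* External assembler versions, declared at the top of this file. */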
    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
        c->sse[0]            = ff_sse16_mmx;
        c->sse[1]            = ff_sse8_mmx;
#if HAVE_YASM
        c->nsse[0]           = nsse16_mmx;
        c->nsse[1]           = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0]            = ff_sse16_sse2;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;
        c->diff_pixels       = ff_diff_pixels_sse2;
        c->pix_sum           = ff_pix_sum16_sse2;
        c->pix_norm1         = ff_pix_norm1_sse2;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }

    if (EXTERNAL_XOP(cpu_flags)) {
        c->pix_sum = ff_pix_sum16_xop;
    }

    ff_dsputil_init_pix_mmx(c, avctx);
}