/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_x86.h"

void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                        int stride);
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                int line_size, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 int line_size, int h);

#if HAVE_INLINE_ASM
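
/* hf_noise8/hf_noise16 estimate horizontal high-frequency "noise": each row is
 * reduced to its horizontal pixel differences, and the absolute change of
 * those differences from one row to the next is summed over the block. */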
static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm7, %%mm7\n"
        "pxor %%mm6, %%mm6\n"

        "movq (%0), %%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2, %0\n"

        "movq (%0), %%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2, %0\n"
        "1:\n"

        "movq (%0), %%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2, %0\n"

        "movq (%0), %%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2, %0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7, %%mm0\n"
        "punpckhwd %%mm7, %%mm6\n"
        "paddd %%mm0, %%mm6\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix1), "=r" (tmp)
        : "r" ((x86_reg) line_size), "g" (h - 2)
        : "%ecx");

    return tmp;
}

static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
{
    int tmp;
    uint8_t *pix = pix1;

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm7, %%mm7\n"
        "pxor %%mm6, %%mm6\n"

        "movq (%0), %%mm0\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2, %0\n"

        "movq (%0), %%mm4\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2, %0\n"
        "1:\n"

        "movq (%0), %%mm0\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2, %0\n"

        "movq (%0), %%mm4\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2, %0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7, %%mm0\n"
        "punpckhwd %%mm7, %%mm6\n"
        "paddd %%mm0, %%mm6\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix1), "=r" (tmp)
        : "r" ((x86_reg) line_size), "g" (h - 2)
        : "%ecx");

    return tmp + hf_noise8_mmx(pix + 8, line_size, h);
}
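
/* nsse ("noise preserving SSE"): the plain SSE between the two blocks plus a
 * weighted absolute difference of their hf_noise scores, so that blocks which
 * mainly differ in the amount of high-frequency detail are penalised. */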
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int score1, score2;

    if (c)
        score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
    score2 = hf_noise16_mmx(pix1, line_size, h) -
             hf_noise16_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
    int score2 = hf_noise8_mmx(pix1, line_size, h) -
                 hf_noise8_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
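
/* Vertical SAD inside one 16xh block: sum of absolute differences between
 * each row and the row above it. */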
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq 8(%0), %%mm3\n" \
    "add %2, %0\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM
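
/* Same metric as vsad_intra16_mmx, with psadbw doing the per-row absolute
 * difference and horizontal sum in a single instruction. */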
static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq 8(%0), " #out1 "\n" \
    "add %2, %0\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM
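
/* Vertical SAD of the difference block pix1 - pix2: mm7 is set to 0x80 in
 * every byte and XORed with the signed byte differences, turning them into
 * biased (offset-binary) values so the unsigned psubusb/por absolute
 * difference trick applies. */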
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq (%1), " #out0 "\n" \
    "movq 8(%0), %%mm3\n" \
    "movq 8(%1), " #out1 "\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb " #out0 ", %%mm2\n" \
    "psubb " #out1 ", %%mm3\n" \
    "pxor %%mm7, %%mm2\n" \
    "pxor %%mm7, %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq (%1), %%mm2\n" \
    "movq 8(%0), " #out1 "\n" \
    "movq 8(%1), %%mm3\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb %%mm2, " #out0 "\n" \
    "psubb %%mm3, " #out1 "\n" \
    "pxor %%mm7, " #out0 "\n" \
    "pxor %%mm7, " #out1 "\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM
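
/* dst[i] = src1[i] - src2[i]: 16 bytes per MMX iteration, with a scalar tail
 * loop for the remaining bytes. */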
static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1,
                           const uint8_t *src2, int w)
{
    x86_reg i = 0;

    if (w >= 16)
        __asm__ volatile (
            "1:                             \n\t"
            "movq (%2, %0), %%mm0           \n\t"
            "movq (%1, %0), %%mm1           \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, (%3, %0)           \n\t"
            "movq 8(%2, %0), %%mm0          \n\t"
            "movq 8(%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, 8(%3, %0)          \n\t"
            "add $16, %0                    \n\t"
            "cmp %4, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (i)
            : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w - 15));

    for (; i < w; i++)
        dst[i + 0] = src1[i + 0] - src2[i + 0];
}
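
/* HuffYUV-style median prediction: each residual is the source byte minus the
 * median of left, top and left + top - topleft; the first byte and the
 * left/left_top state are handled in C around the asm loop. */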
static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1,
                                              const uint8_t *src2, int w,
                                              int *left, int *left_top)
{
    x86_reg i = 0;
    uint8_t l, lt;

    __asm__ volatile (
        "movq (%1, %0), %%mm0           \n\t" // LT
        "psllq $8, %%mm0                \n\t"
        "1:                             \n\t"
        "movq (%1, %0), %%mm1           \n\t" // T
        "movq -1(%2, %0), %%mm2         \n\t" // L
        "movq (%2, %0), %%mm3           \n\t" // X
        "movq %%mm2, %%mm4              \n\t" // L
        "psubb %%mm0, %%mm2             \n\t"
        "paddb %%mm1, %%mm2             \n\t" // L + T - LT
        "movq %%mm4, %%mm5              \n\t" // L
        "pmaxub %%mm1, %%mm4            \n\t" // max(T, L)
        "pminub %%mm5, %%mm1            \n\t" // min(T, L)
        "pminub %%mm2, %%mm4            \n\t"
        "pmaxub %%mm1, %%mm4            \n\t"
        "psubb %%mm4, %%mm3             \n\t" // dst - pred
        "movq %%mm3, (%3, %0)           \n\t"
        "add $8, %0                     \n\t"
        "movq -1(%1, %0), %%mm0         \n\t" // LT
        "cmp %4, %0                     \n\t"
        " jb 1b                         \n\t"
        : "+r" (i)
        : "r" (src1), "r" (src2), "r" (dst), "r" ((x86_reg) w));

    l  = *left;
    lt = *left_top;

    dst[0] = src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt) & 0xFF);

    *left_top = src1[w - 1];
    *left     = src2[w - 1];
}
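
/* Sum of squared differences between an int8_t array and an int16_t array,
 * processed eight elements per iteration. */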
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
                                 int size)
{
    int sum;
    x86_reg i = size;

    __asm__ volatile (
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2, %0), %%mm2 \n"
        "movq (%3, %0, 2), %%mm0 \n"
        "movq 8(%3, %0, 2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        : "+r" (i), "=r" (sum)
        : "r" (pix1), "r" (pix2));

    return sum;
}
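
/* PHADDD: horizontal add of the two 32-bit lanes of an MMX register,
 * leaving the sum in the low dword of a. */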
#define PHADDD(a, t) \
    "movq " #a ", " #t " \n\t" \
    "psrlq $32, " #a " \n\t" \
    "paddd " #t ", " #a " \n\t"

/*
 * pmulhw:   dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
 * pmulhrw:  dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
 * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
 */
#define PMULHRW(x, y, s, o) \
    "pmulhw " #s ", " #x " \n\t" \
    "pmulhw " #s ", " #y " \n\t" \
    "paddw " #o ", " #x " \n\t" \
    "paddw " #o ", " #y " \n\t" \
    "psraw $1, " #x " \n\t" \
    "psraw $1, " #y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o) \
    "pmulhrw " #s ", " #x " \n\t" \
    "pmulhrw " #s ", " #y " \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3_INLINE
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1

#define PHADDD(a, t) \
    "pshufw $0x0E, " #a ", " #t " \n\t" \
    /* faster than phaddd on core2 */ \
    "paddd " #t ", " #a " \n\t"

#define PMULHRW(x, y, s, o) \
    "pmulhrsw " #s ", " #x " \n\t" \
    "pmulhrsw " #s ", " #y " \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */

#endif /* HAVE_INLINE_ASM */

int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h);

#define hadamard_func(cpu)                                                    \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
                                  uint8_t *src2, int stride, int h);          \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
                                    uint8_t *src2, int stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)
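
/* Runtime dispatch: choose an implementation for each DSPContext entry based
 * on the detected CPU flags, the bit depth, dct_algo and CODEC_FLAG_BITEXACT. */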
av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();
    const int dct_algo = avctx->dct_algo;

    if (EXTERNAL_MMX(cpu_flags)) {
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_mmx;
        c->diff_pixels = ff_diff_pixels_mmx;
        c->pix_sum     = ff_pix_sum16_mmx;
        c->pix_norm1   = ff_pix_norm1_mmx;
        c->sse[0]      = ff_sse16_mmx;
        c->sse[1]      = ff_sse8_mmx;
    }

    if (EXTERNAL_SSE2(cpu_flags))
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_sse2;

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_mmx;

        c->diff_bytes = diff_bytes_mmx;
        c->vsad[4]    = vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0]      = vsad16_mmx;
            c->try_8x8basis = try_8x8basis_mmx;
        }
        c->add_8x8basis = add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
    }

    if (INLINE_AMD3DNOW(cpu_flags)) {
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_3dnow;
        }
        c->add_8x8basis = add_8x8basis_3dnow;
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_mmxext;

        c->vsad[4] = vsad_intra16_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmxext;
        }

        c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmxext;
    }

    if (INLINE_SSE2(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_sse2;
    }

#if HAVE_SSSE3_INLINE
    if (INLINE_SSSE3(cpu_flags)) {
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_ssse3;
        }
        c->add_8x8basis = add_8x8basis_ssse3;
    }
#endif
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;

        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;

        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }

    ff_dsputil_init_pix_mmx(c, avctx);
}