/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <assert.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"

void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                        int stride);
int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);

#if HAVE_INLINE_ASM

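/* sse8_mmx() and sse16_mmx() below return the sum of squared differences
 * between two blocks of width 8 and 16 respectively, over h rows.
 * A rough C sketch of what the inline asm computes (reference only, not
 * compiled; the name and width parameter are just for illustration):
 *
 *     int sse_sketch(uint8_t *pix1, uint8_t *pix2, int line_size, int h, int w)
 *     {
 *         int sum = 0;
 *         for (int y = 0; y < h; y++) {
 *             for (int x = 0; x < w; x++) {
 *                 int d = pix1[x] - pix2[x];
 *                 sum  += d * d;
 *             }
 *             pix1 += line_size;
 *             pix2 += line_size;
 *         }
 *         return sum;
 *     }
 */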
static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %4, %%ecx \n"
        "shr $1, %%ecx \n"
        "pxor %%mm0, %%mm0 \n"      /* mm0 = 0 */
        "pxor %%mm7, %%mm7 \n"      /* mm7 holds the sum */
        "1: \n"
        "movq (%0), %%mm1 \n"       /* mm1 = pix1[0][0 - 7] */
        "movq (%1), %%mm2 \n"       /* mm2 = pix2[0][0 - 7] */
        "movq (%0, %3), %%mm3 \n"   /* mm3 = pix1[1][0 - 7] */
        "movq (%1, %3), %%mm4 \n"   /* mm4 = pix2[1][0 - 7] */
        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1, %%mm5 \n"
        "movq %%mm3, %%mm6 \n"
        "psubusb %%mm2, %%mm1 \n"
        "psubusb %%mm4, %%mm3 \n"
        "psubusb %%mm5, %%mm2 \n"
        "psubusb %%mm6, %%mm4 \n"
        "por %%mm1, %%mm2 \n"
        "por %%mm3, %%mm4 \n"
        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2, %%mm1 \n"
        "movq %%mm4, %%mm3 \n"
        "punpckhbw %%mm0, %%mm2 \n"
        "punpckhbw %%mm0, %%mm4 \n"
        "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */
        "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */
        "pmaddwd %%mm2, %%mm2 \n"
        "pmaddwd %%mm4, %%mm4 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm3, %%mm3 \n"
        "lea (%0, %3, 2), %0 \n"    /* pix1 += 2 * line_size */
        "lea (%1, %3, 2), %1 \n"    /* pix2 += 2 * line_size */
        "paddd %%mm2, %%mm1 \n"
        "paddd %%mm4, %%mm3 \n"
        "paddd %%mm1, %%mm7 \n"
        "paddd %%mm3, %%mm7 \n"
        "decl %%ecx \n"
        "jnz 1b \n"
        "movq %%mm7, %%mm1 \n"
        "psrlq $32, %%mm7 \n"       /* shift hi dword to lo */
        "paddd %%mm7, %%mm1 \n"
        "movd %%mm1, %2 \n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}

static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm0, %%mm0\n"      /* mm0 = 0 */
        "pxor %%mm7, %%mm7\n"      /* mm7 holds the sum */
        "1:\n"
        "movq (%0), %%mm1\n"       /* mm1 = pix1[0 - 7] */
        "movq (%1), %%mm2\n"       /* mm2 = pix2[0 - 7] */
        "movq 8(%0), %%mm3\n"      /* mm3 = pix1[8 - 15] */
        "movq 8(%1), %%mm4\n"      /* mm4 = pix2[8 - 15] */
        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /*       OR the results to get absolute difference */
        "movq %%mm1, %%mm5\n"
        "movq %%mm3, %%mm6\n"
        "psubusb %%mm2, %%mm1\n"
        "psubusb %%mm4, %%mm3\n"
        "psubusb %%mm5, %%mm2\n"
        "psubusb %%mm6, %%mm4\n"
        "por %%mm1, %%mm2\n"
        "por %%mm3, %%mm4\n"
        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2, %%mm1\n"
        "movq %%mm4, %%mm3\n"
        "punpckhbw %%mm0, %%mm2\n"
        "punpckhbw %%mm0, %%mm4\n"
        "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
        "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
        "pmaddwd %%mm2, %%mm2\n"
        "pmaddwd %%mm4, %%mm4\n"
        "pmaddwd %%mm1, %%mm1\n"
        "pmaddwd %%mm3, %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "paddd %%mm2, %%mm1\n"
        "paddd %%mm4, %%mm3\n"
        "paddd %%mm1, %%mm7\n"
        "paddd %%mm3, %%mm7\n"
        "decl %%ecx\n"
        "jnz 1b\n"
        "movq %%mm7, %%mm1\n"
        "psrlq $32, %%mm7\n"       /* shift hi dword to lo */
        "paddd %%mm7, %%mm1\n"
        "movd %%mm1, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}

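/* hf_noise8_mmx() / hf_noise16_mmx() estimate the high-frequency content of
 * an 8- or 16-pixel-wide block: horizontal first differences are taken within
 * each row, and the absolute row-to-row change of those differences is
 * accumulated. A loose C sketch of the 8-wide measure, ignoring the exact
 * edge handling of the asm (reference only, not compiled; the name is just
 * for illustration):
 *
 *     int hf_noise8_sketch(uint8_t *pix, int line_size, int h)
 *     {
 *         int sum = 0;
 *         for (int y = 1; y < h; y++)
 *             for (int x = 0; x < 7; x++) {
 *                 int d0 = pix[(y - 1) * line_size + x] - pix[(y - 1) * line_size + x + 1];
 *                 int d1 = pix[y * line_size + x]       - pix[y * line_size + x + 1];
 *                 sum   += FFABS(d1 - d0);
 *             }
 *         return sum;
 *     }
 *
 * The result feeds the nsse* comparison functions further down. */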
static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm7, %%mm7\n"
        "pxor %%mm6, %%mm6\n"

        "movq (%0), %%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2, %0\n"

        "movq (%0), %%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2, %0\n"
        "1:\n"

        "movq (%0), %%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2, %0\n"

        "movq (%0), %%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2, %0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7, %%mm0\n"
        "punpckhwd %%mm7, %%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix1), "=r" (tmp)
        : "r" ((x86_reg) line_size), "g" (h - 2)
        : "%ecx");

    return tmp;
}

static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
{
    int tmp;
    uint8_t *pix = pix1;

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm7, %%mm7\n"
        "pxor %%mm6, %%mm6\n"

        "movq (%0), %%mm0\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2, %0\n"

        "movq (%0), %%mm4\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2, %0\n"
        "1:\n"

        "movq (%0), %%mm0\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2, %0\n"

        "movq (%0), %%mm4\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2, %0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7, %%mm0\n"
        "punpckhwd %%mm7, %%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix1), "=r" (tmp)
        : "r" ((x86_reg) line_size), "g" (h - 2)
        : "%ecx");

    return tmp + hf_noise8_mmx(pix + 8, line_size, h);
}

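/* NSSE ("noise preserving SSE") comparison functions: plain SSE plus a
 * penalty proportional to the difference in high-frequency content between
 * the two blocks, i.e.
 *     score = sse + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|
 * with a fixed weight of 8 when no context is available. */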
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int score1, score2;

    if (c)
        score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else
        score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2 = hf_noise16_mmx(pix1, line_size, h) -
             hf_noise16_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
    int score2 = hf_noise8_mmx(pix1, line_size, h) -
                 hf_noise8_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

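/* vsad_intra16_*: sum of absolute differences between vertically adjacent
 * rows of a single 16-pixel-wide block. The idea in C (reference only, not
 * compiled; the name is just for illustration):
 *
 *     int vsad_intra16_sketch(uint8_t *pix, int line_size, int h)
 *     {
 *         int sum = 0;
 *         for (int y = 1; y < h; y++)
 *             for (int x = 0; x < 16; x++)
 *                 sum += abs(pix[y * line_size + x] - pix[(y - 1) * line_size + x]);
 *         return sum;
 *     }
 */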
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            int line_size, int h)
{
    int tmp;

    assert((((int) pix) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq 8(%0), %%mm3\n" \
    "add %2,%0\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM

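/* Same as vsad_intra16_mmx() above, but letting psadbw compute the per-row
 * sum of absolute differences in a single instruction. */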
static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    assert((((int) pix) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq 8(%0), " #out1 "\n" \
    "add %2, %0\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

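/* vsad16_*: like vsad_intra16_*, but operating on the residual pix1 - pix2,
 * i.e. the sum over rows of |residual(y) - residual(y - 1)|. mm7 is set to
 * 0x80 in every byte so that the signed psubb result can be biased into the
 * unsigned domain before the saturating absolute-difference trick. */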
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int tmp;

    assert((((int) pix1) & 7) == 0);
    assert((((int) pix2) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq (%1), " #out0 "\n" \
    "movq 8(%0), %%mm3\n" \
    "movq 8(%1), " #out1 "\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb " #out0 ", %%mm2\n" \
    "psubb " #out1 ", %%mm3\n" \
    "pxor %%mm7, %%mm2\n" \
    "pxor %%mm7, %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    assert((((int) pix1) & 7) == 0);
    assert((((int) pix2) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq (%1), %%mm2\n" \
    "movq 8(%0), " #out1 "\n" \
    "movq 8(%1), %%mm3\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb %%mm2, " #out0 "\n" \
    "psubb %%mm3, " #out1 "\n" \
    "pxor %%mm7, " #out0 "\n" \
    "pxor %%mm7, " #out1 "\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n "

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM

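/* sum_abs_dctelem_*: sum of the absolute values of the 64 coefficients of a
 * DCT block. MMABS_* provides abs() for packed words on the various
 * instruction sets, HSUM_* reduces a register to a scalar, and DCT_SAD4
 * accumulates four loads per invocation: the MMX version covers the 128-byte
 * block with four groups of four 8-byte loads, the SSE2 version with two
 * groups of four 16-byte loads. */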
#define MMABS_MMX(a, z) \
    "pxor " #z ", " #z " \n\t" \
    "pcmpgtw " #a ", " #z " \n\t" \
    "pxor " #z ", " #a " \n\t" \
    "psubw " #z ", " #a " \n\t"

#define MMABS_MMXEXT(a, z) \
    "pxor " #z ", " #z " \n\t" \
    "psubw " #a ", " #z " \n\t" \
    "pmaxsw " #z ", " #a " \n\t"

#define MMABS_SSSE3(a, z) \
    "pabsw " #a ", " #a " \n\t"

#define MMABS_SUM(a, z, sum) \
    MMABS(a, z) \
    "paddusw " #a ", " #sum " \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
 * up to about 100k on extreme inputs. But that's very unlikely to occur in
 * natural video, and it's even more unlikely to not have any alternative
 * mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst) \
    "movq " #a ", " #t " \n\t" \
    "psrlq $32, " #a " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movq " #a ", " #t " \n\t" \
    "psrlq $16, " #a " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t" \

#define HSUM_MMXEXT(a, t, dst) \
    "pshufw $0x0E, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshufw $0x01, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t" \

#define HSUM_SSE2(a, t, dst) \
    "movhlps " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshuflw $0x0E, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshuflw $0x01, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t" \

#define DCT_SAD4(m, mm, o) \
    "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \
    "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \
    "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \
    "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \
    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \
    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \
    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \
    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1) \

#define DCT_SAD_MMX \
    "pxor %%mm0, %%mm0 \n\t" \
    "pxor %%mm1, %%mm1 \n\t" \
    DCT_SAD4(q, %%mm, 0) \
    DCT_SAD4(q, %%mm, 8) \
    DCT_SAD4(q, %%mm, 64) \
    DCT_SAD4(q, %%mm, 72) \
    "paddusw %%mm1, %%mm0 \n\t" \
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2 \
    "pxor %%xmm0, %%xmm0 \n\t" \
    "pxor %%xmm1, %%xmm1 \n\t" \
    DCT_SAD4(dqa, %%xmm, 0) \
    DCT_SAD4(dqa, %%xmm, 64) \
    "paddusw %%xmm1, %%xmm0 \n\t" \
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_ ## cpu(int16_t *block) \
{ \
    int sum; \
    __asm__ volatile ( \
        DCT_SAD \
        :"=r"(sum) \
        :"r"(block)); \
    return sum & 0xFFFF; \
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
#define MMABS(a, z)     MMABS_MMX(a, z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
#define MMABS(a, z)     MMABS_MMXEXT(a, z)
DCT_SAD_FUNC(mmxext)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3_INLINE
#define MMABS(a, z)     MMABS_SSSE3(a, z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

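/* Sum of squared differences between an int8_t array and an int16_t array of
 * the same length; size is assumed to be a multiple of 8 here. Equivalent C
 * (reference only, not compiled; the name is just for illustration):
 *
 *     int ssd_int8_vs_int16_sketch(const int8_t *pix1, const int16_t *pix2,
 *                                  int size)
 *     {
 *         int sum = 0;
 *         for (int i = 0; i < size; i++) {
 *             int d = pix1[i] - pix2[i];
 *             sum  += d * d;
 *         }
 *         return sum;
 *     }
 */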
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2,
                                 int size)
{
    int sum;
    x86_reg i = size;

    __asm__ volatile (
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2, %0), %%mm2 \n"
        "movq (%3, %0, 2), %%mm0 \n"
        "movq 8(%3, %0, 2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        : "+r" (i), "=r" (sum)
        : "r" (pix1), "r" (pix2));

    return sum;
}

#define PHADDD(a, t) \
    "movq " #a ", " #t " \n\t" \
    "psrlq $32, " #a " \n\t" \
    "paddd " #t ", " #a " \n\t"

/*
 * pmulhw:   dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
 * pmulhrw:  dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
 * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
 */

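/* PMULHRW(x, y, s, o) scales two word vectors by s with rounding; it is used
 * by the QNS basis functions pulled in from dsputil_qns_template.c. Three
 * variants follow: the plain-MMX one approximates a rounded high multiply
 * with pmulhw plus an add-and-shift step (SET_RND and SCALE_OFFSET appear to
 * compensate for the extra shift), the 3DNow! one uses pmulhrw directly, and
 * the SSSE3 one uses pmulhrsw. */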
#define PMULHRW(x, y, s, o) \
    "pmulhw " #s ", " #x " \n\t" \
    "pmulhw " #s ", " #y " \n\t" \
    "paddw " #o ", " #x " \n\t" \
    "paddw " #o ", " #y " \n\t" \
    "psraw $1, " #x " \n\t" \
    "psraw $1, " #y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o) \
    "pmulhrw " #s ", " #x " \n\t" \
    "pmulhrw " #s ", " #y " \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3_INLINE
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1

#define PHADDD(a, t) \
    "pshufw $0x0E, " #a ", " #t " \n\t" \
    /* faster than phaddd on core2 */ \
    "paddd " #t ", " #a " \n\t"

#define PMULHRW(x, y, s, o) \
    "pmulhrsw " #s ", " #x " \n\t" \
    "pmulhrsw " #s ", " #y " \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */

#endif /* HAVE_INLINE_ASM */

int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h);

#define hadamard_func(cpu) \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                  uint8_t *src2, int stride, int h); \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                    uint8_t *src2, int stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

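/* Wire up the DSPContext function pointers. The EXTERNAL_* checks select the
 * assembler-built implementations declared above, the INLINE_* checks the
 * inline-asm ones in this file; more specific checks run later and overwrite
 * pointers set by earlier ones, so the best version available on the detected
 * CPU wins. */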
av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();
    const int dct_algo = avctx->dct_algo;

    if (EXTERNAL_MMX(cpu_flags)) {
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_mmx;
        c->diff_pixels = ff_diff_pixels_mmx;
        c->pix_sum     = ff_pix_sum16_mmx;
        c->pix_norm1   = ff_pix_norm1_mmx;
    }

    if (EXTERNAL_SSE2(cpu_flags))
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_sse2;

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_mmx;

        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
        c->sse[0]  = sse16_mmx;
        c->sse[1]  = sse8_mmx;
        c->vsad[4] = vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0]      = vsad16_mmx;
            c->try_8x8basis = try_8x8basis_mmx;
        }
        c->add_8x8basis = add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
    }

    if (INLINE_AMD3DNOW(cpu_flags)) {
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_3dnow;
        }
        c->add_8x8basis = add_8x8basis_3dnow;
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_mmxext;

        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
        c->vsad[4]         = vsad_intra16_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmxext;
        }
    }

    if (INLINE_SSE2(cpu_flags)) {
        if (!high_bit_depth &&
            (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
            c->fdct = ff_fdct_sse2;

        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
    }

#if HAVE_SSSE3_INLINE
    if (INLINE_SSSE3(cpu_flags)) {
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->try_8x8basis = try_8x8basis_ssse3;
        }
        c->add_8x8basis    = add_8x8basis_ssse3;
        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
    }
#endif
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }

    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
    }

    ff_dsputil_init_pix_mmx(c, avctx);
}