/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"

void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
                        int stride);

#if HAVE_INLINE_ASM

static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %4, %%ecx \n"
        "shr $1, %%ecx \n"
        "pxor %%mm0, %%mm0 \n" /* mm0 = 0 */
        "pxor %%mm7, %%mm7 \n" /* mm7 holds the sum */
        "1: \n"
        "movq (%0), %%mm1 \n" /* mm1 = pix1[0][0 - 7] */
        "movq (%1), %%mm2 \n" /* mm2 = pix2[0][0 - 7] */
        "movq (%0, %3), %%mm3 \n" /* mm3 = pix1[1][0 - 7] */
        "movq (%1, %3), %%mm4 \n" /* mm4 = pix2[1][0 - 7] */
        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /* OR the results to get absolute difference */
        "movq %%mm1, %%mm5 \n"
        "movq %%mm3, %%mm6 \n"
        "psubusb %%mm2, %%mm1 \n"
        "psubusb %%mm4, %%mm3 \n"
        "psubusb %%mm5, %%mm2 \n"
        "psubusb %%mm6, %%mm4 \n"
        "por %%mm1, %%mm2 \n"
        "por %%mm3, %%mm4 \n"
        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2, %%mm1 \n"
        "movq %%mm4, %%mm3 \n"
        "punpckhbw %%mm0, %%mm2 \n"
        "punpckhbw %%mm0, %%mm4 \n"
        "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */
        "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */
        "pmaddwd %%mm2, %%mm2 \n"
        "pmaddwd %%mm4, %%mm4 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm3, %%mm3 \n"
        "lea (%0, %3, 2), %0 \n" /* pix1 += 2 * line_size */
        "lea (%1, %3, 2), %1 \n" /* pix2 += 2 * line_size */
        "paddd %%mm2, %%mm1 \n"
        "paddd %%mm4, %%mm3 \n"
        "paddd %%mm1, %%mm7 \n"
        "paddd %%mm3, %%mm7 \n"
        "decl %%ecx \n"
        "jnz 1b \n"
        "movq %%mm7, %%mm1 \n"
        "psrlq $32, %%mm7 \n" /* shift hi dword to lo */
        "paddd %%mm7, %%mm1 \n"
        "movd %%mm1, %2 \n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
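
/*
 * Reference sketch (not part of the build): in plain C, sse8_mmx() above is
 * roughly equivalent to the following scalar sum of squared differences over
 * an 8-pixel-wide block of height h. The pair of saturating subtractions
 * ORed together is just a branch-free way to get |pix1[x] - pix2[x]| on
 * unsigned bytes.
 *
 * static int sse8_c_sketch(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 * {
 *     int sum = 0;
 *     for (int y = 0; y < h; y++) {
 *         for (int x = 0; x < 8; x++) {
 *             int d = pix1[x] - pix2[x];
 *             sum += d * d;
 *         }
 *         pix1 += line_size;
 *         pix2 += line_size;
 *     }
 *     return sum;
 * }
 */
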
static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm0, %%mm0\n" /* mm0 = 0 */
        "pxor %%mm7, %%mm7\n" /* mm7 holds the sum */
        "1:\n"
        "movq (%0), %%mm1\n" /* mm1 = pix1[0 - 7] */
        "movq (%1), %%mm2\n" /* mm2 = pix2[0 - 7] */
        "movq 8(%0), %%mm3\n" /* mm3 = pix1[8 - 15] */
        "movq 8(%1), %%mm4\n" /* mm4 = pix2[8 - 15] */
        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /* OR the results to get absolute difference */
        "movq %%mm1, %%mm5\n"
        "movq %%mm3, %%mm6\n"
        "psubusb %%mm2, %%mm1\n"
        "psubusb %%mm4, %%mm3\n"
        "psubusb %%mm5, %%mm2\n"
        "psubusb %%mm6, %%mm4\n"
        "por %%mm1, %%mm2\n"
        "por %%mm3, %%mm4\n"
        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2, %%mm1\n"
        "movq %%mm4, %%mm3\n"
        "punpckhbw %%mm0, %%mm2\n"
        "punpckhbw %%mm0, %%mm4\n"
        "punpcklbw %%mm0, %%mm1\n" /* mm1 now spread over (mm1, mm2) */
        "punpcklbw %%mm0, %%mm3\n" /* mm4 now spread over (mm3, mm4) */
        "pmaddwd %%mm2, %%mm2\n"
        "pmaddwd %%mm4, %%mm4\n"
        "pmaddwd %%mm1, %%mm1\n"
        "pmaddwd %%mm3, %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "paddd %%mm2, %%mm1\n"
        "paddd %%mm4, %%mm3\n"
        "paddd %%mm1, %%mm7\n"
        "paddd %%mm3, %%mm7\n"
        "decl %%ecx\n"
        "jnz 1b\n"
        "movq %%mm7, %%mm1\n"
        "psrlq $32, %%mm7\n" /* shift hi dword to lo */
        "paddd %%mm7, %%mm1\n"
        "movd %%mm1, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
static int hf_noise8_mmx(uint8_t *pix1, int line_size, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm7, %%mm7\n"
        "pxor %%mm6, %%mm6\n"
        "movq (%0), %%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "1:\n"
        "movq (%0), %%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7, %%mm0\n"
        "punpckhwd %%mm7, %%mm6\n"
        "paddd %%mm0, %%mm6\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix1), "=r" (tmp)
        : "r" ((x86_reg) line_size), "g" (h - 2)
        : "%ecx");

    return tmp;
}
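
/*
 * Reference sketch (not part of the build): hf_noise8_mmx() estimates
 * high-frequency content by accumulating how much the horizontal gradient
 * changes from one row to the next. Modulo the exact row/edge handling of
 * the asm, a scalar equivalent looks like:
 *
 * static int hf_noise8_c_sketch(uint8_t *pix, int line_size, int h)
 * {
 *     int sum = 0;
 *     for (int y = 0; y < h - 1; y++) {
 *         for (int x = 0; x < 7; x++) {
 *             int d0 = pix[x]             - pix[x + 1];
 *             int d1 = pix[x + line_size] - pix[x + line_size + 1];
 *             sum += FFABS(d0 - d1);
 *         }
 *         pix += line_size;
 *     }
 *     return sum;
 * }
 */
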
static int hf_noise16_mmx(uint8_t *pix1, int line_size, int h)
{
    int tmp;
    uint8_t *pix = pix1;

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm7, %%mm7\n"
        "pxor %%mm6, %%mm6\n"
        "movq (%0), %%mm0\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "1:\n"
        "movq (%0), %%mm0\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7, %%mm0\n"
        "punpckhwd %%mm7, %%mm6\n"
        "paddd %%mm0, %%mm6\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix1), "=r" (tmp)
        : "r" ((x86_reg) line_size), "g" (h - 2)
        : "%ecx");

    return tmp + hf_noise8_mmx(pix + 8, line_size, h);
}
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int score1, score2;

    if (c)
        score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else
        score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2 = hf_noise16_mmx(pix1, line_size, h) -
             hf_noise16_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     int line_size, int h)
{
    int score1 = sse8_mmx(c, pix1, pix2, line_size, h);
    int score2 = hf_noise8_mmx(pix1, line_size, h) -
                 hf_noise8_mmx(pix2, line_size, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
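
/*
 * The two nsse functions above implement a "noise preserving" SSE: the plain
 * SSE score is penalized by how differently textured the two blocks are,
 * i.e. score = sse(pix1, pix2) + weight * |hf_noise(pix1) - hf_noise(pix2)|,
 * where weight comes from avctx->nsse_weight (8 when no context is given).
 */
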
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            int line_size, int h)
{
    int tmp;

    assert((((int) pix) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq 8(%0), %%mm3\n" \
    "add %2,%0\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM
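
/*
 * Reference sketch (not part of the build): vsad_intra16_mmx() above (and the
 * MMXEXT variant below) roughly computes the absolute vertical gradient of a
 * 16-pixel-wide block:
 *
 * static int vsad_intra16_c_sketch(uint8_t *pix, int line_size, int h)
 * {
 *     int sum = 0;
 *     for (int y = 0; y < h - 1; y++) {
 *         for (int x = 0; x < 16; x++)
 *             sum += FFABS(pix[x] - pix[x + line_size]);
 *         pix += line_size;
 *     }
 *     return sum;
 * }
 */
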
static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               int line_size, int h)
{
    int tmp;

    assert((((int) pix) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq 8(%0), " #out1 "\n" \
    "add %2, %0\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movd %%mm6, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      int line_size, int h)
{
    int tmp;

    assert((((int) pix1) & 7) == 0);
    assert((((int) pix2) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq (%1), " #out0 "\n" \
    "movq 8(%0), %%mm3\n" \
    "movq 8(%1), " #out1 "\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb " #out0 ", %%mm2\n" \
    "psubb " #out1 ", %%mm3\n" \
    "pxor %%mm7, %%mm2\n" \
    "pxor %%mm7, %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int tmp;

    assert((((int) pix1) & 7) == 0);
    assert((((int) pix2) & 7) == 0);
    assert((line_size & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq (%1), %%mm2\n" \
    "movq 8(%0), " #out1 "\n" \
    "movq 8(%1), %%mm3\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb %%mm2, " #out0 "\n" \
    "psubb %%mm3, " #out1 "\n" \
    "pxor %%mm7, " #out0 "\n" \
    "pxor %%mm7, " #out1 "\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n "

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movd %%mm6, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" ((x86_reg) line_size), "m" (h)
        : "%ecx");

    return tmp;
}
#undef SUM
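
/*
 * Reference sketch (not part of the build): the non-intra vsad16 variants
 * above work on the residual pix1 - pix2 rather than on the pixels
 * themselves; the pxor with mm7 (bytes of 0x80) biases the signed byte
 * differences so the unsigned absolute-difference trick still applies.
 * A rough scalar equivalent is:
 *
 * static int vsad16_c_sketch(uint8_t *pix1, uint8_t *pix2,
 *                            int line_size, int h)
 * {
 *     int sum = 0;
 *     for (int y = 0; y < h - 1; y++) {
 *         for (int x = 0; x < 16; x++) {
 *             int d0 = pix1[x]             - pix2[x];
 *             int d1 = pix1[x + line_size] - pix2[x + line_size];
 *             sum += FFABS(d1 - d0);
 *         }
 *         pix1 += line_size;
 *         pix2 += line_size;
 *     }
 *     return sum;
 * }
 */
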
#define MMABS_MMX(a,z) \
    "pxor " #z ", " #z " \n\t" \
    "pcmpgtw " #a ", " #z " \n\t" \
    "pxor " #z ", " #a " \n\t" \
    "psubw " #z ", " #a " \n\t"

#define MMABS_MMXEXT(a, z) \
    "pxor " #z ", " #z " \n\t" \
    "psubw " #a ", " #z " \n\t" \
    "pmaxsw " #z ", " #a " \n\t"

#define MMABS_SSSE3(a,z) \
    "pabsw " #a ", " #a " \n\t"

#define MMABS_SUM(a,z, sum) \
    MMABS(a,z) \
    "paddusw " #a ", " #sum " \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
 * up to about 100k on extreme inputs. But that's very unlikely to occur in
 * natural video, and it's even more unlikely to not have any alternative
 * mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst) \
    "movq " #a ", " #t " \n\t" \
    "psrlq $32, " #a " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movq " #a ", " #t " \n\t" \
    "psrlq $16, " #a " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t"

#define HSUM_MMXEXT(a, t, dst) \
    "pshufw $0x0E, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshufw $0x01, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t"

#define HSUM_SSE2(a, t, dst) \
    "movhlps " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshuflw $0x0E, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshuflw $0x01, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t"

#define DCT_SAD4(m, mm, o) \
    "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \
    "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \
    "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \
    "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \
    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \
    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \
    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \
    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1)

#define DCT_SAD_MMX \
    "pxor %%mm0, %%mm0 \n\t" \
    "pxor %%mm1, %%mm1 \n\t" \
    DCT_SAD4(q, %%mm, 0) \
    DCT_SAD4(q, %%mm, 8) \
    DCT_SAD4(q, %%mm, 64) \
    DCT_SAD4(q, %%mm, 72) \
    "paddusw %%mm1, %%mm0 \n\t" \
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2 \
    "pxor %%xmm0, %%xmm0 \n\t" \
    "pxor %%xmm1, %%xmm1 \n\t" \
    DCT_SAD4(dqa, %%xmm, 0) \
    DCT_SAD4(dqa, %%xmm, 64) \
    "paddusw %%xmm1, %%xmm0 \n\t" \
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_ ## cpu(int16_t *block) \
{ \
    int sum; \
    __asm__ volatile ( \
        DCT_SAD \
        :"=r"(sum) \
        :"r"(block)); \
    return sum & 0xFFFF; \
}

#define DCT_SAD DCT_SAD_MMX
#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
#define MMABS(a, z) MMABS_MMX(a, z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
#define MMABS(a, z) MMABS_MMXEXT(a, z)
DCT_SAD_FUNC(mmxext)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD DCT_SAD_SSE2
#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3_INLINE
#define MMABS(a, z) MMABS_SSSE3(a, z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD
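
/*
 * Reference sketch (not part of the build): each generated
 * sum_abs_dctelem_<cpu>() computes, up to the saturation caveat in the FIXME
 * above, the sum of absolute values of the 64 coefficients of an 8x8 DCT
 * block:
 *
 * static int sum_abs_dctelem_c_sketch(int16_t *block)
 * {
 *     int sum = 0;
 *     for (int i = 0; i < 64; i++)
 *         sum += FFABS(block[i]);
 *     return sum & 0xFFFF;
 * }
 */
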
#endif /* HAVE_INLINE_ASM */

int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h);

#define hadamard_func(cpu) \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                  uint8_t *src2, int stride, int h); \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                    uint8_t *src2, int stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                    unsigned high_bit_depth)
{
    int cpu_flags = av_get_cpu_flags();

    if (EXTERNAL_MMX(cpu_flags)) {
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_mmx;
        c->diff_pixels = ff_diff_pixels_mmx;
    }

    if (EXTERNAL_SSE2(cpu_flags))
        if (!high_bit_depth)
            c->get_pixels = ff_get_pixels_sse2;

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_mmx;
        c->sse[0] = sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4] = vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
        c->vsad[4] = vsad_intra16_mmxext;

        if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmxext;
        }
    }

    if (INLINE_SSE2(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
    }

#if HAVE_SSSE3_INLINE
    if (INLINE_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
    }
#endif
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }

    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
    }

    ff_dsputil_init_pix_mmx(c, avctx);
}
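
/*
 * Usage sketch (not part of the build): ff_dsputilenc_init_mmx() only
 * overrides DSPContext function pointers for features the running CPU
 * actually reports, so callers stay architecture-agnostic, e.g.:
 *
 *     // after the generic dsputil init has filled in c for this CPU:
 *     int score = c->sse[0](s, ref_block, cur_block, stride, 16);
 *
 * (ref_block, cur_block, stride and s are illustrative names only.)
 */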