/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <assert.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"

#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

#if HAVE_INLINE_ASM

static int sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    ptrdiff_t stride, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %4, %%ecx \n"
        "shr $1, %%ecx \n"
        "pxor %%mm0, %%mm0 \n"      /* mm0 = 0 */
        "pxor %%mm7, %%mm7 \n"      /* mm7 holds the sum */
        "1: \n"
        "movq (%0), %%mm1 \n"       /* mm1 = pix1[0][0 - 7] */
        "movq (%1), %%mm2 \n"       /* mm2 = pix2[0][0 - 7] */
        "movq (%0, %3), %%mm3 \n"   /* mm3 = pix1[1][0 - 7] */
        "movq (%1, %3), %%mm4 \n"   /* mm4 = pix2[1][0 - 7] */
        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /* OR the results to get absolute difference */
        "movq %%mm1, %%mm5 \n"
        "movq %%mm3, %%mm6 \n"
        "psubusb %%mm2, %%mm1 \n"
        "psubusb %%mm4, %%mm3 \n"
        "psubusb %%mm5, %%mm2 \n"
        "psubusb %%mm6, %%mm4 \n"
        "por %%mm1, %%mm2 \n"
        "por %%mm3, %%mm4 \n"
        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2, %%mm1 \n"
        "movq %%mm4, %%mm3 \n"
        "punpckhbw %%mm0, %%mm2 \n"
        "punpckhbw %%mm0, %%mm4 \n"
        "punpcklbw %%mm0, %%mm1 \n" /* mm1 now spread over (mm1, mm2) */
        "punpcklbw %%mm0, %%mm3 \n" /* mm4 now spread over (mm3, mm4) */
        "pmaddwd %%mm2, %%mm2 \n"
        "pmaddwd %%mm4, %%mm4 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm3, %%mm3 \n"
        "lea (%0, %3, 2), %0 \n"    /* pix1 += 2 * stride */
        "lea (%1, %3, 2), %1 \n"    /* pix2 += 2 * stride */
        "paddd %%mm2, %%mm1 \n"
        "paddd %%mm4, %%mm3 \n"
        "paddd %%mm1, %%mm7 \n"
        "paddd %%mm3, %%mm7 \n"
        "decl %%ecx \n"
        "jnz 1b \n"
        "movq %%mm7, %%mm1 \n"
        "psrlq $32, %%mm7 \n"       /* shift hi dword to lo */
        "paddd %%mm7, %%mm1 \n"
        "movd %%mm1, %2 \n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp;
}
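
/* For reference: ignoring the two-rows-per-iteration unrolling, the MMX loop
 * above amounts to roughly this scalar sum of squared differences (8 pixels
 * wide here, 16 in sse16_mmx below):
 *
 *     int sum = 0;
 *     for (int y = 0; y < h; y++)
 *         for (int x = 0; x < 8; x++) {
 *             int d = pix1[y * stride + x] - pix2[y * stride + x];
 *             sum += d * d;
 *         }
 *     return sum;
 */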

static int sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm0, %%mm0\n"       /* mm0 = 0 */
        "pxor %%mm7, %%mm7\n"       /* mm7 holds the sum */
        "1:\n"
        "movq (%0), %%mm1\n"        /* mm1 = pix1[0 - 7] */
        "movq (%1), %%mm2\n"        /* mm2 = pix2[0 - 7] */
        "movq 8(%0), %%mm3\n"       /* mm3 = pix1[8 - 15] */
        "movq 8(%1), %%mm4\n"       /* mm4 = pix2[8 - 15] */
        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /* OR the results to get absolute difference */
        "movq %%mm1, %%mm5\n"
        "movq %%mm3, %%mm6\n"
        "psubusb %%mm2, %%mm1\n"
        "psubusb %%mm4, %%mm3\n"
        "psubusb %%mm5, %%mm2\n"
        "psubusb %%mm6, %%mm4\n"
        "por %%mm1, %%mm2\n"
        "por %%mm3, %%mm4\n"
        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2, %%mm1\n"
        "movq %%mm4, %%mm3\n"
        "punpckhbw %%mm0, %%mm2\n"
        "punpckhbw %%mm0, %%mm4\n"
        "punpcklbw %%mm0, %%mm1\n"  /* mm1 now spread over (mm1, mm2) */
        "punpcklbw %%mm0, %%mm3\n"  /* mm4 now spread over (mm3, mm4) */
        "pmaddwd %%mm2, %%mm2\n"
        "pmaddwd %%mm4, %%mm4\n"
        "pmaddwd %%mm1, %%mm1\n"
        "pmaddwd %%mm3, %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "paddd %%mm2, %%mm1\n"
        "paddd %%mm4, %%mm3\n"
        "paddd %%mm1, %%mm7\n"
        "paddd %%mm3, %%mm7\n"
        "decl %%ecx\n"
        "jnz 1b\n"
        "movq %%mm7, %%mm1\n"
        "psrlq $32, %%mm7\n"        /* shift hi dword to lo */
        "paddd %%mm7, %%mm1\n"
        "movd %%mm1, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp;
}

static int hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
{
    int tmp;

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm7, %%mm7\n"
        "pxor %%mm6, %%mm6\n"
        "movq (%0), %%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "1:\n"
        "movq (%0), %%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7, %%mm0\n"
        "punpckhwd %%mm7, %%mm6\n"
        "paddd %%mm0, %%mm6\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix1), "=r" (tmp)
        : "r" (stride), "g" (h - 2)
        : "%ecx");

    return tmp;
}
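
/* Rough scalar equivalent of hf_noise8_mmx (a sketch for readability, not an
 * exact drop-in): it sums the absolute vertical change of the horizontal
 * gradient, which the NSSE metrics below use as a cheap high-frequency
 * "noise" measure:
 *
 *     int sum = 0;
 *     for (int y = 0; y < h - 1; y++)
 *         for (int x = 0; x < 7; x++)
 *             sum += FFABS((pix1[y * stride + x]       - pix1[y * stride + x + 1]) -
 *                          (pix1[(y + 1) * stride + x] - pix1[(y + 1) * stride + x + 1]));
 *     return sum;
 */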

static int hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
{
    int tmp;
    uint8_t *pix = pix1;

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm7, %%mm7\n"
        "pxor %%mm6, %%mm6\n"
        "movq (%0), %%mm0\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "1:\n"
        "movq (%0), %%mm0\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm0\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm2\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"
        "add %2, %0\n"
        "movq (%0), %%mm4\n"
        "movq 1(%0), %%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7, %%mm4\n"
        "punpcklbw %%mm7, %%mm1\n"
        "punpckhbw %%mm7, %%mm5\n"
        "punpckhbw %%mm7, %%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"
        "add %2, %0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7, %%mm0\n"
        "punpckhwd %%mm7, %%mm6\n"
        "paddd %%mm0, %%mm6\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix1), "=r" (tmp)
        : "r" (stride), "g" (h - 2)
        : "%ecx");

    return tmp + hf_noise8_mmx(pix + 8, stride, h);
}

static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    else
        score1 = sse16_mmx(c, pix1, pix2, stride, h);
    score2 = hf_noise16_mmx(pix1, stride, h) -
             hf_noise16_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int score1 = sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = hf_noise8_mmx(pix1, stride, h) -
                 hf_noise8_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}
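
/* In other words, the NSSE ("noise-preserving SSE") score computed above is
 * approximately
 *
 *     sse(pix1, pix2) + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|
 *
 * with a default weight of 8 when no encoder context is available, so that
 * predictions which add or remove high-frequency detail are penalized. */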

static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
{
    int tmp;

    assert((((int) pix) & 7) == 0);
    assert((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq 8(%0), %%mm3\n" \
    "add %2,%0\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM
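
/* vsad_intra16_* scores vertical activity inside a single picture; a scalar
 * sketch of the sum produced above (truncated to 16 bits on return):
 *
 *     int sum = 0;
 *     for (int y = 1; y < h; y++)
 *         for (int x = 0; x < 16; x++)
 *             sum += FFABS(pix[y * stride + x] - pix[(y - 1) * stride + x]);
 */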

static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                               ptrdiff_t stride, int h)
{
    int tmp;

    assert((((int) pix) & 7) == 0);
    assert((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq 8(%0), " #out1 "\n" \
    "add %2, %0\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movd %%mm6, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int tmp;

    assert((((int) pix1) & 7) == 0);
    assert((((int) pix2) & 7) == 0);
    assert((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq (%1), " #out0 "\n" \
    "movq 8(%0), %%mm3\n" \
    "movq 8(%1), " #out1 "\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb " #out0 ", %%mm2\n" \
    "psubb " #out1 ", %%mm3\n" \
    "pxor %%mm7, %%mm2\n" \
    "pxor %%mm7, %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM
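
/* vsad16_* applies the same vertical-SAD idea to the prediction error: with
 * d(x, y) = pix1[y * stride + x] - pix2[y * stride + x], the loop above sums,
 * roughly, FFABS(d(x, y) - d(x, y - 1)) over x = 0..15 and y = 1..h-1 (the
 * MMX version additionally masks the result to 15 bits). */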

static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    int tmp;

    assert((((int) pix1) & 7) == 0);
    assert((((int) pix2) & 7) == 0);
    assert((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n" \
    "movq (%1), %%mm2\n" \
    "movq 8(%0), " #out1 "\n" \
    "movq 8(%1), %%mm3\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb %%mm2, " #out0 "\n" \
    "psubb %%mm3, " #out1 "\n" \
    "pxor %%mm7, " #out0 "\n" \
    "pxor %%mm7, " #out1 "\n" \
    "psadbw " #out0 ", " #in0 "\n" \
    "psadbw " #out1 ", " #in1 "\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "subl $2, %%ecx\n"
        "jnz 1b\n"
        "movd %%mm6, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

#define MMABS_MMX(a, z) \
    "pxor " #z ", " #z " \n\t" \
    "pcmpgtw " #a ", " #z " \n\t" \
    "pxor " #z ", " #a " \n\t" \
    "psubw " #z ", " #a " \n\t"

#define MMABS_MMXEXT(a, z) \
    "pxor " #z ", " #z " \n\t" \
    "psubw " #a ", " #z " \n\t" \
    "pmaxsw " #z ", " #a " \n\t"

#define MMABS_SSSE3(a, z) \
    "pabsw " #a ", " #a " \n\t"

#define MMABS_SUM(a, z, sum) \
    MMABS(a, z) \
    "paddusw " #a ", " #sum " \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get
 * up to about 100k on extreme inputs. But that's very unlikely to occur in
 * natural video, and it's even more unlikely to not have any alternative
 * mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst) \
    "movq " #a ", " #t " \n\t" \
    "psrlq $32, " #a " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movq " #a ", " #t " \n\t" \
    "psrlq $16, " #a " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t"

#define HSUM_MMXEXT(a, t, dst) \
    "pshufw $0x0E, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshufw $0x01, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t"

#define HSUM_SSE2(a, t, dst) \
    "movhlps " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshuflw $0x0E, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "pshuflw $0x01, " #a ", " #t " \n\t" \
    "paddusw " #t ", " #a " \n\t" \
    "movd " #a ", " #dst " \n\t"

#define DCT_SAD4(m, mm, o) \
    "mov"#m" "#o" + 0(%1), " #mm "2 \n\t" \
    "mov"#m" "#o" + 16(%1), " #mm "3 \n\t" \
    "mov"#m" "#o" + 32(%1), " #mm "4 \n\t" \
    "mov"#m" "#o" + 48(%1), " #mm "5 \n\t" \
    MMABS_SUM(mm ## 2, mm ## 6, mm ## 0) \
    MMABS_SUM(mm ## 3, mm ## 7, mm ## 1) \
    MMABS_SUM(mm ## 4, mm ## 6, mm ## 0) \
    MMABS_SUM(mm ## 5, mm ## 7, mm ## 1)

#define DCT_SAD_MMX \
    "pxor %%mm0, %%mm0 \n\t" \
    "pxor %%mm1, %%mm1 \n\t" \
    DCT_SAD4(q, %%mm, 0) \
    DCT_SAD4(q, %%mm, 8) \
    DCT_SAD4(q, %%mm, 64) \
    DCT_SAD4(q, %%mm, 72) \
    "paddusw %%mm1, %%mm0 \n\t" \
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2 \
    "pxor %%xmm0, %%xmm0 \n\t" \
    "pxor %%xmm1, %%xmm1 \n\t" \
    DCT_SAD4(dqa, %%xmm, 0) \
    DCT_SAD4(dqa, %%xmm, 64) \
    "paddusw %%xmm1, %%xmm0 \n\t" \
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_ ## cpu(int16_t *block) \
{ \
    int sum; \
    __asm__ volatile ( \
        DCT_SAD \
        :"=r"(sum) \
        :"r"(block)); \
    return sum & 0xFFFF; \
}

#define DCT_SAD DCT_SAD_MMX
#define HSUM(a, t, dst) HSUM_MMX(a, t, dst)
#define MMABS(a, z) MMABS_MMX(a, z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a, t, dst) HSUM_MMXEXT(a, t, dst)
#define MMABS(a, z) MMABS_MMXEXT(a, z)
DCT_SAD_FUNC(mmxext)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD DCT_SAD_SSE2
#define HSUM(a, t, dst) HSUM_SSE2(a, t, dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3_INLINE
#define MMABS(a, z) MMABS_SSSE3(a, z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD
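
/* Each sum_abs_dctelem_* variant generated above computes, in effect,
 *
 *     int sum = 0;
 *     for (int i = 0; i < 64; i++)
 *         sum += FFABS(block[i]);
 *     return sum & 0xFFFF;
 *
 * subject to the 16-bit saturation noted in the FIXME above HSUM_*. */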

DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;

static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -(stride * h);
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
        "add %3, %%"FF_REG_a" \n\t"
        "psubusb %%mm0, %%mm2 \n\t"
        "psubusb %%mm4, %%mm0 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm5, %%mm1 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %3, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}
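
/* The sad8_1_* helpers accumulate, into %mm6, roughly
 *
 *     for (int y = 0; y < h; y++)
 *         for (int x = 0; x < 8; x++)
 *             acc += FFABS(blk1[y * stride + x] - blk2[y * stride + x]);
 *
 * and rely on the PIX_SAD() wrappers further down to zero %mm6/%mm7 first and
 * to read the result back via sum_mmx()/sum_mmxext(). */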

static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 ptrdiff_t stride, int h)
{
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "psadbw (%2), %%mm0 \n\t"
        "psadbw (%2, %3), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm1, %%mm6 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "lea (%2,%3,2), %2 \n\t"
        "sub $2, %0 \n\t"
        " jg 1b \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" (stride));
}

static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
                      ptrdiff_t stride, int h)
{
    int ret;
    __asm__ volatile (
        "pxor %%xmm2, %%xmm2 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movdqu (%1), %%xmm0 \n\t"
        "movdqu (%1, %4), %%xmm1 \n\t"
        "psadbw (%2), %%xmm0 \n\t"
        "psadbw (%2, %4), %%xmm1 \n\t"
        "paddw %%xmm0, %%xmm2 \n\t"
        "paddw %%xmm1, %%xmm2 \n\t"
        "lea (%1,%4,2), %1 \n\t"
        "lea (%2,%4,2), %2 \n\t"
        "sub $2, %0 \n\t"
        " jg 1b \n\t"
        "movhlps %%xmm2, %%xmm0 \n\t"
        "paddw %%xmm0, %%xmm2 \n\t"
        "movd %%xmm2, %3 \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
        : "r" (stride));
    return ret;
}

static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   ptrdiff_t stride, int h)
{
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "pavgb 1(%1), %%mm0 \n\t"
        "pavgb 1(%1, %3), %%mm1 \n\t"
        "psadbw (%2), %%mm0 \n\t"
        "psadbw (%2, %3), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm1, %%mm6 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "lea (%2,%3,2), %2 \n\t"
        "sub $2, %0 \n\t"
        " jg 1b \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" (stride));
}

static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   ptrdiff_t stride, int h)
{
    __asm__ volatile (
        "movq (%1), %%mm0 \n\t"
        "add %3, %1 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq (%1, %3), %%mm2 \n\t"
        "pavgb %%mm1, %%mm0 \n\t"
        "pavgb %%mm2, %%mm1 \n\t"
        "psadbw (%2), %%mm0 \n\t"
        "psadbw (%2, %3), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm1, %%mm6 \n\t"
        "movq %%mm2, %%mm0 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "lea (%2,%3,2), %2 \n\t"
        "sub $2, %0 \n\t"
        " jg 1b \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" (stride));
}

static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 ptrdiff_t stride, int h)
{
    __asm__ volatile (
        "movq "MANGLE(bone)", %%mm5 \n\t"
        "movq (%1), %%mm0 \n\t"
        "pavgb 1(%1), %%mm0 \n\t"
        "add %3, %1 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1), %%mm1 \n\t"
        "movq (%1,%3), %%mm2 \n\t"
        "pavgb 1(%1), %%mm1 \n\t"
        "pavgb 1(%1,%3), %%mm2 \n\t"
        "psubusb %%mm5, %%mm1 \n\t"
        "pavgb %%mm1, %%mm0 \n\t"
        "pavgb %%mm2, %%mm1 \n\t"
        "psadbw (%2), %%mm0 \n\t"
        "psadbw (%2,%3), %%mm1 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm1, %%mm6 \n\t"
        "movq %%mm2, %%mm0 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "lea (%2,%3,2), %2 \n\t"
        "sub $2, %0 \n\t"
        " jg 1b \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" (stride));
}
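
/* The x2/y2/xy2 variants compare blk2 against a half-pel interpolated version
 * of blk1: x2 averages horizontal neighbours, y2 averages vertical neighbours,
 * and the four-point case approximates
 *
 *     ref(x, y) = (blk1[y][x] + blk1[y][x + 1] +
 *                  blk1[y + 1][x] + blk1[y + 1][x + 1] + 2) >> 2
 *
 * (rounding handled via round_tab/bone) before taking the SAD against blk2. */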

static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -(stride * h);
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "psrlw $1, %%mm1 \n\t"
        "psrlw $1, %%mm3 \n\t"
        "packuswb %%mm3, %%mm1 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm2, %%mm1 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "movq %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %4, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" (stride));
}

static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -(stride * h);
    __asm__ volatile (
        "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
        "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2\n\t"
        "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddw %%mm4, %%mm2 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        "paddw %%mm5, %%mm0 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "psubusb %%mm0, %%mm4 \n\t"
        "psubusb %%mm5, %%mm0 \n\t"
        "por %%mm4, %%mm0 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm4 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm4, %%mm6 \n\t"
        "movq %%mm2, %%mm0 \n\t"
        "movq %%mm3, %%mm1 \n\t"
        "add %4, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" (stride));
}

static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

static inline int sum_mmxext(void)
{
    int ret;
    __asm__ volatile (
        "movd %%mm6, %0 \n\t"
        : "=r" (ret));
    return ret;
}

static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

#define PIX_SAD(suf) \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                        uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    assert(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
 \
    sad8_1_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    assert(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    assert(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    assert(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        ::); \
 \
    sad8_4_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                         uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
 \
    sad8_1_ ## suf(blk1, blk2, stride, h); \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_x2a_ ## suf(blk1, blk2, stride, h); \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_y2a_ ## suf(blk1, blk2, stride, h); \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                             uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        ::); \
 \
    sad8_4_ ## suf(blk1, blk2, stride, h); \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
}

PIX_SAD(mmx)
PIX_SAD(mmxext)

#endif /* HAVE_INLINE_ASM */

int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);

#define hadamard_func(cpu) \
int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                              uint8_t *src2, ptrdiff_t stride, int h); \
int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_mmx;

        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->sse[0] = sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4] = vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

    if (INLINE_MMXEXT(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_mmxext;

        c->vsad[4] = vsad_intra16_mmxext;

        c->pix_abs[0][0] = sad16_mmxext;
        c->pix_abs[1][0] = sad8_mmxext;

        c->sad[0] = sad16_mmxext;
        c->sad[1] = sad8_mmxext;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][1] = sad16_x2_mmxext;
            c->pix_abs[0][2] = sad16_y2_mmxext;
            c->pix_abs[0][3] = sad16_xy2_mmxext;
            c->pix_abs[1][1] = sad8_x2_mmxext;
            c->pix_abs[1][2] = sad8_y2_mmxext;
            c->pix_abs[1][3] = sad8_xy2_mmxext;

            c->vsad[0] = vsad16_mmxext;
        }
    }

    if (INLINE_SSE2(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_sse2;
    }

    if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
        c->sad[0] = sad16_sse2;
    }

#if HAVE_SSSE3_INLINE
    if (INLINE_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
    }
#endif
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
    }

    if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
    }
}