/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 */

#include "../dsputil.h"
#include "x86_cpu.h"

static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL;
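
/* 8-pixel-wide SAD over h rows, plain MMX. MMX has no byte
 * absolute-difference instruction, so |a-b| is built from two
 * saturating subtractions ORed together; the bytes are then widened
 * against %mm7 (zeroed by the caller) and accumulated as words in
 * %mm6. Two rows are processed per loop iteration. */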
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
    );
}
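
/* MMX2 variant: psadbw computes the sum of absolute differences of
 * all eight bytes in one instruction, collapsing the unpack/add
 * ladder above to a single op per row. The running total stays in
 * %mm6. */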
static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "psadbw %%mm2, %%mm0            \n\t"
        "add %3, %%"REG_a"              \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "psadbw %%mm1, %%mm3            \n\t"
        "paddw %%mm3, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
    );
}
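
/* MMX2 SAD against the pavgb average of two blocks: the callers pass
 * (blk1, blk1+1) or (blk1, blk1+stride) to score half-pel x and
 * half-pel y positions against blk2. */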
static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "pavgb %%mm2, %%mm0             \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "psadbw %%mm2, %%mm0            \n\t"
        "add %4, %%"REG_a"              \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "pavgb %%mm1, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm1    \n\t"
        "psadbw %%mm1, %%mm3            \n\t"
        "paddw %%mm3, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), "r" ((long)stride)
    );
}
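
/* MMX2 SAD against the 2x2 (half-pel x+y) average. The exact
 * (a+b+c+d+2)>>2 average is approximated with chained pavgb;
 * subtracting `bone` (bytes of 0x01) from one intermediate reduces
 * the rounding bias. The result is not bit-exact, which is why the
 * init code below skips these variants under CODEC_FLAG_BITEXACT. */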
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{ //FIXME reuse src
    long len= -(stride*h);
    asm volatile(
        ASMALIGN(4)
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm1   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm3   \n\t"
        "pavgb %%mm2, %%mm0             \n\t"
        "pavgb %%mm1, %%mm3             \n\t"
        "psubusb %%mm5, %%mm3           \n\t"
        "pavgb %%mm3, %%mm0             \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "psadbw %%mm2, %%mm0            \n\t"
        "add %4, %%"REG_a"              \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "pavgb %%mm3, %%mm1             \n\t"
        "pavgb %%mm4, %%mm2             \n\t"
        "psubusb %%mm5, %%mm2           \n\t"
        "pavgb %%mm1, %%mm2             \n\t"
        "movq (%3, %%"REG_a"), %%mm1    \n\t"
        "psadbw %%mm1, %%mm2            \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
    );
}
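
/* Plain-MMX counterpart of sad8_2_mmx2: averages two source rows in
 * 16-bit precision, rounding with the constant the caller loaded
 * into %mm5 (round_tab[1]), then takes the usual subtract/OR
 * absolute difference against blk2. */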
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), "r" ((long)stride)
    );
}
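
/* Plain-MMX half-pel x+y SAD: computes the exact (a+b+c+d+2)>>2
 * average in 16-bit precision using round_tab[2] from %mm5, so this
 * version stays bit-exact, unlike sad8_4_mmx2. */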
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    long len= -(stride*h);
    asm volatile(
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "movq %%mm1, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm2, %%mm4             \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm3   \n\t"
        "movq %%mm2, %%mm1              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm0, %%mm2             \n\t"
        "paddw %%mm4, %%mm1             \n\t"
        "movq %%mm3, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm4, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm3    \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "paddw %%mm5, %%mm2             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "psrlw $2, %%mm2                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm2          \n\t"
        "psubusb %%mm2, %%mm3           \n\t"
        "psubusb %%mm4, %%mm2           \n\t"
        "por %%mm3, %%mm2               \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
    );
}
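
/* Fold the four 16-bit partial sums held in %mm6 into one scalar. */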
static inline int sum_mmx(void)
{
    int ret;
    asm volatile(
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret&0xFFFF;
}
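
/* With psadbw the total already sits in the low word of %mm6, so the
 * MMX2 path just moves it out. */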
static inline int sum_mmx2(void)
{
    int ret;
    asm volatile(
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret)
    );
    return ret;
}
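
/* Expand the public comparison functions for one instruction set
 * (suf = mmx or mmx2): full-pel, half-pel x, half-pel y and half-pel
 * xy SAD, each for 8x8 and 16x16 blocks. Every entry point clears
 * the %mm6 accumulator and %mm7, and loads the rounding constant
 * into %mm5 where the helpers expect it. */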
#define PIX_SAD(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    assert(h==8);\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
    return sum_ ## suf();\
}\
\
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t":);\
\
    sad8_1_ ## suf(blk1  , blk2  , stride, h);\
    sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+1, blk2  , stride, h);\
    sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[1]) \
                 );\
\
    sad8_2_ ## suf(blk1  , blk1+stride,   blk2  , stride, h);\
    sad8_2_ ## suf(blk1+8, blk1+stride+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
    asm volatile("pxor %%mm7, %%mm7     \n\t"\
                 "pxor %%mm6, %%mm6     \n\t"\
                 "movq %0, %%mm5        \n\t"\
                 :: "m"(round_tab[2]) \
                 );\
\
    sad8_4_ ## suf(blk1  , blk2  , stride, h);\
    sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
    return sum_ ## suf();\
}\

PIX_SAD(mmx)
PIX_SAD(mmx2)
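
/* Wire the generated functions into the DSPContext tables:
 * pix_abs[0][] holds the 16-pixel-wide variants, pix_abs[1][] the
 * 8-pixel-wide ones, indexed by half-pel position (0 = full pel,
 * 1 = x, 2 = y, 3 = xy). */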
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & MM_MMX) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0]= sad16_mmx;
        c->sad[1]= sad8_mmx;
    }
    if (mm_flags & MM_MMXEXT) {
        c->pix_abs[0][0] = sad16_mmx2;
        c->pix_abs[1][0] = sad8_mmx2;

        c->sad[0]= sad16_mmx2;
        c->sad[1]= sad8_mmx2;

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->pix_abs[0][1] = sad16_x2_mmx2;
            c->pix_abs[0][2] = sad16_y2_mmx2;
            c->pix_abs[0][3] = sad16_xy2_mmx2;
            c->pix_abs[1][1] = sad8_x2_mmx2;
            c->pix_abs[1][2] = sad8_y2_mmx2;
            c->pix_abs[1][3] = sad8_xy2_mmx2;
        }
    }
}
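
/* Usage sketch (illustrative only, not part of the original file):
 * once dsputil_init() has installed these on an MMX-capable CPU, a
 * motion estimator scores a candidate block roughly like
 *
 *     int cost = c->sad[0](s, cur, ref, stride, 16);
 *
 * and probes half-pel positions through c->pix_abs[size][pos]; note
 * that the third pointer argument is the one the half-pel variants
 * interpolate. */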