/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/hpeldsp.h"
#include "dsputil_mmx.h"

void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                           const uint8_t *pixels,
                                           ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
                                          const uint8_t *pixels,
                                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);

#if HAVE_INLINE_ASM

#define JUMPALIGN()     __asm__ volatile (".p2align 3" ::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

#define MOVQ_BFE(regd)                                  \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "paddb   %%"#regd", %%"#regd"   \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared libraries (PIC) it is better to build these constants in
// registers than to load them from memory; pcmpeqd sets a register to -1
#define MOVQ_BONE(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd  %%"#regd", %%"#regd"  \n\t"           \
        "psrlw          $15, %%"#regd"  \n\t"           \
        "packuswb %%"#regd", %%"#regd"  \n\t" ::)

#define MOVQ_WTWO(regd)                                 \
    __asm__ volatile (                                  \
        "pcmpeqd %%"#regd", %%"#regd"   \n\t"           \
        "psrlw         $15, %%"#regd"   \n\t"           \
        "psllw          $1, %%"#regd"   \n\t" ::)
#endif
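
/*
 * Illustrative sketch (not part of the original file, kept disabled): the
 * register-only constant construction above is equivalent to the following
 * scalar arithmetic on one 16-bit lane.  pcmpeqd yields 0xFFFF in every
 * word, psrlw $15 reduces that to 0x0001, and then either packuswb narrows
 * the words to the byte pattern 0x01 (ff_bone) or psllw $1 doubles them to
 * 0x0002 per word (ff_wtwo).  Function names here are hypothetical.
 */
#if 0
static uint16_t bone_word_sketch(void)
{
    uint16_t w = 0xFFFF; /* pcmpeqd: all bits set                    */
    w >>= 15;            /* psrlw $15: 0x0001 in each word           */
    return w;            /* packuswb then replicates the 0x01 bytes  */
}

static uint16_t wtwo_word_sketch(void)
{
    uint16_t w = 0xFFFF; /* pcmpeqd: all bits set                    */
    w >>= 15;            /* psrlw $15: 0x0001 in each word           */
    w <<= 1;             /* psllw $1:  0x0002 in each word           */
    return w;
}
#endif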
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe)       \
    "movq   "#rega", "#regr"            \n\t"           \
    "pand   "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq       $1, "#regb"            \n\t"           \
    "paddb  "#regb", "#regr"            \n\t"

#define PAVGB_MMX(rega, regb, regr, regfe)              \
    "movq   "#rega", "#regr"            \n\t"           \
    "por    "#regb", "#regr"            \n\t"           \
    "pxor   "#rega", "#regb"            \n\t"           \
    "pand  "#regfe", "#regb"            \n\t"           \
    "psrlq       $1, "#regb"            \n\t"           \
    "psubb  "#regb", "#regr"            \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp)   \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "pand  "#regb", "#regr"             \n\t"                   \
    "pand  "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand    %%mm6, "#regb"             \n\t"                   \
    "pand    %%mm6, "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "paddb "#regb", "#regr"             \n\t"                   \
    "paddb "#regd", "#regp"             \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp)          \
    "movq  "#rega", "#regr"             \n\t"                   \
    "movq  "#regc", "#regp"             \n\t"                   \
    "por   "#regb", "#regr"             \n\t"                   \
    "por   "#regd", "#regp"             \n\t"                   \
    "pxor  "#rega", "#regb"             \n\t"                   \
    "pxor  "#regc", "#regd"             \n\t"                   \
    "pand    %%mm6, "#regb"             \n\t"                   \
    "pand    %%mm6, "#regd"             \n\t"                   \
    "psrlq      $1, "#regd"             \n\t"                   \
    "psrlq      $1, "#regb"             \n\t"                   \
    "psubb "#regb", "#regr"             \n\t"                   \
    "psubb "#regd", "#regp"             \n\t"
/***********************************/
/* MMX no rounding */

#define NO_RND 1
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND   MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "hpeldsp_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef NO_RND
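
/*
 * Illustrative note (not part of the original file): including the template
 * twice with different DEF/SET_RND/PAVGBP/PAVGB bindings is a poor man's
 * C++ template.  With the "no rounding" bindings above, a template function
 * named DEF(put, pixels8_y2) expands to put_no_rnd_pixels8_y2_mmx; with the
 * rounding bindings below, the same line expands to put_pixels8_y2_mmx.
 */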
/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND   MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "hpeldsp_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

#endif /* HAVE_INLINE_ASM */
#if HAVE_YASM

#define ff_put_pixels8_mmx ff_put_pixels8_mmxext

/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow

#include "hpeldsp_avg_template.c"

#undef DEF

/***********************************/
/* MMXEXT specific */

#define DEF(x) x ## _mmxext

#include "hpeldsp_avg_template.c"

#undef DEF

#endif /* HAVE_YASM */

#if HAVE_INLINE_ASM

#define put_no_rnd_pixels16_mmx    put_pixels16_mmx
#define put_no_rnd_pixels8_mmx     put_pixels8_mmx
#define put_pixels16_mmxext        put_pixels16_mmx
#define put_pixels8_mmxext         put_pixels8_mmx
#define put_pixels4_mmxext         put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext  put_no_rnd_pixels8_mmx
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq     %%mm0, (%2)           \n\t"
        "movq     %%mm1, (%2, %3)       \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
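
/*
 * Illustrative sketch (not part of the original file, kept disabled): a
 * plain-C reference for the loop above.  Each iteration of the asm loop
 * copies four 8-byte rows from pixels to block, stepping both pointers by
 * two line_size strides at a time; h is assumed to be a multiple of 4.
 * The function name is hypothetical.
 */
#if 0
static void put_pixels8_c_sketch(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    int i, j;
    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)  /* one movq load + store per row */
            block[j] = pixels[j];
        block  += line_size;
        pixels += line_size;
    }
}
#endif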
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    __asm__ volatile (
        "lea   (%3, %3), %%"REG_a"      \n\t"
        ".p2align     3                 \n\t"
        "1:                             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "movq  (%1    ), %%mm0          \n\t"
        "movq 8(%1    ), %%mm4          \n\t"
        "movq  (%1, %3), %%mm1          \n\t"
        "movq 8(%1, %3), %%mm5          \n\t"
        "movq     %%mm0,  (%2)          \n\t"
        "movq     %%mm4, 8(%2)          \n\t"
        "movq     %%mm1,  (%2, %3)      \n\t"
        "movq     %%mm5, 8(%2, %3)      \n\t"
        "add  %%"REG_a", %1             \n\t"
        "add  %%"REG_a", %2             \n\t"
        "subl        $4, %0             \n\t"
        "jnz         1b                 \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}

#endif /* HAVE_INLINE_ASM */
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                          ptrdiff_t line_size, int h);

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                 \
    do {                                                                    \
        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)
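
/*
 * Illustrative note (not part of the original file): each table row pairs a
 * block size with a half-pel position -- [0] full-pel, [1] x half-pel,
 * [2] y half-pel, [3] xy half-pel.  For example,
 *     SET_HPEL_FUNCS(put, [0], 16, mmx);
 * expands to
 *     c->put_pixels_tab[0][0] = put_pixels16_mmx;
 *     c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
 *     c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
 *     c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
 */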
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_INLINE_ASM
    SET_HPEL_FUNCS(put,        [0], 16, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
    SET_HPEL_FUNCS(avg,        [0], 16, mmx);
    SET_HPEL_FUNCS(avg_no_rnd,    , 16, mmx);
    SET_HPEL_FUNCS(put,        [1],  8, mmx);
    SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
    SET_HPEL_FUNCS(avg,        [1],  8, mmx);
#endif /* HAVE_INLINE_ASM */
}
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_YASM
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;

    c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;

    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;

    if (!(flags & CODEC_FLAG_BITEXACT)) {
        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;

        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
    }

    if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
    }
#endif /* HAVE_YASM */
}
static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_YASM
    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
    c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;

    c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
    c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
    c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;

    c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
    c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;

    c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
    c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
    c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;

    if (!(flags & CODEC_FLAG_BITEXACT)) {
        c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
        c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;

        c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
        c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
    }

    if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
    }
#endif /* HAVE_YASM */
}
static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags)
{
#if HAVE_YASM
    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
        c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
        c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
    }
#endif /* HAVE_YASM */
}
void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
    int mm_flags = av_get_cpu_flags();

    if (HAVE_MMX && mm_flags & AV_CPU_FLAG_MMX)
        hpeldsp_init_mmx(c, flags, mm_flags);

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        hpeldsp_init_mmxext(c, flags, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        hpeldsp_init_3dnow(c, flags, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        hpeldsp_init_sse2(c, flags, mm_flags);
}
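
/*
 * Illustrative usage sketch (not part of the original file, kept disabled):
 * the init functions run in CPU-flag order, so later, more capable ISAs
 * overwrite the generic MMX entries.  A decoder then dispatches through the
 * filled tables; indices are [size][pel], i.e. [0] = 16-wide blocks,
 * [1] = 8-wide blocks, and 0..3 = full-pel, x, y, xy half-pel positions.
 * The helper name is hypothetical.
 */
#if 0
static void copy_block_example(HpelDSPContext *c,
                               uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int is_8x8, int hpel_pos)
{
    /* assumes ff_hpeldsp_init() has already filled the tables for this CPU */
    c->put_pixels_tab[is_8x8][hpel_pos](dst, src, stride, is_8x8 ? 8 : 16);
}
#endif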