  1. /*
  2. * This file is part of FFmpeg.
  3. *
  4. * FFmpeg is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU Lesser General Public
  6. * License as published by the Free Software Foundation; either
  7. * version 2.1 of the License, or (at your option) any later version.
  8. *
  9. * FFmpeg is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * Lesser General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Lesser General Public
  15. * License along with FFmpeg; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17. */
  18. #include "libavutil/attributes.h"
  19. #include "libavutil/avassert.h"
  20. #include "libavutil/cpu.h"
  21. #include "libavutil/x86/cpu.h"
  22. #include "libavcodec/avcodec.h"
  23. #include "libavcodec/mpegvideoencdsp.h"
  24. int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
  25. int ff_pix_sum16_sse2(uint8_t *pix, int line_size);
  26. int ff_pix_sum16_xop(uint8_t *pix, int line_size);
  27. int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
  28. int ff_pix_norm1_sse2(uint8_t *pix, int line_size);
#if HAVE_INLINE_ASM

/* PHADDD(a, t): horizontal add of the two 32-bit halves of MMX register a
 * (sum ends up in the low dword of a); t is a scratch register.
 * Presumably consumed by mpegvideoenc_qns_template.c — confirm there. */
#define PHADDD(a, t) \
    "movq " #a ", " #t " \n\t" \
    "psrlq $32, " #a " \n\t" \
    "paddd " #t ", " #a " \n\t"

/*
 * pmulhw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
 * pmulhrw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
 * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
 */

/* MMX fallback for a rounding high multiply: pmulhw truncates, so add the
 * bias in o (set up via SET_RND, see below) and shift right by one. */
#define PMULHRW(x, y, s, o) \
    "pmulhw " #s ", " #x " \n\t" \
    "pmulhw " #s ", " #y " \n\t" \
    "paddw " #o ", " #x " \n\t" \
    "paddw " #o ", " #y " \n\t" \
    "psraw $1, " #x " \n\t" \
    "psraw $1, " #y " \n\t"

/* Instantiate the QNS template with an _mmx suffix.  SET_RND/SCALE_OFFSET
 * are template parameters; MOVQ_WONE presumably loads the word-wise
 * rounding constant — defined outside this file, confirm in the template. */
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "mpegvideoenc_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

/* 3DNow! instantiation: pmulhrw rounds natively, so no bias/shift is
 * needed (SET_RND expands to nothing, SCALE_OFFSET differs accordingly). */
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o) \
    "pmulhrw " #s ", " #x " \n\t" \
    "pmulhrw " #s ", " #y " \n\t"

#include "mpegvideoenc_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3_INLINE
#undef PHADDD

/* SSSE3 instantiation: pmulhrsw rounds with a different output scaling
 * (see table above), hence SCALE_OFFSET -1; PHADDD is redefined to use
 * pshufw + paddd instead of phaddd. */
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1

#define PHADDD(a, t) \
    "pshufw $0x0E, " #a ", " #t " \n\t" \
    /* faster than phaddd on core2 */ \
    "paddd " #t ", " #a " \n\t"

#define PMULHRW(x, y, s, o) \
    "pmulhrsw " #s ", " #x " \n\t" \
    "pmulhrsw " #s ", " #y " \n\t"

#include "mpegvideoenc_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD

#endif /* HAVE_SSSE3_INLINE */
/* Draw the edges of width 'w' of an image of size width, height
 * this MMX version can only handle w == 8 || w == 16.
 *
 * Replicates the outermost pixels of the image at 'buf' (line stride
 * 'wrap' bytes) into a border: 'w' pixels on the left and right of every
 * line, and — depending on the EDGE_TOP/EDGE_BOTTOM bits of 'sides' —
 * 'h' lines above and/or below, corners included.
 * NOTE(review): despite the header comment, the else branch asserts
 * w == 4, so three widths (4/8/16) are actually handled. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    /* Pointer to the first pixel of the bottom image line. */
    last_line = buf + (height - 1) * wrap;

    /* left and right */
    ptr = buf;
    if (w == 8) {
        /* Per line: splat the first pixel into the 8 bytes to its left
         * (punpck*bw/wd/dq broadcast one byte across mm0) and the last
         * pixel into the 8 bytes to its right; advance by 'wrap' until
         * ptr reaches buf + wrap * height. */
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
              "r" (ptr + wrap * height));
    } else if (w == 16) {
        /* Same as the w == 8 case, but the broadcast pixel is stored
         * twice on each side (16 border bytes). */
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "punpckldq %%mm0, %%mm0 \n\t"
            "movq %%mm0, -8(%0) \n\t"
            "movq %%mm0, -16(%0) \n\t"
            "movq -8(%0, %2), %%mm1 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movq %%mm1, (%0, %2) \n\t"
            "movq %%mm1, 8(%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        av_assert1(w == 4);
        /* 4-byte border variant: only movd stores, so each side gets a
         * single broadcast dword. */
        __asm__ volatile (
            "1: \n\t"
            "movd (%0), %%mm0 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpcklwd %%mm0, %%mm0 \n\t"
            "movd %%mm0, -4(%0) \n\t"
            "movd -4(%0, %2), %%mm1 \n\t"
            "punpcklbw %%mm1, %%mm1 \n\t"
            "punpckhwd %%mm1, %%mm1 \n\t"
            "punpckhdq %%mm1, %%mm1 \n\t"
            "movd %%mm1, (%0, %2) \n\t"
            "add %1, %0 \n\t"
            "cmp %3, %0 \n\t"
            "jb 1b \n\t"
            : "+r" (ptr)
            : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
              "r" (ptr + wrap * height));
    }

    /* top and bottom (and hopefully also the corners) */
    /* Each asm invocation copies the already edge-extended top (resp.
     * bottom) image line, width + 2 * w bytes of it, into 4 border lines
     * at once: %2 holds -wrap (resp. wrap) and %3 holds 3 * that stride.
     * NOTE(review): the loop steps i by 4, so h is presumably a multiple
     * of 4 — confirm with callers. */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r" (ptr)
                : "r" ((x86_reg) buf - (x86_reg) ptr - w),
                  "r" ((x86_reg) - wrap), "r" ((x86_reg) - wrap * 3),
                  "r" (ptr + width + 2 * w));
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1: \n\t"
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                "add $8, %0 \n\t"
                "cmp %4, %0 \n\t"
                "jb 1b \n\t"
                : "+r" (ptr)
                : "r" ((x86_reg) last_line - (x86_reg) ptr - w),
                  "r" ((x86_reg) wrap), "r" ((x86_reg) wrap * 3),
                  "r" (ptr + width + 2 * w));
        }
    }
}
  194. #endif /* HAVE_INLINE_ASM */
  195. av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
  196. AVCodecContext *avctx)
  197. {
  198. int cpu_flags = av_get_cpu_flags();
  199. if (EXTERNAL_MMX(cpu_flags)) {
  200. c->pix_sum = ff_pix_sum16_mmx;
  201. c->pix_norm1 = ff_pix_norm1_mmx;
  202. }
  203. if (EXTERNAL_SSE2(cpu_flags)) {
  204. c->pix_sum = ff_pix_sum16_sse2;
  205. c->pix_norm1 = ff_pix_norm1_sse2;
  206. }
  207. if (EXTERNAL_XOP(cpu_flags)) {
  208. c->pix_sum = ff_pix_sum16_xop;
  209. }
  210. #if HAVE_INLINE_ASM
  211. if (INLINE_MMX(cpu_flags)) {
  212. if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
  213. c->try_8x8basis = try_8x8basis_mmx;
  214. }
  215. c->add_8x8basis = add_8x8basis_mmx;
  216. if (avctx->bits_per_raw_sample <= 8) {
  217. c->draw_edges = draw_edges_mmx;
  218. }
  219. }
  220. if (INLINE_AMD3DNOW(cpu_flags)) {
  221. if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
  222. c->try_8x8basis = try_8x8basis_3dnow;
  223. }
  224. c->add_8x8basis = add_8x8basis_3dnow;
  225. }
  226. #if HAVE_SSSE3_INLINE
  227. if (INLINE_SSSE3(cpu_flags)) {
  228. if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
  229. c->try_8x8basis = try_8x8basis_ssse3;
  230. }
  231. c->add_8x8basis = add_8x8basis_ssse3;
  232. }
  233. #endif /* HAVE_SSSE3_INLINE */
  234. #endif /* HAVE_INLINE_ASM */
  235. }