You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

234 lines
7.9KB

  1. /*
  2. * This file is part of Libav.
  3. *
  4. * Libav is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU Lesser General Public
  6. * License as published by the Free Software Foundation; either
  7. * version 2.1 of the License, or (at your option) any later version.
  8. *
  9. * Libav is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * Lesser General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Lesser General Public
  15. * License along with Libav; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17. */
  18. #include "libavutil/attributes.h"
  19. #include "libavutil/cpu.h"
  20. #include "libavutil/x86/cpu.h"
  21. #include "libavcodec/avcodec.h"
  22. #include "libavcodec/mpegvideoencdsp.h"
  23. int ff_pix_sum16_mmx(uint8_t *pix, int line_size);
  24. int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
  25. #if HAVE_INLINE_ASM
  26. #define PHADDD(a, t) \
  27. "movq " #a ", " #t " \n\t" \
  28. "psrlq $32, " #a " \n\t" \
  29. "paddd " #t ", " #a " \n\t"
  30. /*
  31. * pmulhw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
  32. * pmulhrw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
  33. * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
  34. */
  35. #define PMULHRW(x, y, s, o) \
  36. "pmulhw " #s ", " #x " \n\t" \
  37. "pmulhw " #s ", " #y " \n\t" \
  38. "paddw " #o ", " #x " \n\t" \
  39. "paddw " #o ", " #y " \n\t" \
  40. "psraw $1, " #x " \n\t" \
  41. "psraw $1, " #y " \n\t"
  42. #define DEF(x) x ## _mmx
  43. #define SET_RND MOVQ_WONE
  44. #define SCALE_OFFSET 1
  45. #include "mpegvideoenc_qns_template.c"
  46. #undef DEF
  47. #undef SET_RND
  48. #undef SCALE_OFFSET
  49. #undef PMULHRW
  50. #define DEF(x) x ## _3dnow
  51. #define SET_RND(x)
  52. #define SCALE_OFFSET 0
  53. #define PMULHRW(x, y, s, o) \
  54. "pmulhrw " #s ", " #x " \n\t" \
  55. "pmulhrw " #s ", " #y " \n\t"
  56. #include "mpegvideoenc_qns_template.c"
  57. #undef DEF
  58. #undef SET_RND
  59. #undef SCALE_OFFSET
  60. #undef PMULHRW
  61. #if HAVE_SSSE3_INLINE
  62. #undef PHADDD
  63. #define DEF(x) x ## _ssse3
  64. #define SET_RND(x)
  65. #define SCALE_OFFSET -1
  66. #define PHADDD(a, t) \
  67. "pshufw $0x0E, " #a ", " #t " \n\t" \
  68. /* faster than phaddd on core2 */ \
  69. "paddd " #t ", " #a " \n\t"
  70. #define PMULHRW(x, y, s, o) \
  71. "pmulhrsw " #s ", " #x " \n\t" \
  72. "pmulhrsw " #s ", " #y " \n\t"
  73. #include "mpegvideoenc_qns_template.c"
  74. #undef DEF
  75. #undef SET_RND
  76. #undef SCALE_OFFSET
  77. #undef PMULHRW
  78. #undef PHADDD
  79. #endif /* HAVE_SSSE3_INLINE */
  80. /* Draw the edges of width 'w' of an image of size width, height
  81. * this MMX version can only handle w == 8 || w == 16. */
  82. static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
  83. int w, int h, int sides)
  84. {
  85. uint8_t *ptr, *last_line;
  86. int i;
  87. last_line = buf + (height - 1) * wrap;
  88. /* left and right */
  89. ptr = buf;
  90. if (w == 8) {
  91. __asm__ volatile (
  92. "1: \n\t"
  93. "movd (%0), %%mm0 \n\t"
  94. "punpcklbw %%mm0, %%mm0 \n\t"
  95. "punpcklwd %%mm0, %%mm0 \n\t"
  96. "punpckldq %%mm0, %%mm0 \n\t"
  97. "movq %%mm0, -8(%0) \n\t"
  98. "movq -8(%0, %2), %%mm1 \n\t"
  99. "punpckhbw %%mm1, %%mm1 \n\t"
  100. "punpckhwd %%mm1, %%mm1 \n\t"
  101. "punpckhdq %%mm1, %%mm1 \n\t"
  102. "movq %%mm1, (%0, %2) \n\t"
  103. "add %1, %0 \n\t"
  104. "cmp %3, %0 \n\t"
  105. "jb 1b \n\t"
  106. : "+r" (ptr)
  107. : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
  108. "r" (ptr + wrap * height));
  109. } else {
  110. __asm__ volatile (
  111. "1: \n\t"
  112. "movd (%0), %%mm0 \n\t"
  113. "punpcklbw %%mm0, %%mm0 \n\t"
  114. "punpcklwd %%mm0, %%mm0 \n\t"
  115. "punpckldq %%mm0, %%mm0 \n\t"
  116. "movq %%mm0, -8(%0) \n\t"
  117. "movq %%mm0, -16(%0) \n\t"
  118. "movq -8(%0, %2), %%mm1 \n\t"
  119. "punpckhbw %%mm1, %%mm1 \n\t"
  120. "punpckhwd %%mm1, %%mm1 \n\t"
  121. "punpckhdq %%mm1, %%mm1 \n\t"
  122. "movq %%mm1, (%0, %2) \n\t"
  123. "movq %%mm1, 8(%0, %2) \n\t"
  124. "add %1, %0 \n\t"
  125. "cmp %3, %0 \n\t"
  126. "jb 1b \n\t"
  127. : "+r" (ptr)
  128. : "r" ((x86_reg) wrap), "r" ((x86_reg) width),
  129. "r" (ptr + wrap * height));
  130. }
  131. /* top and bottom (and hopefully also the corners) */
  132. if (sides & EDGE_TOP) {
  133. for (i = 0; i < h; i += 4) {
  134. ptr = buf - (i + 1) * wrap - w;
  135. __asm__ volatile (
  136. "1: \n\t"
  137. "movq (%1, %0), %%mm0 \n\t"
  138. "movq %%mm0, (%0) \n\t"
  139. "movq %%mm0, (%0, %2) \n\t"
  140. "movq %%mm0, (%0, %2, 2) \n\t"
  141. "movq %%mm0, (%0, %3) \n\t"
  142. "add $8, %0 \n\t"
  143. "cmp %4, %0 \n\t"
  144. "jb 1b \n\t"
  145. : "+r" (ptr)
  146. : "r" ((x86_reg) buf - (x86_reg) ptr - w),
  147. "r" ((x86_reg) - wrap), "r" ((x86_reg) - wrap * 3),
  148. "r" (ptr + width + 2 * w));
  149. }
  150. }
  151. if (sides & EDGE_BOTTOM) {
  152. for (i = 0; i < h; i += 4) {
  153. ptr = last_line + (i + 1) * wrap - w;
  154. __asm__ volatile (
  155. "1: \n\t"
  156. "movq (%1, %0), %%mm0 \n\t"
  157. "movq %%mm0, (%0) \n\t"
  158. "movq %%mm0, (%0, %2) \n\t"
  159. "movq %%mm0, (%0, %2, 2) \n\t"
  160. "movq %%mm0, (%0, %3) \n\t"
  161. "add $8, %0 \n\t"
  162. "cmp %4, %0 \n\t"
  163. "jb 1b \n\t"
  164. : "+r" (ptr)
  165. : "r" ((x86_reg) last_line - (x86_reg) ptr - w),
  166. "r" ((x86_reg) wrap), "r" ((x86_reg) wrap * 3),
  167. "r" (ptr + width + 2 * w));
  168. }
  169. }
  170. }
  171. #endif /* HAVE_INLINE_ASM */
  172. av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
  173. AVCodecContext *avctx)
  174. {
  175. int cpu_flags = av_get_cpu_flags();
  176. if (EXTERNAL_MMX(cpu_flags)) {
  177. c->pix_sum = ff_pix_sum16_mmx;
  178. c->pix_norm1 = ff_pix_norm1_mmx;
  179. }
  180. #if HAVE_INLINE_ASM
  181. if (INLINE_MMX(cpu_flags)) {
  182. if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
  183. c->try_8x8basis = try_8x8basis_mmx;
  184. }
  185. c->add_8x8basis = add_8x8basis_mmx;
  186. if (avctx->bits_per_raw_sample <= 8) {
  187. c->draw_edges = draw_edges_mmx;
  188. }
  189. }
  190. if (INLINE_AMD3DNOW(cpu_flags)) {
  191. if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
  192. c->try_8x8basis = try_8x8basis_3dnow;
  193. }
  194. c->add_8x8basis = add_8x8basis_3dnow;
  195. }
  196. #if HAVE_SSSE3_INLINE
  197. if (INLINE_SSSE3(cpu_flags)) {
  198. if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
  199. c->try_8x8basis = try_8x8basis_ssse3;
  200. }
  201. c->add_8x8basis = add_8x8basis_ssse3;
  202. }
  203. #endif /* HAVE_SSSE3_INLINE */
  204. #endif /* HAVE_INLINE_ASM */
  205. }