You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

184 lines
6.1KB

  1. /*
  2. * Copyright (C) 2009 Loren Merritt <lorenm@u.washington.edu>
  3. *
  4. * This file is part of Libav.
  5. *
  6. * Libav is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * Libav is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with Libav; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "libavutil/attributes.h"
  21. #include "libavutil/cpu.h"
  22. #include "libavutil/x86_cpu.h"
  23. #include "libavfilter/gradfun.h"
/* Eight 16-bit words of 0x7F. Passed as a memory operand to the filter_line
 * asm below, where it is subtracted from the scaled absolute delta. */
DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F};
/* Eight 16-bit words of 0xFF. Used by the blur_line asm below as a mask that
 * keeps only the low byte of every word. */
DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF};
  26. #if HAVE_MMX2
  27. static void gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
  28. {
  29. intptr_t x;
  30. if (width & 3) {
  31. x = width & ~3;
  32. ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
  33. width = x;
  34. }
  35. x = -width;
  36. __asm__ volatile(
  37. "movd %4, %%mm5 \n"
  38. "pxor %%mm7, %%mm7 \n"
  39. "pshufw $0, %%mm5, %%mm5 \n"
  40. "movq %6, %%mm6 \n"
  41. "movq %5, %%mm4 \n"
  42. "1: \n"
  43. "movd (%2,%0), %%mm0 \n"
  44. "movd (%3,%0), %%mm1 \n"
  45. "punpcklbw %%mm7, %%mm0 \n"
  46. "punpcklwd %%mm1, %%mm1 \n"
  47. "psllw $7, %%mm0 \n"
  48. "pxor %%mm2, %%mm2 \n"
  49. "psubw %%mm0, %%mm1 \n" // delta = dc - pix
  50. "psubw %%mm1, %%mm2 \n"
  51. "pmaxsw %%mm1, %%mm2 \n"
  52. "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16
  53. "psubw %%mm6, %%mm2 \n"
  54. "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m)
  55. "pmullw %%mm2, %%mm2 \n"
  56. "paddw %%mm4, %%mm0 \n" // pix += dither
  57. "pmulhw %%mm2, %%mm1 \n"
  58. "psllw $2, %%mm1 \n" // m = m*m*delta >> 14
  59. "paddw %%mm1, %%mm0 \n" // pix += m
  60. "psraw $7, %%mm0 \n"
  61. "packuswb %%mm0, %%mm0 \n"
  62. "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7)
  63. "add $4, %0 \n"
  64. "jl 1b \n"
  65. "emms \n"
  66. :"+r"(x)
  67. :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
  68. "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
  69. :"memory"
  70. );
  71. }
  72. #endif
  73. #if HAVE_SSSE3
  74. static void gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers)
  75. {
  76. intptr_t x;
  77. if (width & 7) {
  78. // could be 10% faster if I somehow eliminated this
  79. x = width & ~7;
  80. ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers);
  81. width = x;
  82. }
  83. x = -width;
  84. __asm__ volatile(
  85. "movd %4, %%xmm5 \n"
  86. "pxor %%xmm7, %%xmm7 \n"
  87. "pshuflw $0,%%xmm5, %%xmm5 \n"
  88. "movdqa %6, %%xmm6 \n"
  89. "punpcklqdq %%xmm5, %%xmm5 \n"
  90. "movdqa %5, %%xmm4 \n"
  91. "1: \n"
  92. "movq (%2,%0), %%xmm0 \n"
  93. "movq (%3,%0), %%xmm1 \n"
  94. "punpcklbw %%xmm7, %%xmm0 \n"
  95. "punpcklwd %%xmm1, %%xmm1 \n"
  96. "psllw $7, %%xmm0 \n"
  97. "psubw %%xmm0, %%xmm1 \n" // delta = dc - pix
  98. "pabsw %%xmm1, %%xmm2 \n"
  99. "pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16
  100. "psubw %%xmm6, %%xmm2 \n"
  101. "pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m)
  102. "pmullw %%xmm2, %%xmm2 \n"
  103. "psllw $1, %%xmm2 \n"
  104. "paddw %%xmm4, %%xmm0 \n" // pix += dither
  105. "pmulhrsw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14
  106. "paddw %%xmm1, %%xmm0 \n" // pix += m
  107. "psraw $7, %%xmm0 \n"
  108. "packuswb %%xmm0, %%xmm0 \n"
  109. "movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7)
  110. "add $8, %0 \n"
  111. "jl 1b \n"
  112. :"+&r"(x)
  113. :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
  114. "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
  115. :"memory"
  116. );
  117. }
  118. #endif // HAVE_SSSE3
  119. #if HAVE_SSE
  120. static void gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width)
  121. {
  122. #define BLURV(load)\
  123. intptr_t x = -2*width;\
  124. __asm__ volatile(\
  125. "movdqa %6, %%xmm7 \n"\
  126. "1: \n"\
  127. load" (%4,%0), %%xmm0 \n"\
  128. load" (%5,%0), %%xmm1 \n"\
  129. "movdqa %%xmm0, %%xmm2 \n"\
  130. "movdqa %%xmm1, %%xmm3 \n"\
  131. "psrlw $8, %%xmm0 \n"\
  132. "psrlw $8, %%xmm1 \n"\
  133. "pand %%xmm7, %%xmm2 \n"\
  134. "pand %%xmm7, %%xmm3 \n"\
  135. "paddw %%xmm1, %%xmm0 \n"\
  136. "paddw %%xmm3, %%xmm2 \n"\
  137. "paddw %%xmm2, %%xmm0 \n"\
  138. "paddw (%2,%0), %%xmm0 \n"\
  139. "movdqa (%1,%0), %%xmm1 \n"\
  140. "movdqa %%xmm0, (%1,%0) \n"\
  141. "psubw %%xmm1, %%xmm0 \n"\
  142. "movdqa %%xmm0, (%3,%0) \n"\
  143. "add $16, %0 \n"\
  144. "jl 1b \n"\
  145. :"+&r"(x)\
  146. :"r"(buf+width),\
  147. "r"(buf1+width),\
  148. "r"(dc+width),\
  149. "r"(src+width*2),\
  150. "r"(src+width*2+src_linesize),\
  151. "m"(*pw_ff)\
  152. :"memory"\
  153. );
  154. if (((intptr_t) src | src_linesize) & 15) {
  155. BLURV("movdqu");
  156. } else {
  157. BLURV("movdqa");
  158. }
  159. }
  160. #endif // HAVE_SSE
  161. av_cold void ff_gradfun_init_x86(GradFunContext *gf)
  162. {
  163. int cpu_flags = av_get_cpu_flags();
  164. #if HAVE_MMX2
  165. if (cpu_flags & AV_CPU_FLAG_MMX2)
  166. gf->filter_line = gradfun_filter_line_mmx2;
  167. #endif
  168. #if HAVE_SSSE3
  169. if (cpu_flags & AV_CPU_FLAG_SSSE3)
  170. gf->filter_line = gradfun_filter_line_ssse3;
  171. #endif
  172. #if HAVE_SSE
  173. if (cpu_flags & AV_CPU_FLAG_SSE2)
  174. gf->blur_line = gradfun_blur_line_sse2;
  175. #endif
  176. }