/*
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * SSSE3 optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name.
 * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function.
 * AVG_OP must be defined to empty for put and to the identity for avg.
 */
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
{
    if(y==0 && x==0) {
        /* no filter needed */
        H264_CHROMA_MC8_MV0(dst, src, stride, h);
        return;
    }

    assert(x<8 && y<8 && x>=0 && y>=0);

    if(y==0 || x==0)
    {
        /* 1 dimensional filter only */
        /* pack the two filter taps (8-d, d), d = x+y, into every byte pair of
         * xmm7 for pmaddubsw; xmm6 holds the rounding constant */
        __asm__ volatile(
            "movd %0, %%xmm7 \n\t"
            "movq %1, %%xmm6 \n\t"
            "pshuflw $0, %%xmm7, %%xmm7 \n\t"
            "movlhps %%xmm6, %%xmm6 \n\t"
            "movlhps %%xmm7, %%xmm7 \n\t"
            :: "r"(255*(x+y)+8), "m"(*(rnd?&ff_pw_4:&ff_pw_3))
        );

        if(x) {
            /* horizontal only: blend src[i] with src[i+1], two rows per pass */
            __asm__ volatile(
                "1: \n\t"
                "movq (%1), %%xmm0 \n\t"
                "movq 1(%1), %%xmm1 \n\t"
                "movq (%1,%3), %%xmm2 \n\t"
                "movq 1(%1,%3), %%xmm3 \n\t"
                "punpcklbw %%xmm1, %%xmm0 \n\t"
                "punpcklbw %%xmm3, %%xmm2 \n\t"
                "pmaddubsw %%xmm7, %%xmm0 \n\t"
                "pmaddubsw %%xmm7, %%xmm2 \n\t"
                AVG_OP("movq (%0), %%xmm4 \n\t")
                AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm2 \n\t"
                "psrlw $3, %%xmm0 \n\t"
                "psrlw $3, %%xmm2 \n\t"
                "packuswb %%xmm2, %%xmm0 \n\t"
                AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
                "movq %%xmm0, (%0) \n\t"
                "movhps %%xmm0, (%0,%3) \n\t"
                "sub $2, %2 \n\t"
                "lea (%1,%3,2), %1 \n\t"
                "lea (%0,%3,2), %0 \n\t"
                "jg 1b \n\t"
                :"+r"(dst), "+r"(src), "+r"(h)
                :"r"((x86_reg)stride)
            );
        } else {
            /* vertical only: blend each row with the row below, two rows per pass */
            __asm__ volatile(
                "1: \n\t"
                "movq (%1), %%xmm0 \n\t"
                "movq (%1,%3), %%xmm1 \n\t"
                "movdqa %%xmm1, %%xmm2 \n\t"
                "movq (%1,%3,2), %%xmm3 \n\t"
                "punpcklbw %%xmm1, %%xmm0 \n\t"
                "punpcklbw %%xmm3, %%xmm2 \n\t"
                "pmaddubsw %%xmm7, %%xmm0 \n\t"
                "pmaddubsw %%xmm7, %%xmm2 \n\t"
                AVG_OP("movq (%0), %%xmm4 \n\t")
                AVG_OP("movhps (%0,%3), %%xmm4 \n\t")
                "paddw %%xmm6, %%xmm0 \n\t"
                "paddw %%xmm6, %%xmm2 \n\t"
                "psrlw $3, %%xmm0 \n\t"
                "psrlw $3, %%xmm2 \n\t"
                "packuswb %%xmm2, %%xmm0 \n\t"
                AVG_OP("pavgb %%xmm4, %%xmm0 \n\t")
                "movq %%xmm0, (%0) \n\t"
                "movhps %%xmm0, (%0,%3) \n\t"
                "sub $2, %2 \n\t"
                "lea (%1,%3,2), %1 \n\t"
                "lea (%0,%3,2), %0 \n\t"
                "jg 1b \n\t"
                :"+r"(dst), "+r"(src), "+r"(h)
                :"r"((x86_reg)stride)
            );
        }
        return;
    }

    /* general case, bilinear */
    /* pack the four bilinear weights for pmaddubsw:
     * xmm7 byte pairs = ((8-x)*(8-y), x*(8-y)), xmm6 byte pairs = ((8-x)*y, x*y);
     * xmm5 holds the rounding constant */
    __asm__ volatile(
        "movd %0, %%xmm7 \n\t"
        "movd %1, %%xmm6 \n\t"
        "movdqa %2, %%xmm5 \n\t"
        "pshuflw $0, %%xmm7, %%xmm7 \n\t"
        "pshuflw $0, %%xmm6, %%xmm6 \n\t"
        "movlhps %%xmm7, %%xmm7 \n\t"
        "movlhps %%xmm6, %%xmm6 \n\t"
        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(*(rnd?&ff_pw_32:&ff_pw_28))
    );

    /* prologue loads row 0; the loop then produces two output rows per
     * iteration, carrying the last interleaved source row over in xmm0 */
    __asm__ volatile(
        "movq (%1), %%xmm0 \n\t"
        "movq 1(%1), %%xmm1 \n\t"
        "punpcklbw %%xmm1, %%xmm0 \n\t"
        "add %3, %1 \n\t"
        "1: \n\t"
        "movq (%1), %%xmm1 \n\t"
        "movq 1(%1), %%xmm2 \n\t"
        "movq (%1,%3), %%xmm3 \n\t"
        "movq 1(%1,%3), %%xmm4 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "punpcklbw %%xmm2, %%xmm1 \n\t"
        "punpcklbw %%xmm4, %%xmm3 \n\t"
        "movdqa %%xmm1, %%xmm2 \n\t"
        "movdqa %%xmm3, %%xmm4 \n\t"
        "pmaddubsw %%xmm7, %%xmm0 \n\t"
        "pmaddubsw %%xmm6, %%xmm1 \n\t"
        "pmaddubsw %%xmm7, %%xmm2 \n\t"
        "pmaddubsw %%xmm6, %%xmm3 \n\t"
        "paddw %%xmm5, %%xmm0 \n\t"
        "paddw %%xmm5, %%xmm2 \n\t"
        "paddw %%xmm0, %%xmm1 \n\t"
        "paddw %%xmm2, %%xmm3 \n\t"
        "movdqa %%xmm4, %%xmm0 \n\t"
        "psrlw $6, %%xmm1 \n\t"
        "psrlw $6, %%xmm3 \n\t"
        AVG_OP("movq (%0), %%xmm2 \n\t")
        AVG_OP("movhps (%0,%3), %%xmm2 \n\t")
        "packuswb %%xmm3, %%xmm1 \n\t"
        AVG_OP("pavgb %%xmm2, %%xmm1 \n\t")
        "movq %%xmm1, (%0)\n\t"
        "movhps %%xmm1, (%0,%3)\n\t"
        "sub $2, %2 \n\t"
        "lea (%0,%3,2), %0 \n\t"
        "jg 1b \n\t"
        :"+r"(dst), "+r"(src), "+r"(h)
        :"r"((x86_reg)stride)
    );
}

/* 4-pixel-wide variant: same bilinear scheme as above, using MMX registers */
static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    __asm__ volatile(
        "movd %0, %%mm7 \n\t"
        "movd %1, %%mm6 \n\t"
        "movq %2, %%mm5 \n\t"
        "pshufw $0, %%mm7, %%mm7 \n\t"
        "pshufw $0, %%mm6, %%mm6 \n\t"
        :: "r"((x*255+8)*(8-y)), "r"((x*255+8)*y), "m"(ff_pw_32)
    );

    __asm__ volatile(
        "movd (%1), %%mm0 \n\t"
        "punpcklbw 1(%1), %%mm0 \n\t"
        "add %3, %1 \n\t"
        "1: \n\t"
        "movd (%1), %%mm1 \n\t"
        "movd (%1,%3), %%mm3 \n\t"
        "punpcklbw 1(%1), %%mm1 \n\t"
        "punpcklbw 1(%1,%3), %%mm3 \n\t"
        "lea (%1,%3,2), %1 \n\t"
        "movq %%mm1, %%mm2 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pmaddubsw %%mm7, %%mm0 \n\t"
        "pmaddubsw %%mm6, %%mm1 \n\t"
        "pmaddubsw %%mm7, %%mm2 \n\t"
        "pmaddubsw %%mm6, %%mm3 \n\t"
        "paddw %%mm5, %%mm0 \n\t"
        "paddw %%mm5, %%mm2 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm0 \n\t"
        "psrlw $6, %%mm1 \n\t"
        "psrlw $6, %%mm3 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        "packuswb %%mm3, %%mm3 \n\t"
        AVG_OP("pavgb (%0), %%mm1 \n\t")
        AVG_OP("pavgb (%0,%3), %%mm3 \n\t")
        "movd %%mm1, (%0)\n\t"
        "movd %%mm3, (%0,%3)\n\t"
        "sub $2, %2 \n\t"
        "lea (%0,%3,2), %0 \n\t"
        "jg 1b \n\t"
        :"+r"(dst), "+r"(src), "+r"(h)
        :"r"((x86_reg)stride)
    );
}
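
/*
 * Illustrative usage sketch (an assumption, not part of the original template):
 * one way the macros described at the top of this file could be defined to
 * generate the put and avg variants. The wrapper names, the put_pixels8_mmx /
 * avg_pixels8_mmx helpers and the include file name are hypothetical choices
 * for the example; the actual FFmpeg build instantiates the template from its
 * own x86 DSP setup code.
 */
#if 0
#define AVG_OP(X)                                     /* put: store directly */
#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0  put_pixels8_mmx
#include "dsputil_h264_template_ssse3.c"              /* hypothetical name for this file */
#undef AVG_OP
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0

#define AVG_OP(X) X                                   /* avg: emit the averaging loads/pavgb */
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0  avg_pixels8_mmx
#include "dsputil_h264_template_ssse3.c"
#endif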