;******************************************************************************
;* SIMD-optimized HuffYUV functions
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Christophe Gisquet
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text
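
; INT16_LOOP expands to the body of a masked 16-bit add or subtract. A
; scalar word loop first peels samples off the end until the remaining byte
; count is a multiple of 2*mmsize, then the main loop handles two registers
; per iteration. A scalar sketch of what the add case computes (uint16_t
; element type assumed from the word-sized accesses):
;
;     for (int i = 0; i < w; i++)
;         dst[i] = (dst[i] + src[i]) & mask;
;
; and the sub case:
;
;     for (int i = 0; i < w; i++)
;         dst[i] = (src1[i] - src2[i]) & mask;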
%macro INT16_LOOP 2 ; %1 = a/u (aligned/unaligned), %2 = add/sub
    movd    m4, maskd
    SPLATW  m4, m4
    add     wd, wd
    test    wq, 2*mmsize - 1
    jz %%.tomainloop
    push  tmpq
%%.wordloop:
    sub     wq, 2
%ifidn %2, add
    mov     tmpw, [srcq+wq]
    add     tmpw, [dstq+wq]
%else
    mov     tmpw, [src1q+wq]
    sub     tmpw, [src2q+wq]
%endif
    and     tmpw, maskw
    mov     [dstq+wq], tmpw
    test    wq, 2*mmsize - 1
    jnz %%.wordloop
    pop tmpq
%%.tomainloop:
%ifidn %2, add
    add     srcq, wq
%else
    add     src1q, wq
    add     src2q, wq
%endif
    add     dstq, wq
    neg     wq
    jz      %%.end
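; The base pointers were advanced past the end of each buffer and wq negated,
; so [base+wq] indexes from -width up to 0 and the loop needs no separate
; counter.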
%%.loop:
%ifidn %2, add
    mov%1   m0, [srcq+wq]
    mov%1   m1, [dstq+wq]
    mov%1   m2, [srcq+wq+mmsize]
    mov%1   m3, [dstq+wq+mmsize]
%else
    mov%1   m0, [src1q+wq]
    mov%1   m1, [src2q+wq]
    mov%1   m2, [src1q+wq+mmsize]
    mov%1   m3, [src2q+wq+mmsize]
%endif
    p%2w    m0, m1
    p%2w    m2, m3
    pand    m0, m4
    pand    m2, m4
    mov%1   [dstq+wq]       , m0
    mov%1   [dstq+wq+mmsize], m2
    add     wq, 2*mmsize
    jl %%.loop
%%.end:
    RET
%endmacro
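
; void add_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w)
; (prototype assumed from the word-sized accesses; mask clears the bits
; above the stream's bit depth). The SSE2 version below falls back to an
; unaligned variant of the loop when either pointer is not mmsize-aligned.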
%if ARCH_X86_32
INIT_MMX mmx
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    INT16_LOOP a, add
%endif

INIT_XMM sse2
cglobal add_int16, 4,4,5, dst, src, mask, w, tmp
    test srcq, mmsize-1
    jnz .unaligned
    test dstq, mmsize-1
    jnz .unaligned
    INT16_LOOP a, add
.unaligned:
    INT16_LOOP u, add

; void add_hfyu_left_pred_bgr32(uint8_t *dst, const uint8_t *src,
;                               intptr_t w, uint8_t *left)
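;
; This is a per-byte prefix sum over 4-byte BGRA pixels: every output pixel
; is the byte-wise (wrapping) sum of the current input pixel and the previous
; output pixel, seeded and finalized through *left. The loop builds the
; in-register prefix sum with log2 shift+add steps and keeps the previous
; iteration's last output pixel broadcast in m0 (punpckhdq for MMX, pshufd
; for SSE2) as the carry. A scalar sketch of the same recurrence:
;
;     for (int i = 0; i < 4 * w; i++)
;         dst[i] = src[i] + (i < 4 ? left[i] : dst[i - 4]); // paddb: mod 256
;     memcpy(left, dst + 4 * w - 4, 4);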
%macro LEFT_BGR32 0
cglobal add_hfyu_left_pred_bgr32, 4,4,3, dst, src, w, left
    shl           wq, 2
    movd          m0, [leftq]
    lea         dstq, [dstq + wq]
    lea         srcq, [srcq + wq]
    LSHIFT        m0, mmsize-4
    neg           wq
.loop:
    movu          m1, [srcq+wq]
    mova          m2, m1
%if mmsize == 8
    punpckhdq     m0, m0
%endif
    LSHIFT        m1, 4
    paddb         m1, m2
%if mmsize == 16
    pshufd        m0, m0, q3333
    mova          m2, m1
    LSHIFT        m1, 8
    paddb         m1, m2
%endif
    paddb         m0, m1
    movu          [dstq+wq], m0
    add           wq, mmsize
    jl         .loop
    movd          m0, [dstq-4]
    movd     [leftq], m0
    REP_RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
LEFT_BGR32
%endif

INIT_XMM sse2
LEFT_BGR32

; void add_hfyu_median_pred_int16(uint16_t *dst, const uint16_t *top,
;                                 const uint16_t *diff, unsigned mask,
;                                 int w, int *left, int *left_top)
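;
; The prediction is the median of l (left), t (top) and (l + t - tl) & mask,
; and the masked residual from diff is added to it; each output becomes the
; next sample's l. A scalar sketch per sample, borrowing FFmpeg's mid_pred()
; for the median of three:
;
;     pred = mid_pred(l, t, (l + t - tl) & mask);
;     dst[i] = l = (pred + diff[i]) & mask;
;     tl = t;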
INIT_MMX mmxext
cglobal add_hfyu_median_pred_int16, 7,7,0, dst, top, diff, mask, w, left, left_top
    add      wd, wd
    movd    mm6, maskd
    SPLATW  mm6, mm6
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 16
    movq    mm1, mm0 ; t
    por     mm4, mm2 ; tl for each of the 4 lanes
    movd    mm3, [leftq] ; l
    psubw   mm0, mm4 ; t-tl
    add    dstq, wq
    add    topq, wq
    add   diffq, wq
    neg      wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 16
    por     mm4, mm1 ; tl
    movq    mm1, mm0 ; t
    psubw   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
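; The four words in the register are processed serially: each lane's
; prediction needs the previous lane's output as its left neighbour, so the
; %rep peels one word per step and reassembles the outputs in mm7.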
%assign i 0
%rep 4
    movq    mm4, mm0
    paddw   mm4, mm3 ; t-tl+l
    pand    mm4, mm6
    movq    mm5, mm3
    pmaxsw  mm3, mm1
    pminsw  mm5, mm1
    pminsw  mm3, mm4
    pmaxsw  mm3, mm5 ; median
    paddw   mm3, mm2 ; +residual
    pand    mm3, mm6
%if i==0
    movq    mm7, mm3
    psllq   mm7, 48
%else
    movq    mm4, mm3
    psrlq   mm7, 16
    psllq   mm4, 48
    por     mm7, mm4
%endif
%if i<3
    psrlq   mm0, 16
    psrlq   mm1, 16
    psrlq   mm2, 16
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add      wq, 8
    jl .loop
    movzx   r2d, word [dstq-2]
    mov [leftq], r2d ; *left = last output sample
    movzx   r2d, word [topq-2]
    mov [left_topq], r2d ; *left_top = last top sample
    RET