You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

277 lines
6.2KB

  1. ;*****************************************************************************
  2. ;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2011 x264 project
  5. ;*
  6. ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  7. ;*
  8. ;* This file is part of Libav.
  9. ;*
  10. ;* Libav is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* Libav is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with Libav; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "libavutil/x86/x86util.asm"
  SECTION_RODATA 32

  ; Largest 10-bit sample value, replicated into eight words. Loaded into a
  ; vector register by the setup macros and used as the upper clamp bound.
  pw_pixel_max: times 8 dw (1 << 10) - 1

  ; 128-bit constant: low qword = 1, high qword = 0. The setup macros add it,
  ; as a full-width memory operand, to the register holding the shift count.
  sq_1:         dq 1, 0

  ; Defined outside this file; read as a full vector by WEIGHT_SETUP.
  cextern pw_1

  SECTION .text
  ;-----------------------------------------------------------------------------
  ; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
  ;                  int weight, int offset);
  ;-----------------------------------------------------------------------------
  ; WEIGHT_PROLOGUE: common entry sequence of the h264_weight_* functions.
  ;
  ; After it:  r0  = dst
  ;            r1  = stride, sign-extended to pointer width
  ;            r2d = height
  ;            r4d = weight
  ;            r5d = offset
  ; log2_denom (argument 3) is deliberately not loaded here: WEIGHT_SETUP reads
  ; it straight from r3m.
  ;
  ; PROLOGUE/movifnidn come from x86inc; presumably movifnidn only emits a move
  ; when the argument is not already in its register -- confirm in x86inc.asm.
  %macro WEIGHT_PROLOGUE 0
  .prologue:
      PROLOGUE   0, 6, 8
      movifnidn  r0,  r0mp
      movifnidn  r1d, r1m
  %if ARCH_X86_64
      ; stride is an int, but it is later added to the 64-bit dst pointer
      ; (add r0, r1 / lea r3, [r1*2]). The 64-bit calling conventions leave
      ; the upper half of a register carrying an int undefined, so widen it
      ; explicitly instead of trusting the caller.
      movsxd     r1,  r1d
  %endif
      movifnidn  r2d, r2m
      movifnidn  r4d, r4m
      movifnidn  r5d, r5m
  %endmacro
  ; WEIGHT_SETUP cpu: build the loop-invariant vectors used by WEIGHT_OP.
  ;
  ; In:   r3m = log2_denom, r4d = weight, r5d = offset
  ; Out:  m0 = 1<<log2_denom in every word
  ;       m2 = log2_denom+1 (shift count for psrad)
  ;       m3 = every dword holds { low word: weight*2,
  ;                                high word: 1 + (offset<<3) }
  ;       m4 = pw_pixel_max (upper clamp)
  ;       m7 = 0 (lower clamp; only when cpu is not sse4)
  ; Clobbers: r4, r5, flags.
  ;
  ; WEIGHT_OP interleaves each pixel with m0 and multiplies by m3 with pmaddwd,
  ; giving  pixel*(2*weight) + (1<<log2_denom)*(1 + 8*offset)  per dword, which
  ; is then shifted right by log2_denom+1.
  %macro WEIGHT_SETUP 1
      mova       m0, [pw_1]
      movd       m2, r3m
      pslld      m0, m2               ; 1<<log2_denom
      SPLATW     m0, m0
      shl        r5, 19               ; offset*8, moved to the upper half of the dword
      ; The low word must contain exactly the 16-bit two's complement of
      ; weight*2. Adding the full 32-bit value would, for a negative weight,
      ; also add 0xFFFF to the upper word and cancel the +1 rounding term.
      add        r4d, r4d             ; weight*2
      movzx      r4d, r4w             ; drop the sign bits above bit 15
      lea        r5, [r5+r4+0x10000]  ; 0x10000 = the "+1" in the upper word
      movd       m3, r5d              ; weight<<1 | (1+(offset<<3))<<16
      pshufd     m3, m3, 0
      mova       m4, [pw_pixel_max]
      paddw      m2, [sq_1]           ; log2_denom+1
  %ifnidn %1, sse4
      pxor       m7, m7               ; zero for the CLIPW lower bound
  %endif
  %endmacro
  ; WEIGHT_OP cpu, off          : weight one full vector of pixels at [r0+off]
  ; WEIGHT_OP cpu, off_a, off_b : weight two 64-bit pixel groups, the first at
  ;                               [r0+off_a] and the second at [r0+off_b]
  ;
  ; Expects the registers prepared by WEIGHT_SETUP (m0, m2, m3, m4 and, for the
  ; non-sse4 path, m7). The packed, clamped result is left in m5; m6 is scratch.
  %macro WEIGHT_OP 2-3
  %if %0 == 2
      ; One contiguous load, split into low/high halves while interleaving
      ; every pixel word with the matching word of m0.
      mova       m5, [r0+%2]
      punpckhwd  m6, m5, m0
      punpcklwd  m5, m0
  %else
      ; Two separate 64-bit loads, each interleaved with m0.
      movq       m5, [r0+%2]
      punpcklwd  m5, m0
      movq       m6, [r0+%3]
      punpcklwd  m6, m0
  %endif
      ; dword = pixel * m3.low_word + m0_word * m3.high_word
      pmaddwd    m5, m3
      pmaddwd    m6, m3
      psrad      m5, m2
      psrad      m6, m2
  %ifidn %1, sse4
      ; packusdw saturates negative dwords to 0, so only the upper bound
      ; still has to be applied.
      packusdw   m5, m6
      pminsw     m5, m4
  %else
      ; Signed pack, then CLIPW with m7 (zero) and m4 (pixel max) as bounds.
      packssdw   m5, m6
      CLIPW      m5, m7, m4
  %endif
  %endmacro
  ; WEIGHT_FUNC_DBL cpu: emits h264_weight_16_10_<cpu>.
  ; Every loop iteration weights one row in place as two vectors (byte offsets
  ; 0 and 16 from r0), then steps r0 by the stride in r1. r2d counts rows.
  %macro WEIGHT_FUNC_DBL 1
  cglobal h264_weight_16_10_%1
      WEIGHT_PROLOGUE
      WEIGHT_SETUP %1
  .nextrow:
      WEIGHT_OP  %1, 0
      mova       [r0], m5             ; first half of the row
      WEIGHT_OP  %1, 16
      mova       [r0+16], m5          ; second half of the row
      add        r0, r1               ; next row
      dec        r2d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  WEIGHT_FUNC_DBL sse2
  WEIGHT_FUNC_DBL sse4
  ; WEIGHT_FUNC_MM cpu: emits h264_weight_8_10_<cpu>.
  ; One vector per row is weighted in place; r0 walks the rows with the stride
  ; in r1 and r2d is the remaining row count.
  %macro WEIGHT_FUNC_MM 1
  cglobal h264_weight_8_10_%1
      WEIGHT_PROLOGUE
      WEIGHT_SETUP %1
  .nextrow:
      WEIGHT_OP  %1, 0
      mova       [r0], m5
      add        r0, r1               ; next row
      dec        r2d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  WEIGHT_FUNC_MM sse2
  WEIGHT_FUNC_MM sse4
  ; WEIGHT_FUNC_HALF_MM cpu: emits h264_weight_4_10_<cpu>.
  ; Two rows are handled per iteration: WEIGHT_OP loads 64 bits from [r0] and
  ; from [r0+r1], and the two halves of m5 are stored back to those rows.
  ; NOTE(review): the row count is halved with sar and the loop is dec/jnz, so
  ; the code relies on height being even and non-zero -- confirm with callers.
  %macro WEIGHT_FUNC_HALF_MM 1
  cglobal h264_weight_4_10_%1
      WEIGHT_PROLOGUE
      WEIGHT_SETUP %1
      ; r3 may only be reused once WEIGHT_SETUP has read log2_denom from r3m.
      lea        r3, [r1*2]           ; r3 = 2*stride
      sar        r2d, 1               ; iterations = height/2
  .nextrow:
      WEIGHT_OP  %1, 0, r1
      movh       [r0], m5             ; low half  -> first row
      movhps     [r0+r1], m5          ; high half -> second row
      add        r0, r3               ; advance two rows
      dec        r2d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  WEIGHT_FUNC_HALF_MM sse2
  WEIGHT_FUNC_HALF_MM sse4
  ;-----------------------------------------------------------------------------
  ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
  ;                    int log2_denom, int weightd, int weights, int offset);
  ;-----------------------------------------------------------------------------
  ; t0 is the temporary that receives `offset`. On x86-32 it aliases r3, which
  ; is why `height` (r3) is only loaded at the end of BIWEIGHT_SETUP, after t0
  ; has been consumed. On x86-64 it is r7.
  %if ARCH_X86_32
  DECLARE_REG_TMP 3
  %else
  DECLARE_REG_TMP 7
  %endif

  ; BIWEIGHT_PROLOGUE: common entry sequence of the h264_biweight_* functions.
  ;
  ; After it:  r0  = dst
  ;            r1  = src
  ;            r2  = stride, sign-extended to pointer width
  ;            r5d = weightd
  ;            r6d = weights
  ;            t0d = offset
  ; height (r3) and log2_denom (r4m) are picked up by BIWEIGHT_SETUP.
  %macro BIWEIGHT_PROLOGUE 0
  .prologue:
      PROLOGUE   0, 8, 8
      movifnidn  r0,  r0mp
      movifnidn  r1,  r1mp
      movifnidn  r2d, r2m
  %if ARCH_X86_64
      ; stride is an int that is later added to two 64-bit pointers
      ; (add r0, r2 / add r1, r2 / lea r4, [r2*2]). The upper half of the
      ; register is undefined on entry, so widen it explicitly.
      movsxd     r2,  r2d
  %endif
      movifnidn  r5d, r5m
      movifnidn  r6d, r6m
      movifnidn  t0d, r7m
  %endmacro
  ; BIWEIGHT_SETUP cpu: build the loop-invariant vectors used by BIWEIGHT.
  ;
  ; In:   r5d = weightd, r6d = weights, t0d = offset, r4m = log2_denom,
  ;       r3m = height
  ; Out:  m3 = pw_pixel_max (upper clamp)
  ;       m4 = every dword holds { low word: weightd, high word: weights }
  ;       m5 = (((offset<<2)+1)|1)<<log2_denom in every dword (added before
  ;            the shift)
  ;       m6 = log2_denom+1 (shift count for psrad)
  ;       m7 = 0 (lower clamp; only when cpu is not sse4)
  ;       r3d = height
  ; Clobbers: r5, r6, t0, flags.
  %macro BIWEIGHT_SETUP 1
      lea        t0, [t0*4+1]         ; (offset<<2)+1
      or         t0, 1
      shl        r6, 16               ; weights -> upper word
      ; weightd must occupy the low word only. Without truncation a negative
      ; weightd has all-ones in bits 16..31, and the OR below would turn the
      ; weights word into 0xFFFF (-1) regardless of its real value.
      movzx      r5d, r5w
      or         r5, r6
      movd       m4, r5d              ; weightd | weights<<16
      movd       m5, t0d              ; ((offset<<2)+1)|1
      movd       m6, r4m              ; log2_denom
      pslld      m5, m6               ; (((offset<<2)+1)|1)<<log2_denom
      paddd      m6, [sq_1]           ; log2_denom+1
      pshufd     m4, m4, 0
      pshufd     m5, m5, 0
      mova       m3, [pw_pixel_max]
      ; Load height last: on x86-32 r3 is the same register as t0, which is
      ; no longer needed once m5 has been built.
      movifnidn  r3d, r3m
  %ifnidn %1, sse4
      pxor       m7, m7               ; zero for the CLIPW lower bound
  %endif
  %endmacro
  ; BIWEIGHT cpu, off          : blend one full vector from [r0+off] (dst) and
  ;                              [r1+off] (src)
  ; BIWEIGHT cpu, off_a, off_b : blend two 64-bit groups, taken at off_a and
  ;                              off_b from both r0 and r1
  ;
  ; Expects the registers prepared by BIWEIGHT_SETUP (m3, m4, m5, m6 and, for
  ; the non-sse4 path, m7). The packed, clamped result is left in m0; m1 and m2
  ; are scratch.
  %macro BIWEIGHT 2-3
  %if %0 == 2
      ; Interleave dst and src words so that each dword is {dst, src}.
      mova       m0, [r0+%2]
      mova       m1, [r1+%2]
      punpckhwd  m2, m0, m1
      punpcklwd  m0, m1
  %else
      ; First 64-bit group ...
      movq       m0, [r0+%2]
      movq       m1, [r1+%2]
      punpcklwd  m0, m1
      ; ... and second group; m1 is reused once the first interleave is done.
      movq       m2, [r0+%3]
      movq       m1, [r1+%3]
      punpcklwd  m2, m1
  %endif
      ; dword = dst * m4.low_word + src * m4.high_word
      pmaddwd    m0, m4
      pmaddwd    m2, m4
      ; add the pre-shifted rounding/offset term, then scale down
      paddd      m0, m5
      paddd      m2, m5
      psrad      m0, m6
      psrad      m2, m6
  %ifidn %1, sse4
      ; packusdw saturates negative dwords to 0, so only the upper bound
      ; still has to be applied.
      packusdw   m0, m2
      pminsw     m0, m3
  %else
      ; Signed pack, then CLIPW with m7 (zero) and m3 (pixel max) as bounds.
      packssdw   m0, m2
      CLIPW      m0, m7, m3
  %endif
  %endmacro
  ; BIWEIGHT_FUNC_DBL cpu: emits h264_biweight_16_10_<cpu>.
  ; Each iteration blends one row as two vectors (byte offsets 0 and 16) and
  ; writes the result to dst. r0/r1 both step by the stride in r2; r3d counts
  ; the rows.
  %macro BIWEIGHT_FUNC_DBL 1
  cglobal h264_biweight_16_10_%1
      BIWEIGHT_PROLOGUE
      BIWEIGHT_SETUP %1
  .nextrow:
      BIWEIGHT   %1, 0
      mova       [r0], m0             ; first half of the row
      BIWEIGHT   %1, 16
      mova       [r0+16], m0          ; second half of the row
      add        r1, r2               ; src: next row
      add        r0, r2               ; dst: next row
      dec        r3d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  BIWEIGHT_FUNC_DBL sse2
  BIWEIGHT_FUNC_DBL sse4
  ; BIWEIGHT_FUNC cpu: emits h264_biweight_8_10_<cpu>.
  ; One vector per row is blended from dst (r0) and src (r1) and stored to dst;
  ; both pointers advance by the stride in r2, r3d is the remaining row count.
  %macro BIWEIGHT_FUNC 1
  cglobal h264_biweight_8_10_%1
      BIWEIGHT_PROLOGUE
      BIWEIGHT_SETUP %1
  .nextrow:
      BIWEIGHT   %1, 0
      mova       [r0], m0
      add        r1, r2               ; src: next row
      add        r0, r2               ; dst: next row
      dec        r3d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  BIWEIGHT_FUNC sse2
  BIWEIGHT_FUNC sse4
  ; BIWEIGHT_FUNC_HALF cpu: emits h264_biweight_4_10_<cpu>.
  ; Two rows are handled per iteration: BIWEIGHT loads 64 bits at offsets 0 and
  ; r2 from both dst and src, and the two halves of m0 go back to those dst rows.
  ; NOTE(review): the row count is halved with sar and the loop is dec/jnz, so
  ; the code relies on height being even and non-zero -- confirm with callers.
  %macro BIWEIGHT_FUNC_HALF 1
  cglobal h264_biweight_4_10_%1
      BIWEIGHT_PROLOGUE
      BIWEIGHT_SETUP %1
      ; r4 may only be reused once BIWEIGHT_SETUP has read log2_denom from r4m.
      lea        r4, [r2*2]           ; r4 = 2*stride
      sar        r3d, 1               ; iterations = height/2
  .nextrow:
      BIWEIGHT   %1, 0, r2
      movh       [r0], m0             ; low half  -> first row
      movhps     [r0+r2], m0          ; high half -> second row
      add        r1, r4               ; src: advance two rows
      add        r0, r4               ; dst: advance two rows
      dec        r3d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  BIWEIGHT_FUNC_HALF sse2
  BIWEIGHT_FUNC_HALF sse4
  249. BIWEIGHT_FUNC_HALF sse4