You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

284 lines
6.2KB

  1. ;*****************************************************************************
  2. ;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2011 x264 project
  5. ;*
  6. ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  7. ;*
  8. ;* This file is part of Libav.
  9. ;*
  10. ;* Libav is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* Libav is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with Libav; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "libavutil/x86/x86util.asm"
  25. SECTION_RODATA 32
  26. pw_pixel_max: times 8 dw ((1 << 10)-1) ; eight words of 1023, used as the upper clip bound for 10-bit samples
  27. sq_1: dq 1 ; 128-bit value 1 (low qword 1, high qword 0 on the next line); added to the shift-count register
  28. dq 0
  29. cextern pw_1 ; defined in another object; WEIGHT_SETUP shifts it left to build 1<<log2_denom (presumably words of 1 -- confirm)
  30. SECTION .text
  31. ;-----------------------------------------------------------------------------
  32. ; void ff_h264_weight_16_10(uint8_t *dst, int stride, int height,
  33. ; int log2_denom, int weight, int offset);
  34. ;-----------------------------------------------------------------------------
  35. %macro WEIGHT_PROLOGUE 0
      ; Common entry code for the h264_weight_*_10 functions.
      ; PROLOGUE 0,6,8 requests (x86inc convention) no auto-loaded arguments,
      ; 6 general-purpose registers and 8 vector registers; the arguments are
      ; then fetched by hand with movifnidn, which is skipped when the argument
      ; already lives in the target register.
      ; Loaded here (names from the prototype comment above this macro):
      ;   r0 = dst, r1d = stride, r2d = height, r4d = weight, r5d = offset
      ; Argument 3 (log2_denom) is not loaded; WEIGHT_SETUP reads it from r3m.
      ; NOTE(review): only the 32-bit halves r1d/r2d are loaded, yet the row
      ; loops advance the pointer with a full-width "add r0, r1" -- confirm the
      ; upper half of r1 is well defined on 64-bit targets.
  36. .prologue:
  37. PROLOGUE 0,6,8
  38. movifnidn r0, r0mp
  39. movifnidn r1d, r1m
  40. movifnidn r2d, r2m
  41. movifnidn r4d, r4m
  42. movifnidn r5d, r5m
  43. %endmacro
  44. %macro WEIGHT_SETUP 0
      ; Builds the loop-invariant vectors consumed by WEIGHT_OP.
      ; In:  r3m = log2_denom, r4d = weight, r5d = offset
      ; Out: m0 = words of 1<<log2_denom
      ;      m2 = shift count, log2_denom+1
      ;      m3 = dwords with low word weight<<1 and high word (offset<<3)+1
      ;      m4 = words of the 10-bit pixel maximum
      ;      m7 = zero, lower clip bound (only when SSE4 is not available)
      ; Clobbers r4, r5 and the flags.
  45. mova m0, [pw_1]
  46. movd m2, r3m
  47. pslld m0, m2 ; 1<<log2_denom
  48. SPLATW m0, m0
      add r4d, r4d ; weight<<1
      movzx r4d, r4w ; keep 16 bits only: a sign-extended negative weight would
      ; otherwise borrow from the upper word below and cancel the +1 rounding term
  49. shl r5, 19 ; *8, move to upper half of dword
  50. lea r5, [r5+r4+0x10000] ; + weight<<1 in the low word, +1 in the high word
  51. movd m3, r5d ; weight<<1 | 1+(offset<<(3))
  52. pshufd m3, m3, 0 ; broadcast the dword to all four lanes
  53. mova m4, [pw_pixel_max]
  54. paddw m2, [sq_1] ; log2_denom+1
  55. %if notcpuflag(sse4)
  56. pxor m7, m7 ; zero vector for CLIPW in the non-SSE4 path of WEIGHT_OP
  57. %endif
  58. %endmacro
  59. %macro WEIGHT_OP 1-2
      ; Weights eight 16-bit samples addressed through r0 and leaves the clipped
      ; result in m5 (m6 is scratch). The arguments are byte offsets from r0:
      ;   one offset  -> a single 16-byte load with mova
      ;   two offsets -> two 8-byte loads with movq, one per offset; the samples
      ;                  of the first offset end up in the low half of m5
      ; Relies on the vectors prepared by WEIGHT_SETUP: m0, m2, m3, m4 and,
      ; without SSE4, m7.
  60. %if %0==1
  61. mova m5, [r0+%1]
  62. punpckhwd m6, m5, m0 ; upper four samples, each paired with a word of m0
  63. punpcklwd m5, m0 ; lower four samples, each paired with a word of m0
  64. %else
  65. movq m5, [r0+%1]
  66. movq m6, [r0+%2]
  67. punpcklwd m5, m0 ; four samples of the first offset, paired with m0
  68. punpcklwd m6, m0 ; four samples of the second offset, paired with m0
  69. %endif
  70. pmaddwd m5, m3 ; per dword: sample*low_word(m3) + m0_word*high_word(m3)
  71. pmaddwd m6, m3
  72. psrad m5, m2 ; arithmetic shift right by the count held in m2
  73. psrad m6, m2
  74. %if cpuflag(sse4)
  75. packusdw m5, m6 ; unsigned saturation: negatives become 0, words may reach 0xffff
  76. pminuw m5, m4 ; unsigned min: a signed min would let words >= 0x8000 through unclipped
  77. %else
  78. packssdw m5, m6 ; signed saturation
  79. CLIPW m5, m7, m4 ; clamp between m7 and m4
  80. %endif
  81. %endmacro
  82. %macro WEIGHT_FUNC_DBL 0
      ; Emits h264_weight_16_10 for the instruction set selected by the
      ; preceding INIT_XMM. Each loop iteration weights one row in place: two
      ; 16-byte vectors at byte offsets 0 and 16 from r0.
  83. cglobal h264_weight_16_10
  84. WEIGHT_PROLOGUE
  85. WEIGHT_SETUP
  86. .nextrow:
  87. WEIGHT_OP 0
  88. mova [r0 ], m5
  89. WEIGHT_OP 16
  90. mova [r0+16], m5
  91. add r0, r1 ; step to the next row (r1 = stride)
  92. dec r2d ; rows left; tested after the body, so the loop always runs at least once
  93. jnz .nextrow
  94. REP_RET
  95. %endmacro
      ; Instantiate the function once per instruction set.
  96. INIT_XMM sse2
  97. WEIGHT_FUNC_DBL
  98. INIT_XMM sse4
  99. WEIGHT_FUNC_DBL
  100. %macro WEIGHT_FUNC_MM 0
       ; Emits h264_weight_8_10: one 16-byte vector (eight words) is weighted in
       ; place per row. Despite the MM in the macro name, both instantiations
       ; below select XMM registers through INIT_XMM.
  101. cglobal h264_weight_8_10
  102. WEIGHT_PROLOGUE
  103. WEIGHT_SETUP
  104. .nextrow:
  105. WEIGHT_OP 0
  106. mova [r0], m5
  107. add r0, r1 ; step to the next row (r1 = stride)
  108. dec r2d ; rows left; tested after the body, so the loop always runs at least once
  109. jnz .nextrow
  110. REP_RET
  111. %endmacro
       ; Instantiate the function once per instruction set.
  112. INIT_XMM sse2
  113. WEIGHT_FUNC_MM
  114. INIT_XMM sse4
  115. WEIGHT_FUNC_MM
  116. %macro WEIGHT_FUNC_HALF_MM 0
       ; Emits h264_weight_4_10: every iteration handles two rows at once, each
       ; contributing 8 bytes (four words) to a single vector.
  117. cglobal h264_weight_4_10
  118. WEIGHT_PROLOGUE
  119. sar r2d, 1 ; two rows per iteration, so halve the row count
  120. WEIGHT_SETUP
  121. lea r3, [r1*2] ; r3 = 2*stride; log2_denom was read from r3m by WEIGHT_SETUP already
  122. .nextrow:
  123. WEIGHT_OP 0, r1 ; rows at r0 and r0+stride
  124. movh [r0], m5 ; low 8 bytes back to the first row
  125. movhps [r0+r1], m5 ; high 8 bytes back to the second row
  126. add r0, r3
  127. dec r2d ; tested after the body, so the loop always runs at least once
  128. jnz .nextrow
  129. REP_RET
  130. %endmacro
       ; Instantiate the function once per instruction set.
  131. INIT_XMM sse2
  132. WEIGHT_FUNC_HALF_MM
  133. INIT_XMM sse4
  134. WEIGHT_FUNC_HALF_MM
  135. ;-----------------------------------------------------------------------------
  136. ; void ff_h264_biweight_16_10(uint8_t *dst, uint8_t *src, int stride,
  137. ; int height, int log2_denom, int weightd,
  138. ; int weights, int offset);
  139. ;-----------------------------------------------------------------------------
  140. %if ARCH_X86_32
       ; t0 is the scratch register that receives the eighth argument in
       ; BIWEIGHT_PROLOGUE: it aliases r3 on 32-bit x86 and r7 everywhere else.
  141. DECLARE_REG_TMP 3
  142. %else
  143. DECLARE_REG_TMP 7
  144. %endif
  145. %macro BIWEIGHT_PROLOGUE 0
       ; Common entry code for the h264_biweight_*_10 functions.
       ; PROLOGUE 0,8,8 requests (x86inc convention) no auto-loaded arguments,
       ; 8 general-purpose registers and 8 vector registers.
       ; Loaded here (names from the prototype comment above):
       ;   r0 = dst, r1 = src, r2d = stride, r5d = weightd, r6d = weights,
       ;   t0d = offset (argument 7)
       ; height (argument 3) and log2_denom (argument 4) are not loaded here:
       ; BIWEIGHT_SETUP reads log2_denom from r4m and loads r3d only after t0
       ; has been consumed, which matters where t0 aliases r3.
       ; NOTE(review): stride is loaded as r2d but the row loops use a full-width
       ; "add r0, r2" -- confirm the upper half of r2 is well defined on 64-bit
       ; targets.
  146. .prologue:
  147. PROLOGUE 0,8,8
  148. movifnidn r0, r0mp
  149. movifnidn r1, r1mp
  150. movifnidn r2d, r2m
  151. movifnidn r5d, r5m
  152. movifnidn r6d, r6m
  153. movifnidn t0d, r7m
  154. %endmacro
  155. %macro BIWEIGHT_SETUP 0
       ; Builds the loop-invariant vectors consumed by BIWEIGHT.
       ; In:  r4m = log2_denom, r5d = weightd, r6d = weights, t0d = offset
       ; Out: m4 = dwords with low word weightd and high word weights
       ;      m5 = dwords of (((offset<<2)+1)|1)<<log2_denom
       ;      m6 = shift count, log2_denom+1
       ;      m3 = words of the 10-bit pixel maximum
       ;      m7 = zero, lower clip bound (only when SSE4 is not available)
       ;      r3d = height (loaded last because t0 may alias r3)
       ; Clobbers r5, r6, t0 and the flags.
  156. lea t0, [t0*4+1] ; (offset<<2)+1
  157. or t0, 1 ; bit 0 is already set by the +1 above; kept as in the original sequence
       movzx r5d, r5w ; keep only the low word of weightd: the sign-extension bits of a
       ; negative weightd would otherwise be OR-ed over weights in the high word
  158. shl r6, 16
  159. or r5, r6
  160. movd m4, r5d ; weightd | weights<<16
  161. movd m5, t0d ; ((offset<<2)+1)|1
  162. movd m6, r4m ; log2_denom
  163. pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
  164. paddd m6, [sq_1] ; log2_denom+1
  165. pshufd m4, m4, 0 ; broadcast the weight pair to all four lanes
  166. pshufd m5, m5, 0 ; broadcast the rounding/offset term to all four lanes
  167. mova m3, [pw_pixel_max]
  168. movifnidn r3d, r3m ; height; t0 is no longer needed from here on
  169. %if notcpuflag(sse4)
  170. pxor m7, m7 ; zero vector for CLIPW in the non-SSE4 path of BIWEIGHT
  171. %endif
  172. %endmacro
  173. %macro BIWEIGHT 1-2
       ; Blends eight 16-bit samples read through r0 with the eight samples at
       ; the same offsets from r1 and leaves the clipped result in m0 (m1 and m2
       ; are scratch). The arguments are byte offsets applied to both pointers:
       ;   one offset  -> a single 16-byte load per pointer with mova
       ;   two offsets -> two 8-byte loads per pointer with movq; the samples of
       ;                  the first offset end up in the low half of m0
       ; Relies on the vectors prepared by BIWEIGHT_SETUP: m3, m4, m5, m6 and,
       ; without SSE4, m7.
  174. %if %0==1
  175. mova m0, [r0+%1]
  176. mova m1, [r1+%1]
  177. punpckhwd m2, m0, m1 ; upper four (r0 sample, r1 sample) word pairs
  178. punpcklwd m0, m1 ; lower four (r0 sample, r1 sample) word pairs
  179. %else
  180. movq m0, [r0+%1]
  181. movq m1, [r1+%1]
  182. punpcklwd m0, m1 ; pairs for the first offset
  183. movq m2, [r0+%2]
  184. movq m1, [r1+%2]
  185. punpcklwd m2, m1 ; pairs for the second offset
  186. %endif
  187. pmaddwd m0, m4 ; per dword: r0_sample*low_word(m4) + r1_sample*high_word(m4)
  188. pmaddwd m2, m4
  189. paddd m0, m5 ; add the pre-shifted rounding/offset term
  190. paddd m2, m5
  191. psrad m0, m6 ; arithmetic shift right by the count held in m6
  192. psrad m2, m6
  193. %if cpuflag(sse4)
  194. packusdw m0, m2 ; unsigned saturation: negatives become 0, words may reach 0xffff
  195. pminuw m0, m3 ; unsigned min: a signed min would let words >= 0x8000 through unclipped
  196. %else
  197. packssdw m0, m2 ; signed saturation
  198. CLIPW m0, m7, m3 ; clamp between m7 and m3
  199. %endif
  200. %endmacro
  201. %macro BIWEIGHT_FUNC_DBL 0
       ; Emits h264_biweight_16_10 for the instruction set selected by the
       ; preceding INIT_XMM. Each iteration blends one row: two 16-byte vectors
       ; at byte offsets 0 and 16, read from r0 and r1 and written back to r0.
  202. cglobal h264_biweight_16_10
  203. BIWEIGHT_PROLOGUE
  204. BIWEIGHT_SETUP
  205. .nextrow:
  206. BIWEIGHT 0
  207. mova [r0 ], m0
  208. BIWEIGHT 16
  209. mova [r0+16], m0
  210. add r0, r2 ; both pointers advance by the same stride (r2)
  211. add r1, r2
  212. dec r3d ; rows left; tested after the body, so the loop always runs at least once
  213. jnz .nextrow
  214. REP_RET
  215. %endmacro
       ; Instantiate the function once per instruction set.
  216. INIT_XMM sse2
  217. BIWEIGHT_FUNC_DBL
  218. INIT_XMM sse4
  219. BIWEIGHT_FUNC_DBL
  220. %macro BIWEIGHT_FUNC 0
       ; Emits h264_biweight_8_10: one 16-byte vector (eight words) per row is
       ; read from r0 and r1, blended, and written back to r0.
  221. cglobal h264_biweight_8_10
  222. BIWEIGHT_PROLOGUE
  223. BIWEIGHT_SETUP
  224. .nextrow:
  225. BIWEIGHT 0
  226. mova [r0], m0
  227. add r0, r2 ; both pointers advance by the same stride (r2)
  228. add r1, r2
  229. dec r3d ; rows left; tested after the body, so the loop always runs at least once
  230. jnz .nextrow
  231. REP_RET
  232. %endmacro
       ; Instantiate the function once per instruction set.
  233. INIT_XMM sse2
  234. BIWEIGHT_FUNC
  235. INIT_XMM sse4
  236. BIWEIGHT_FUNC
  237. %macro BIWEIGHT_FUNC_HALF 0
       ; Emits h264_biweight_4_10: every iteration handles two rows at once,
       ; each contributing 8 bytes (four words) to a single vector.
  238. cglobal h264_biweight_4_10
  239. BIWEIGHT_PROLOGUE
  240. BIWEIGHT_SETUP
  241. sar r3d, 1 ; two rows per iteration, so halve the row count
  242. lea r4, [r2*2] ; r4 = 2*stride; log2_denom was read via r4m by BIWEIGHT_SETUP already
  243. .nextrow:
  244. BIWEIGHT 0, r2 ; rows at offset 0 and offset stride from both pointers
  245. movh [r0 ], m0 ; low 8 bytes to the first row
  246. movhps [r0+r2], m0 ; high 8 bytes to the second row
  247. add r0, r4
  248. add r1, r4
  249. dec r3d ; tested after the body, so the loop always runs at least once
  250. jnz .nextrow
  251. REP_RET
  252. %endmacro
       ; Instantiate the function once per instruction set.
  253. INIT_XMM sse2
  254. BIWEIGHT_FUNC_HALF
  255. INIT_XMM sse4
  256. BIWEIGHT_FUNC_HALF