You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

278 lines
6.2KB

  1. ;*****************************************************************************
  2. ;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2011 x264 project
  5. ;*
  6. ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  7. ;*
  8. ;* This file is part of Libav.
  9. ;*
  10. ;* Libav is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* Libav is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with Libav; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "x86inc.asm"
  25. %include "x86util.asm"
  SECTION_RODATA 32

  ; Upper clamp for 10-bit samples ((1 << 10) - 1 = 1023) in eight word lanes.
  pw_pixel_max: times 8 dw (1 << 10) - 1

  ; 128-bit constant: low qword = 1, high qword = 0. The setup macros add it to
  ; a shift count that lives in a vector register to turn log2_denom into
  ; log2_denom+1.
  sq_1:         dq 1, 0

  ; Provided by another object file; loaded as a full vector by WEIGHT_SETUP.
  ; NOTE(review): presumably 1 in every word lane -- confirm at its definition.
  cextern pw_1

  SECTION .text
  ;-----------------------------------------------------------------------------
  ; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
  ;                  int weight, int offset);
  ;-----------------------------------------------------------------------------
  ; WEIGHT_PROLOGUE
  ; Shared entry sequence for the h264_weight_*_10 functions. Afterwards:
  ;   r0  = dst
  ;   r1  = stride (full register width valid, see below)
  ;   r2d = height
  ;   r4d = weight
  ;   r5d = offset
  ; Argument 3 (log2_denom) is not loaded here; WEIGHT_SETUP reads it via r3m.
  %macro WEIGHT_PROLOGUE 0
  .prologue:
      PROLOGUE 0,6,8
      movifnidn  r0, r0mp
      movifnidn r1d, r1m
  %if ARCH_X86_32 == 0
      ; stride is an int but is used as a full-width register (add r0, r1 and
      ; lea r3, [r1*2]). The upper half of an int argument register is not
      ; guaranteed to be a sign extension, so widen it explicitly.
      movsxd     r1, r1d
  %endif
      movifnidn r2d, r2m
      movifnidn r4d, r4m
      movifnidn r5d, r5m
  %endmacro
  ; WEIGHT_SETUP cpu
  ; Loads the loop-invariant vectors consumed by WEIGHT_OP.
  ;   in:       r4d = weight, r5d = offset, r3m = log2_denom
  ;   out:      m0 = 1<<log2_denom in every word lane
  ;             m2 = log2_denom+1 (shift count for psrad)
  ;             m3 = every dword: low word = weight*2, high word = (offset<<3)+1
  ;             m4 = pw_pixel_max
  ;             m7 = 0 (only when cpu is not sse4; lower clamp for CLIPW)
  ;   clobbers: r4, r5
  ; NOTE(review): assumes pw_1 holds 1 in each word and that SPLATW broadcasts
  ; the low word of its source -- confirm against x86util.asm.
  %macro WEIGHT_SETUP 1
      mova       m0, [pw_1]
      movd       m2, r3m
      pslld      m0, m2                  ; low word of each dword = 1<<log2_denom
      SPLATW     m0, m0

      ; Build the pmaddwd multiplier pair in a GPR. weight*2 must be reduced to
      ; 16 bits first: adding a sign-extended negative weight would put 0xFFFF
      ; into the upper half and cancel the +1 rounding term stored there.
      add       r4d, r4d                 ; weight*2
      and       r4d, 0xffff              ; keep only the 16-bit lane value
      shl       r5d, 19                  ; offset*8, moved to the upper half
      lea       r5d, [r5+r4+0x10000]     ; high = (offset<<3)+1, low = weight*2
      movd       m3, r5d
      pshufd     m3, m3, 0               ; same pair in all four dwords

      mova       m4, [pw_pixel_max]
      paddw      m2, [sq_1]              ; log2_denom+1
  %ifnidn %1, sse4
      pxor       m7, m7                  ; zero vector for the non-sse4 clamp
  %endif
  %endmacro
  ; WEIGHT_OP cpu, off         -- one 16-byte load from [r0+off]
  ; WEIGHT_OP cpu, off0, off1  -- two 8-byte loads from [r0+off0] and [r0+off1]
  ;
  ; Interleaves the loaded words with the words of m0, multiplies each
  ; (pixel, m0 word) pair by the word pair in m3 with pmaddwd, shifts the dword
  ; sums right arithmetically by m2 and packs them back to words in m5.
  ;   in:       m0, m2, m3, m4 (and m7 when cpu is not sse4) as left by
  ;             WEIGHT_SETUP
  ;   out:      m5 = eight result words
  ;   clobbers: m6
  %macro WEIGHT_OP 2-3
  %if %0 == 3
      ; two half-width groups: the first lands in the low four result words,
      ; the second in the high four
      movq       m5, [r0+%2]
      movq       m6, [r0+%3]
      punpcklwd  m5, m0
      punpcklwd  m6, m0
  %else
      mova       m5, [r0+%2]
      punpckhwd  m6, m5, m0              ; upper four words paired with m0
      punpcklwd  m5, m0                  ; lower four words paired with m0
  %endif
      pmaddwd    m5, m3
      psrad      m5, m2
      pmaddwd    m6, m3
      psrad      m6, m2
  %ifidn %1, sse4
      packusdw   m5, m6                  ; unsigned saturation gives the 0 floor
      pminsw     m5, m4                  ; cap at m4
  %else
      packssdw   m5, m6
      ; NOTE(review): CLIPW comes from x86util.asm; presumably clamps m5 to
      ; the range [m7, m4] -- confirm.
      CLIPW      m5, m7, m4
  %endif
  %endmacro
  ; WEIGHT_FUNC_DBL cpu
  ; Emits h264_weight_16_10_<cpu>. Each loop iteration handles one row as two
  ; 16-byte vectors (at byte offsets 0 and 16 from r0), then advances r0 by the
  ; stride in r1. The loop runs until the row counter in r2d reaches zero.
  %macro WEIGHT_FUNC_DBL 1
  cglobal h264_weight_16_10_%1
      WEIGHT_PROLOGUE
      WEIGHT_SETUP %1
  .nextrow:
      WEIGHT_OP  %1, 0
      mova       [r0], m5                ; first 16 bytes of the row
      WEIGHT_OP  %1, 16
      mova       [r0+16], m5             ; second 16 bytes of the row
      add        r0, r1                  ; step to the next row
      dec        r2d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  WEIGHT_FUNC_DBL sse2
  WEIGHT_FUNC_DBL sse4
  ; WEIGHT_FUNC_MM cpu
  ; Emits h264_weight_8_10_<cpu>. Each loop iteration weights a single 16-byte
  ; vector at [r0] in place, then steps r0 by the stride in r1. r2d counts the
  ; remaining rows.
  %macro WEIGHT_FUNC_MM 1
  cglobal h264_weight_8_10_%1
      WEIGHT_PROLOGUE
      WEIGHT_SETUP %1
  .nextrow:
      WEIGHT_OP  %1, 0
      mova       [r0], m5
      add        r0, r1                  ; step to the next row
      dec        r2d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  WEIGHT_FUNC_MM sse2
  WEIGHT_FUNC_MM sse4
  ; WEIGHT_FUNC_HALF_MM cpu
  ; Emits h264_weight_4_10_<cpu>. Rows are 8 bytes wide, so two rows ([r0] and
  ; [r0+r1]) are packed into one vector per iteration. The row count is halved
  ; up front and r0 advances by two strides each time round.
  %macro WEIGHT_FUNC_HALF_MM 1
  cglobal h264_weight_4_10_%1
      WEIGHT_PROLOGUE
      sar        r2d, 1                  ; iterations = height / 2
      WEIGHT_SETUP %1
      ; r3 is only reused after WEIGHT_SETUP, which reads log2_denom via r3m.
      lea        r3, [r1*2]              ; r3 = byte distance of two rows
  .nextrow:
      WEIGHT_OP  %1, 0, r1
      movh       [r0], m5                ; low half  -> first row
      movhps     [r0+r1], m5             ; high half -> second row
      add        r0, r3
      dec        r2d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  WEIGHT_FUNC_HALF_MM sse2
  WEIGHT_FUNC_HALF_MM sse4
  ;-----------------------------------------------------------------------------
  ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
  ;                    int log2_denom, int weightd, int weights, int offset);
  ;-----------------------------------------------------------------------------
  ; t0 is the scratch register that receives the eighth argument (offset) in
  ; BIWEIGHT_PROLOGUE. 32-bit builds alias it to r3, everything else to r7.
  %if ARCH_X86_32 == 0
      DECLARE_REG_TMP 7
  %else
      DECLARE_REG_TMP 3
  %endif
  ; BIWEIGHT_PROLOGUE
  ; Shared entry sequence for the h264_biweight_*_10 functions. Afterwards:
  ;   r0  = dst
  ;   r1  = src
  ;   r2  = stride (full register width valid, see below)
  ;   r5d = weightd
  ;   r6d = weights
  ;   t0d = offset (argument 7)
  ; Arguments 3 (height) and 4 (log2_denom) are not loaded here;
  ; BIWEIGHT_SETUP picks them up through r3m and r4m.
  %macro BIWEIGHT_PROLOGUE 0
  .prologue:
      PROLOGUE 0,8,8
      movifnidn  r0, r0mp
      movifnidn  r1, r1mp
      movifnidn r2d, r2m
  %if ARCH_X86_32 == 0
      ; stride is an int but is added to 64-bit pointers (add r0, r2 and
      ; add r1, r2) and scaled with lea. The upper half of an int argument
      ; register is not guaranteed to be a sign extension, so widen it.
      movsxd     r2, r2d
  %endif
      movifnidn r5d, r5m
      movifnidn r6d, r6m
      movifnidn t0d, r7m
  %endmacro
  ; BIWEIGHT_SETUP cpu
  ; Loads the loop-invariant vectors consumed by BIWEIGHT.
  ;   in:       r5d = weightd, r6d = weights, t0d = offset,
  ;             r4m = log2_denom, r3m = height
  ;   out:      m3 = pw_pixel_max
  ;             m4 = every dword: low word = weightd, high word = weights
  ;             m5 = every dword: ((offset<<2)+1) << log2_denom
  ;             m6 = log2_denom+1 (shift count for psrad)
  ;             m7 = 0 (only when cpu is not sse4; lower clamp for CLIPW)
  ;             r3d = height
  ;   clobbers: r5, r6, t0
  %macro BIWEIGHT_SETUP 1
      ; (offset<<2)+1 is always odd, so no separate "or 1" is needed.
      lea        t0, [t0*4+1]

      ; Pack both weights into one dword for pmaddwd. weightd must be reduced
      ; to 16 bits first: OR-ing weights<<16 into a sign-extended negative
      ; weightd would leave 0xFFFF in the upper word and replace weights by -1.
      and       r5d, 0xffff
      shl       r6d, 16
      or        r5d, r6d
      movd       m4, r5d                 ; weightd | weights<<16
      movd       m5, t0d                 ; (offset<<2)+1
      movd       m6, r4m                 ; log2_denom
      pslld      m5, m6                  ; ((offset<<2)+1) << log2_denom
      paddd      m6, [sq_1]              ; log2_denom+1
      pshufd     m4, m4, 0
      pshufd     m5, m5, 0
      mova       m3, [pw_pixel_max]

      ; Must stay after the last use of t0: on 32-bit builds t0 is r3.
      movifnidn r3d, r3m                 ; height
  %ifnidn %1, sse4
      pxor       m7, m7                  ; zero vector for the non-sse4 clamp
  %endif
  %endmacro
  ; BIWEIGHT cpu, off         -- one 16-byte load from each of [r0+off], [r1+off]
  ; BIWEIGHT cpu, off0, off1  -- two 8-byte loads from each of r0 and r1
  ;
  ; Interleaves the words loaded through r0 with those loaded through r1,
  ; multiplies each pair by the word pair in m4 with pmaddwd, adds m5, shifts
  ; right arithmetically by m6 and packs the dwords back to words in m0.
  ;   in:       m3, m4, m5, m6 (and m7 when cpu is not sse4) as left by
  ;             BIWEIGHT_SETUP
  ;   out:      m0 = eight result words
  ;   clobbers: m1, m2
  %macro BIWEIGHT 2-3
  %if %0 == 3
      ; two half-width groups; m1 is reused, so finish the first pairing
      ; before loading the second r1 operand
      movq       m0, [r0+%2]
      movq       m1, [r1+%2]
      movq       m2, [r0+%3]
      punpcklwd  m0, m1
      movq       m1, [r1+%3]
      punpcklwd  m2, m1
  %else
      mova       m0, [r0+%2]
      mova       m1, [r1+%2]
      punpckhwd  m2, m0, m1              ; upper four (r0, r1) word pairs
      punpcklwd  m0, m1                  ; lower four (r0, r1) word pairs
  %endif
      pmaddwd    m0, m4
      paddd      m0, m5
      psrad      m0, m6
      pmaddwd    m2, m4
      paddd      m2, m5
      psrad      m2, m6
  %ifidn %1, sse4
      packusdw   m0, m2                  ; unsigned saturation gives the 0 floor
      pminsw     m0, m3                  ; cap at m3
  %else
      packssdw   m0, m2
      ; NOTE(review): CLIPW comes from x86util.asm; presumably clamps m0 to
      ; the range [m7, m3] -- confirm.
      CLIPW      m0, m7, m3
  %endif
  %endmacro
  ; BIWEIGHT_FUNC_DBL cpu
  ; Emits h264_biweight_16_10_<cpu>. Each loop iteration combines one row from
  ; r0 and r1 as two 16-byte vectors (byte offsets 0 and 16), writes the result
  ; through r0 and advances both pointers by the stride in r2. r3d counts the
  ; remaining rows.
  %macro BIWEIGHT_FUNC_DBL 1
  cglobal h264_biweight_16_10_%1
      BIWEIGHT_PROLOGUE
      BIWEIGHT_SETUP %1
  .nextrow:
      BIWEIGHT   %1, 0
      mova       [r0], m0                ; first 16 bytes of the row
      BIWEIGHT   %1, 16
      mova       [r0+16], m0             ; second 16 bytes of the row
      add        r0, r2
      add        r1, r2
      dec        r3d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  BIWEIGHT_FUNC_DBL sse2
  BIWEIGHT_FUNC_DBL sse4
  ; BIWEIGHT_FUNC cpu
  ; Emits h264_biweight_8_10_<cpu>. Each loop iteration combines one 16-byte
  ; vector from [r0] and [r1], stores the result at [r0] and advances both
  ; pointers by the stride in r2. r3d counts the remaining rows.
  %macro BIWEIGHT_FUNC 1
  cglobal h264_biweight_8_10_%1
      BIWEIGHT_PROLOGUE
      BIWEIGHT_SETUP %1
  .nextrow:
      BIWEIGHT   %1, 0
      mova       [r0], m0
      add        r0, r2
      add        r1, r2
      dec        r3d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  BIWEIGHT_FUNC sse2
  BIWEIGHT_FUNC sse4
  ; BIWEIGHT_FUNC_HALF cpu
  ; Emits h264_biweight_4_10_<cpu>. Rows are 8 bytes wide, so two rows (byte
  ; offsets 0 and r2) are packed into one vector per iteration. The row count
  ; is halved and both pointers advance by two strides each time round.
  %macro BIWEIGHT_FUNC_HALF 1
  cglobal h264_biweight_4_10_%1
      BIWEIGHT_PROLOGUE
      BIWEIGHT_SETUP %1
      ; r4 is only reused after BIWEIGHT_SETUP, which reads log2_denom via r4m.
      lea        r4, [r2*2]              ; r4 = byte distance of two rows
      sar        r3d, 1                  ; iterations = height / 2
  .nextrow:
      BIWEIGHT   %1, 0, r2
      movh       [r0], m0                ; low half  -> first row
      movhps     [r0+r2], m0             ; high half -> second row
      add        r0, r4
      add        r1, r4
      dec        r3d
      jnz        .nextrow
      REP_RET
  %endmacro

  INIT_XMM
  BIWEIGHT_FUNC_HALF sse2
  BIWEIGHT_FUNC_HALF sse4