You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

283 lines
6.2KB

  1. ;*****************************************************************************
  2. ;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2011 x264 project
  5. ;*
  6. ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  7. ;*
  8. ;* This file is part of FFmpeg.
  9. ;*
  10. ;* FFmpeg is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* FFmpeg is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with FFmpeg; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "libavutil/x86/x86util.asm"
  SECTION_RODATA 32

  ; Eight words, each holding the largest 10-bit sample value (1023).
  ; Loaded by the weight/biweight setup macros as the upper clamp bound.
  pw_pixel_max: times 8 dw (1 << 10) - 1

  ; 128-bit constant whose low quadword is 1 and whose high quadword is 0.
  ; Added to the log2_denom shift count to obtain log2_denom+1.
  sq_1:         dq 1, 0

  ; Defined in another object file; loaded by WEIGHT_SETUP as the seed that is
  ; shifted left to build the 1<<log2_denom words.
  cextern pw_1
  30. SECTION .text
  31. ;-----------------------------------------------------------------------------
  32. ; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
  33. ; int weight, int offset);
  34. ;-----------------------------------------------------------------------------
  ; WEIGHT_PROLOGUE: shared entry sequence of the h264_weight_*_10 functions.
  ;
  ; Register state on exit:
  ;   r0  = dst
  ;   r1  = stride, sign-extended to full register width
  ;   r2d = height
  ;   r4d = weight
  ;   r5d = offset
  ; log2_denom is not loaded here; WEIGHT_SETUP reads it through r3m.
  %macro WEIGHT_PROLOGUE 0
  .prologue:
      PROLOGUE 0,6,8
      movifnidn      r0, r0mp
      movifnidn     r1d, r1m
      ; stride is declared as a 32-bit int but is added to the 64-bit dst
      ; pointer ("add r0, r1"). On x86-64 the upper half of a register that
      ; carries an int argument is unspecified, so widen it explicitly.
      ; This expands to nothing on x86-32.
      movsxdifnidn   r1, r1d
      movifnidn     r2d, r2m
      movifnidn     r4d, r4m
      movifnidn     r5d, r5m
  %endmacro
  ; WEIGHT_SETUP: build the loop-invariant vectors used by WEIGHT_OP.
  ;
  ; In:   r3m = log2_denom, r4d = weight, r5d = offset
  ; Out:  m0 = words of (1 << log2_denom)
  ;       m2 = shift count log2_denom+1
  ;       m3 = dwords of { low word: weight*2, high word: offset*8 + 1 }
  ;       m4 = words of the 10-bit pixel maximum
  ;       m7 = zero (only when SSE4 is not available; lower clamp for CLIPW)
  ; Clobbers: r4, r5
  ;
  ; WEIGHT_OP interleaves each pixel with m0 and multiplies by m3, giving
  ;   pixel*weight*2 + (1 << log2_denom)*(offset*8 + 1)
  ; per dword, which is then shifted right by log2_denom+1.
  %macro WEIGHT_SETUP 0
      mova       m0, [pw_1]
      movd       m2, r3m          ; log2_denom
      pslld      m0, m2           ; 1<<log2_denom
      SPLATW     m0, m0
      ; Pack weight*2 into the low word and offset*8+1 into the high word.
      ; weight*2 must be cut down to 16 bits first: adding the sign-extended
      ; value of a negative weight would borrow from the high word and drop
      ; the "+1" rounding term.
      add       r4d, r4d          ; weight*2
      movzx     r4d, r4w          ; keep the 16-bit two's-complement pattern only
      shl       r5d, 19           ; offset*8, moved to upper half of dword
      lea       r5d, [r5+r4+0x10000]
      movd       m3, r5d          ; (weight<<1) | ((1+(offset<<3)) << 16)
      pshufd     m3, m3, 0
      mova       m4, [pw_pixel_max]
      paddw      m2, [sq_1]       ; log2_denom+1
  %if notcpuflag(sse4)
      pxor       m7, m7
  %endif
  %endmacro
  ; WEIGHT_OP off            - weight the 8 words at [r0+off]
  ; WEIGHT_OP off0, off1     - weight 4 words at [r0+off0] and 4 at [r0+off1]
  ;
  ; Expects the registers prepared by WEIGHT_SETUP:
  ;   m0 = (1 << log2_denom) words, m2 = shift count, m3 = packed multipliers,
  ;   m4 = pixel maximum, m7 = zero (non-SSE4 only).
  ; Result: 8 clamped words in m5 (in the two-offset form the low half comes
  ; from off0 and the high half from off1). Clobbers m6.
  %macro WEIGHT_OP 1-2
  %if %0 == 1
      mova        m5, [r0+%1]
      punpckhwd   m6, m5, m0      ; upper 4 pixels paired with 1<<log2_denom
  %else
      movq        m5, [r0+%1]
      movq        m6, [r0+%2]
      punpcklwd   m6, m0          ; second row paired with 1<<log2_denom
  %endif
      punpcklwd   m5, m0          ; lower 4 pixels / first row
      pmaddwd     m5, m3
      pmaddwd     m6, m3
      psrad       m5, m2
      psrad       m6, m2
  %if notcpuflag(sse4)
      packssdw    m5, m6          ; signed saturation, then clamp to [0, max]
      CLIPW       m5, m7, m4
  %else
      packusdw    m5, m6          ; negatives -> 0, large values -> 0xFFFF
      ; The packed words are unsigned and may be >= 0x8000, so the upper
      ; clamp has to be an unsigned minimum; a signed minimum would treat
      ; such words as negative and leave them unclamped.
      pminuw      m5, m4
  %endif
  %endmacro
  ; Emits h264_weight_16_10 for the instruction set selected by the preceding
  ; INIT_XMM. Each loop iteration weights one row in place as two 16-byte
  ; vectors (bytes 0-15 and 16-31 of the row), then advances r0 by the stride
  ; in r1 until the row counter in r2d reaches zero.
  %macro WEIGHT_FUNC_DBL 0
  cglobal h264_weight_16_10
      WEIGHT_PROLOGUE
      WEIGHT_SETUP
  .row_loop:
      WEIGHT_OP    0
      mova         [r0+0], m5
      WEIGHT_OP    16
      mova         [r0+16], m5
      add          r0, r1         ; next row
      dec          r2d
      jnz          .row_loop
      REP_RET
  %endmacro

  INIT_XMM sse2
  WEIGHT_FUNC_DBL

  INIT_XMM sse4
  WEIGHT_FUNC_DBL
  ; Emits h264_weight_8_10 for the instruction set selected by the preceding
  ; INIT_XMM. Each loop iteration weights one 16-byte vector at [r0] in place
  ; and steps r0 by the stride in r1; r2d counts the remaining rows.
  %macro WEIGHT_FUNC_MM 0
  cglobal h264_weight_8_10
      WEIGHT_PROLOGUE
      WEIGHT_SETUP
  .row_loop:
      WEIGHT_OP    0
      mova         [r0], m5
      add          r0, r1         ; next row
      dec          r2d
      jnz          .row_loop
      REP_RET
  %endmacro

  INIT_XMM sse2
  WEIGHT_FUNC_MM

  INIT_XMM sse4
  WEIGHT_FUNC_MM
  ; Emits h264_weight_4_10 for the instruction set selected by the preceding
  ; INIT_XMM. Two rows of 8 bytes each are weighted per iteration: WEIGHT_OP
  ; gathers them into one vector, whose low half is stored to the first row
  ; and whose high half is stored to the second.
  %macro WEIGHT_FUNC_HALF_MM 0
  cglobal h264_weight_4_10
      WEIGHT_PROLOGUE
      WEIGHT_SETUP
      ; r3 may only be overwritten once WEIGHT_SETUP has read log2_denom
      ; through r3m, which can alias r3 when that argument is in a register.
      lea          r3, [r1*2]     ; byte step for two rows
      sar          r2d, 1         ; loop count = height / 2
  .row_pair_loop:
      WEIGHT_OP    0, r1
      movh         [r0], m5       ; first row  <- low half
      movhps       [r0+r1], m5    ; second row <- high half
      add          r0, r3
      dec          r2d
      jnz          .row_pair_loop
      REP_RET
  %endmacro

  INIT_XMM sse2
  WEIGHT_FUNC_HALF_MM

  INIT_XMM sse4
  WEIGHT_FUNC_HALF_MM
  135. ;-----------------------------------------------------------------------------
  136. ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
  137. ; int log2_denom, int weightd, int weights, int offset);
  138. ;-----------------------------------------------------------------------------
  ; Choose the register behind t0, which the biweight code uses for the
  ; offset argument: r7 by default, r3 on 32-bit x86 builds.
  %if ARCH_X86_32 == 0
      DECLARE_REG_TMP 7
  %else
      DECLARE_REG_TMP 3
  %endif
  ; BIWEIGHT_PROLOGUE: shared entry sequence of the h264_biweight_*_10
  ; functions.
  ;
  ; Register state on exit:
  ;   r0  = dst
  ;   r1  = src
  ;   r2  = stride, sign-extended to full register width
  ;   r5d = weightd
  ;   r6d = weights
  ;   t0d = offset
  ; height (r3m) and log2_denom (r4m) are not loaded here; BIWEIGHT_SETUP
  ; reads them.
  %macro BIWEIGHT_PROLOGUE 0
  .prologue:
      PROLOGUE 0,8,8
      movifnidn      r0, r0mp
      movifnidn      r1, r1mp
      movifnidn     r2d, r2m
      ; stride is declared as a 32-bit int but is added to the 64-bit dst
      ; and src pointers. On x86-64 the upper half of a register that
      ; carries an int argument is unspecified, so widen it explicitly.
      ; This expands to nothing on x86-32.
      movsxdifnidn   r2, r2d
      movifnidn     r5d, r5m
      movifnidn     r6d, r6m
      movifnidn     t0d, r7m
  %endmacro
  ; BIWEIGHT_SETUP: build the loop-invariant vectors used by BIWEIGHT.
  ;
  ; In:   r3m = height, r4m = log2_denom, r5d = weightd, r6d = weights,
  ;       t0d = offset
  ; Out:  m3  = words of the 10-bit pixel maximum
  ;       m4  = dwords of { low word: weightd, high word: weights }
  ;       m5  = dwords of ((offset<<2)+1) << log2_denom
  ;       m6  = shift count log2_denom+1
  ;       m7  = zero (only when SSE4 is not available; lower clamp for CLIPW)
  ;       r3d = height
  ; Clobbers: r5, r6, t0
  %macro BIWEIGHT_SETUP 0
      ; offset*4 + 1 always has its lowest bit set, so no separate "or 1"
      ; is needed to make the value odd.
      lea        t0, [t0*4+1]     ; (offset<<2)+1
      ; Pack the two weights into one dword. weightd has to be reduced to
      ; its low 16 bits first: the sign bits of a negative weightd would
      ; otherwise be OR-ed over the weights half.
      movzx     r5d, r5w
      shl       r6d, 16
      or        r5d, r6d
      movd       m4, r5d          ; weightd | (weights << 16)
      movd       m5, t0d          ; (offset<<2)+1
      movd       m6, r4m          ; log2_denom
      pslld      m5, m6           ; ((offset<<2)+1)<<log2_denom
      paddd      m6, [sq_1]       ; log2_denom+1
      pshufd     m4, m4, 0
      pshufd     m5, m5, 0
      mova       m3, [pw_pixel_max]
      ; Load height only now: t0 is r3 on x86-32, so r3 was still holding
      ; the offset above.
      movifnidn r3d, r3m
  %if notcpuflag(sse4)
      pxor       m7, m7
  %endif
  %endmacro
  ; BIWEIGHT off           - blend the 8 words at [r0+off] with [r1+off]
  ; BIWEIGHT off0, off1    - blend 4 words at off0 and 4 words at off1
  ;
  ; Expects the registers prepared by BIWEIGHT_SETUP:
  ;   m3 = pixel maximum, m4 = packed (weightd, weights), m5 = offset term,
  ;   m6 = shift count, m7 = zero (non-SSE4 only).
  ; Each dst word is interleaved with the matching src word so that pmaddwd
  ; yields dst*weightd + src*weights per dword.
  ; Result: 8 clamped words in m0 (in the two-offset form the low half comes
  ; from off0 and the high half from off1). Clobbers m1, m2.
  %macro BIWEIGHT 1-2
  %if %0 == 1
      mova        m0, [r0+%1]
      mova        m1, [r1+%1]
      punpckhwd   m2, m0, m1      ; upper 4 (dst, src) pairs
  %else
      movq        m0, [r0+%1]
      movq        m2, [r0+%2]
      movq        m1, [r1+%2]
      punpcklwd   m2, m1          ; second row (dst, src) pairs
      movq        m1, [r1+%1]
  %endif
      punpcklwd   m0, m1          ; lower 4 / first row (dst, src) pairs
      pmaddwd     m0, m4
      pmaddwd     m2, m4
      paddd       m0, m5
      paddd       m2, m5
      psrad       m0, m6
      psrad       m2, m6
  %if notcpuflag(sse4)
      packssdw    m0, m2          ; signed saturation, then clamp to [0, max]
      CLIPW       m0, m7, m3
  %else
      packusdw    m0, m2          ; negatives -> 0, large values -> 0xFFFF
      ; The packed words are unsigned and may be >= 0x8000, so the upper
      ; clamp has to be an unsigned minimum; a signed minimum would treat
      ; such words as negative and leave them unclamped.
      pminuw      m0, m3
  %endif
  %endmacro
  ; Emits h264_biweight_16_10 for the instruction set selected by the
  ; preceding INIT_XMM. Each loop iteration blends one row as two 16-byte
  ; vectors (bytes 0-15 and 16-31), writes the result to dst (r0), and
  ; advances both dst and src (r1) by the stride in r2; r3d counts rows.
  %macro BIWEIGHT_FUNC_DBL 0
  cglobal h264_biweight_16_10
      BIWEIGHT_PROLOGUE
      BIWEIGHT_SETUP
  .row_loop:
      BIWEIGHT     0
      mova         [r0+0], m0
      BIWEIGHT     16
      mova         [r0+16], m0
      add          r0, r2         ; next dst row
      add          r1, r2         ; next src row
      dec          r3d
      jnz          .row_loop
      REP_RET
  %endmacro

  INIT_XMM sse2
  BIWEIGHT_FUNC_DBL

  INIT_XMM sse4
  BIWEIGHT_FUNC_DBL
  ; Emits h264_biweight_8_10 for the instruction set selected by the
  ; preceding INIT_XMM. Each loop iteration blends one 16-byte vector of dst
  ; (r0) with src (r1), stores it to dst, and advances both pointers by the
  ; stride in r2; r3d counts the remaining rows.
  %macro BIWEIGHT_FUNC 0
  cglobal h264_biweight_8_10
      BIWEIGHT_PROLOGUE
      BIWEIGHT_SETUP
  .row_loop:
      BIWEIGHT     0
      mova         [r0], m0
      add          r0, r2         ; next dst row
      add          r1, r2         ; next src row
      dec          r3d
      jnz          .row_loop
      REP_RET
  %endmacro

  INIT_XMM sse2
  BIWEIGHT_FUNC

  INIT_XMM sse4
  BIWEIGHT_FUNC
  ; Emits h264_biweight_4_10 for the instruction set selected by the
  ; preceding INIT_XMM. Two rows of 8 bytes each are blended per iteration:
  ; BIWEIGHT gathers them into one vector, whose low half is stored to the
  ; first dst row and whose high half is stored to the second.
  %macro BIWEIGHT_FUNC_HALF 0
  cglobal h264_biweight_4_10
      BIWEIGHT_PROLOGUE
      BIWEIGHT_SETUP
      ; Both adjustments must follow BIWEIGHT_SETUP: it loads the height
      ; into r3d and reads log2_denom through r4m, which can alias r4 when
      ; that argument is passed in a register.
      lea          r4, [r2*2]     ; byte step for two rows
      sar          r3d, 1         ; loop count = height / 2
  .row_pair_loop:
      BIWEIGHT     0, r2
      movh         [r0], m0       ; first row  <- low half
      movhps       [r0+r2], m0    ; second row <- high half
      add          r0, r4
      add          r1, r4
      dec          r3d
      jnz          .row_pair_loop
      REP_RET
  %endmacro

  INIT_XMM sse2
  BIWEIGHT_FUNC_HALF

  INIT_XMM sse4
  BIWEIGHT_FUNC_HALF
  255. BIWEIGHT_FUNC_HALF