You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

322 lines
7.2KB

  1. ;*****************************************************************************
  2. ;* SSE2/SSE4-optimized 10-bit H.264 weighted prediction code
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2005-2011 x264 project
  5. ;*
  6. ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
  7. ;*
  8. ;* This file is part of Libav.
  9. ;*
  10. ;* Libav is free software; you can redistribute it and/or
  11. ;* modify it under the terms of the GNU Lesser General Public
  12. ;* License as published by the Free Software Foundation; either
  13. ;* version 2.1 of the License, or (at your option) any later version.
  14. ;*
  15. ;* Libav is distributed in the hope that it will be useful,
  16. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  17. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  18. ;* Lesser General Public License for more details.
  19. ;*
  20. ;* You should have received a copy of the GNU Lesser General Public
  21. ;* License along with Libav; if not, write to the Free Software
  22. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23. ;******************************************************************************
  24. %include "x86inc.asm"
  25. %include "x86util.asm"
  SECTION_RODATA 32

  ; Upper clamp applied to every output sample: 2^10 - 1, replicated in 8 words.
  pw_pixel_max:   times 8 dw (1 << 10) - 1

  ; 128-bit constant: low quadword 1, high quadword 0.  Added to the register
  ; holding log2_denom so that it becomes the shift count log2_denom + 1.
  sq_1:           dq 1, 0

  ; Defined in another object file; presumably packed words all equal to 1
  ; (it is shifted left by log2_denom and splatted in WEIGHT_SETUP).
  cextern pw_1

  SECTION .text
  ;-----------------------------------------------------------------------------
  ; void h264_weight(uint8_t *dst, int stride, int log2_denom,
  ;                  int weight, int offset);
  ;
  ; Argument registers used below: r0 = dst, r1 = stride, r3 = weight,
  ; r4 = offset.  log2_denom is only ever read through r2m.
  ;-----------------------------------------------------------------------------
  ; t0 is the loop counter of the weight functions.  It is written before
  ; PROLOGUE runs, so it must not alias an argument that is still needed in a
  ; register: r2 on x86-32 (log2_denom is fetched via r2m), r10 otherwise.
  %ifndef ARCH_X86_32
      DECLARE_REG_TMP 10
  %else
      DECLARE_REG_TMP 2
  %endif
  ;-----------------------------------------------------------------------------
  ; WEIGHT_PROLOGUE count
  ;   %1 = number of loop iterations for the function this macro is placed in.
  ;
  ; Loads the iteration count into t0, then defines the local label .prologue.
  ; Sibling entry points set their own count in t0 and jump to .prologue, so
  ; everything after the label is shared by all block heights.
  ;
  ; After the macro: r0 = dst, r1 = stride (sign-extended to pointer width),
  ; r3d = weight, r4d = offset, t0 = iteration count.
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_PROLOGUE 1
      mov         t0, %1
  .prologue:
      PROLOGUE 0,5,8
      movifnidn   r0, r0mp
      movifnidn  r1d, r1m
      movifnidn  r3d, r3m
      movifnidn  r4d, r4m
  %ifndef ARCH_X86_32
      ; stride is declared as a 32-bit int, but the row loops add the full-width
      ; r1 to the dst pointer.  The 64-bit calling conventions do not define the
      ; upper half of the register for an int argument, so extend it explicitly
      ; (this also keeps negative strides correct).
      movsxd      r1, r1d
  %endif
  %endmacro
  ;-----------------------------------------------------------------------------
  ; WEIGHT_SETUP cpu
  ;   %1 = instruction-set suffix (sse2 / sse4)
  ;
  ; Builds the loop-invariant vectors consumed by WEIGHT_OP:
  ;   m0 = packed words   1 << log2_denom
  ;   m2 = shift count    log2_denom + 1
  ;   m3 = packed dwords  { lo word: weight*2, hi word: 1 + offset*8 }
  ;   m4 = packed words   pixel max (pw_pixel_max)
  ;   m7 = zero           (non-sse4 only, lower clip bound)
  ; With pixels interleaved against m0, pmaddwd by m3 followed by a shift of
  ; log2_denom+1 yields
  ;   (pixel*weight*2 + (1<<log2_denom)*(1 + offset*8)) >> (log2_denom+1)
  ;
  ; Clobbers r3 and r4.
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_SETUP 1
      mova        m0, [pw_1]
      movd        m2, r2m             ; log2_denom
      pslld       m0, m2              ; 1 << log2_denom
      SPLATW      m0, m0
      ; Pack weight*2 into the low word and 1+offset*8 into the high word.
      ; weight*2 must be truncated to 16 bits first: for a negative weight the
      ; 32-bit sign extension would otherwise borrow from the high word and
      ; cancel the +1 rounding term.
      add        r3d, r3d             ; weight*2
      movzx      r3d, r3w             ; keep only the 16-bit two's complement
      shl         r4, 19              ; offset*8, moved to upper half of dword
      lea         r4, [r4+r3+0x10000] ; (1 + offset*8) << 16 | (weight*2 & 0xffff)
      movd        m3, r4d
      pshufd      m3, m3, 0
      mova        m4, [pw_pixel_max]
      paddw       m2, [sq_1]          ; log2_denom + 1
  %ifnidn %1, sse4
      pxor        m7, m7              ; lower bound for CLIPW
  %endif
  %endmacro
  ;-----------------------------------------------------------------------------
  ; WEIGHT_OP cpu, off0 [, off1]
  ;   %1 = instruction-set suffix (sse2 / sse4)
  ;   %2 = offset from r0 of the samples to process
  ;   %3 = optional second offset; when given, half a vector is loaded from
  ;        each of r0+%2 and r0+%3 instead of one full vector from r0+%2
  ;
  ; Expects the registers prepared by WEIGHT_SETUP (m0, m2, m3, m4 and, for
  ; non-sse4, m7).  Leaves the weighted samples, clipped to [0, pw_pixel_max],
  ; as packed words in m5.  Clobbers m6.
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_OP 2-3
  %if %0==2
      mova        m5, [r0+%2]
      punpckhwd   m6, m5, m0          ; { pixel, 1<<log2_denom } pairs, high half
      punpcklwd   m5, m0              ; same, low half
  %else
      movq        m5, [r0+%2]
      movq        m6, [r0+%3]
      punpcklwd   m5, m0
      punpcklwd   m6, m0
  %endif
      pmaddwd     m5, m3              ; pixel*weight*2 + (1<<log2_denom)*(1+offset*8)
      pmaddwd     m6, m3
      psrad       m5, m2              ; >> (log2_denom+1)
      psrad       m6, m2
  %ifidn %1, sse4
      packusdw    m5, m6              ; unsigned saturation: clamps below at 0
      ; The packed words are unsigned, so the upper clamp must be an unsigned
      ; minimum.  A signed pminsw would read 0x8000..0xffff as negative and let
      ; those values through unclipped.
      pminuw      m5, m4
  %else
      packssdw    m5, m6
      CLIPW       m5, m7, m4
  %endif
  %endmacro
  ;-----------------------------------------------------------------------------
  ; WEIGHT_FUNC_DBL cpu
  ;   %1 = instruction-set suffix (sse2 / sse4)
  ;
  ; Emits the weight functions that handle two vectors per row (offsets 0 and
  ; 16 from r0):
  ;   h264_weight_16x16_10_%1 : 16 rows, owns the shared body
  ;   h264_weight_16x8_10_%1  :  8 rows, loads its count and jumps to .prologue
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_FUNC_DBL 1
  cglobal h264_weight_16x16_10_%1
      WEIGHT_PROLOGUE 16
      WEIGHT_SETUP    %1
  .row:
      WEIGHT_OP       %1, 0
      mova          [r0], m5          ; first vector of the row
      WEIGHT_OP       %1, 16
      mova       [r0+16], m5          ; second vector of the row
      add             r0, r1          ; dst += stride
      dec             t0
      jnz .row
      REP_RET

  cglobal h264_weight_16x8_10_%1
      mov             t0, 8
      jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
  %endmacro

  INIT_XMM
  WEIGHT_FUNC_DBL sse2
  WEIGHT_FUNC_DBL sse4
  ;-----------------------------------------------------------------------------
  ; WEIGHT_FUNC_MM cpu
  ;   %1 = instruction-set suffix (sse2 / sse4)
  ;
  ; Emits the weight functions that handle one full vector per row:
  ;   h264_weight_8x16_10_%1 : 16 rows, owns the shared body
  ;   h264_weight_8x8_10_%1  :  8 rows
  ;   h264_weight_8x4_10_%1  :  4 rows
  ; The shorter variants only set t0 and jump to the 8x16 .prologue label.
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_FUNC_MM 1
  cglobal h264_weight_8x16_10_%1
      WEIGHT_PROLOGUE 16
      WEIGHT_SETUP    %1
  .row:
      WEIGHT_OP       %1, 0
      mova          [r0], m5
      add             r0, r1          ; dst += stride
      dec             t0
      jnz .row
      REP_RET

  cglobal h264_weight_8x8_10_%1
      mov             t0, 8
      jmp mangle(ff_h264_weight_8x16_10_%1.prologue)

  cglobal h264_weight_8x4_10_%1
      mov             t0, 4
      jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
  %endmacro

  INIT_XMM
  WEIGHT_FUNC_MM sse2
  WEIGHT_FUNC_MM sse4
  ;-----------------------------------------------------------------------------
  ; WEIGHT_FUNC_HALF_MM cpu
  ;   %1 = instruction-set suffix (sse2 / sse4)
  ;
  ; Emits the weight functions whose rows fill only half a vector.  Two rows
  ; (r0 and r0+r1) are packed into one vector per iteration, so t0 counts row
  ; pairs:
  ;   h264_weight_4x8_10_%1 : 4 iterations, owns the shared body
  ;   h264_weight_4x4_10_%1 : 2 iterations
  ;   h264_weight_4x2_10_%1 : 1 iteration
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_FUNC_HALF_MM 1
  cglobal h264_weight_4x8_10_%1
      WEIGHT_PROLOGUE 4
      WEIGHT_SETUP    %1
      lea             r3, [r1*2]      ; r3 = 2*stride (weight already lives in m3)
  .rowpair:
      WEIGHT_OP       %1, 0, r1
      movh          [r0], m5          ; low half  -> first row
      movhps     [r0+r1], m5          ; high half -> second row
      add             r0, r3          ; dst += 2*stride
      dec             t0
      jnz .rowpair
      REP_RET

  cglobal h264_weight_4x4_10_%1
      mov             t0, 2
      jmp mangle(ff_h264_weight_4x8_10_%1.prologue)

  cglobal h264_weight_4x2_10_%1
      mov             t0, 1
      jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
  %endmacro

  INIT_XMM
  WEIGHT_FUNC_HALF_MM sse2
  WEIGHT_FUNC_HALF_MM sse4
  ;-----------------------------------------------------------------------------
  ; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
  ;                    int weightd, int weights, int offset);
  ;
  ; Argument registers used below: r0 = dst, r1 = src, r4 = weightd,
  ; r5 = weights, r6 = offset.  stride is held in t1 and log2_denom is only
  ; ever read through r3m.
  ;-----------------------------------------------------------------------------
  ; t0 = loop counter, t1 = stride.  On x86-32 they take r2 and r3 (stride is
  ; loaded from r2m into t1, log2_denom is read via r3m); otherwise t0 is r10
  ; and t1 is r2, the register stride is named after.
  %ifndef ARCH_X86_32
      DECLARE_REG_TMP 10,2
  %else
      DECLARE_REG_TMP 2,3
  %endif
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_PROLOGUE count
  ;   %1 = number of loop iterations for the function this macro is placed in.
  ;
  ; Loads the iteration count into t0, then defines the local label .prologue.
  ; Sibling entry points set their own count in t0 and jump to .prologue, so
  ; everything after the label is shared by all block heights.
  ;
  ; After the macro: r0 = dst, r1 = src, t1 = stride (sign-extended to pointer
  ; width), r4d = weightd, r5d = weights, r6d = offset, t0 = iteration count.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_PROLOGUE 1
      mov         t0, %1
  .prologue:
      PROLOGUE 0,7,8
      movifnidn   r0, r0mp
      movifnidn   r1, r1mp
      movifnidn  t1d, r2m
      movifnidn  r4d, r4m
      movifnidn  r5d, r5m
      movifnidn  r6d, r6m
  %ifndef ARCH_X86_32
      ; stride is declared as a 32-bit int, but the row loops add the full-width
      ; t1 to both pointers.  The 64-bit calling conventions do not define the
      ; upper half of the register for an int argument, so extend it explicitly
      ; (this also keeps negative strides correct).
      movsxd      t1, t1d
  %endif
  %endmacro
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_SETUP cpu
  ;   %1 = instruction-set suffix (sse2 / sse4)
  ;
  ; Builds the loop-invariant vectors consumed by BIWEIGHT:
  ;   m3 = packed words   pixel max (pw_pixel_max)
  ;   m4 = packed dwords  { lo word: weightd, hi word: weights }
  ;   m5 = packed dwords  ((offset<<2) + 1) << log2_denom
  ;   m6 = shift count    log2_denom + 1
  ;   m7 = zero           (non-sse4 only, lower clip bound)
  ;
  ; Clobbers r4, r5 and r6.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_SETUP 1
      ; (offset<<2)+1 is odd for every offset, so no extra "| 1" is needed.
      lea         r6, [r6*4+1]
      ; weightd must be cut down to its low 16 bits before weights is merged in
      ; above it: a negative weightd is sign-extended to 32 bits and would turn
      ; the weights word into 0xffff.
      movzx      r4d, r4w
      shl         r5, 16
      or          r4, r5
      movd        m4, r4d             ; weightd | weights<<16
      movd        m5, r6d             ; (offset<<2)+1
      movd        m6, r3m             ; log2_denom
      pslld       m5, m6              ; ((offset<<2)+1) << log2_denom
      paddd       m6, [sq_1]          ; log2_denom + 1
      pshufd      m4, m4, 0
      pshufd      m5, m5, 0
      mova        m3, [pw_pixel_max]
  %ifnidn %1, sse4
      pxor        m7, m7              ; lower bound for CLIPW
  %endif
  %endmacro
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT cpu, off0 [, off1]
  ;   %1 = instruction-set suffix (sse2 / sse4)
  ;   %2 = offset from r0 (dst) and r1 (src) of the samples to process
  ;   %3 = optional second offset; when given, half a vector is loaded from
  ;        each of +%2 and +%3 instead of one full vector from +%2
  ;
  ; Expects the registers prepared by BIWEIGHT_SETUP (m3..m6 and, for
  ; non-sse4, m7).  Leaves
  ;   (dst*weightd + src*weights + m5) >> (log2_denom+1)
  ; clipped to [0, pw_pixel_max] as packed words in m0.  Clobbers m1 and m2.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT 2-3
  %if %0==2
      mova        m0, [r0+%2]
      mova        m1, [r1+%2]
      punpckhwd   m2, m0, m1          ; { dst, src } pairs, high half
      punpcklwd   m0, m1              ; same, low half
  %else
      movq        m0, [r0+%2]
      movq        m1, [r1+%2]
      punpcklwd   m0, m1
      movq        m2, [r0+%3]
      movq        m1, [r1+%3]
      punpcklwd   m2, m1
  %endif
      pmaddwd     m0, m4              ; dst*weightd + src*weights
      pmaddwd     m2, m4
      paddd       m0, m5              ; + rounding/offset term
      paddd       m2, m5
      psrad       m0, m6              ; >> (log2_denom+1)
      psrad       m2, m6
  %ifidn %1, sse4
      packusdw    m0, m2              ; unsigned saturation: clamps below at 0
      ; The packed words are unsigned, so the upper clamp must be an unsigned
      ; minimum.  A signed pminsw would read 0x8000..0xffff as negative and let
      ; those values through unclipped.
      pminuw      m0, m3
  %else
      packssdw    m0, m2
      CLIPW       m0, m7, m3
  %endif
  %endmacro
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_FUNC_DBL cpu
  ;   %1 = instruction-set suffix (sse2 / sse4)
  ;
  ; Emits the biweight functions that handle two vectors per row (offsets 0
  ; and 16), writing the result back over dst:
  ;   h264_biweight_16x16_10_%1 : 16 rows, owns the shared body
  ;   h264_biweight_16x8_10_%1  :  8 rows, loads its count and jumps to .prologue
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_FUNC_DBL 1
  cglobal h264_biweight_16x16_10_%1
      BIWEIGHT_PROLOGUE 16
      BIWEIGHT_SETUP    %1
  .row:
      BIWEIGHT          %1, 0
      mova            [r0], m0        ; first vector of the row
      BIWEIGHT          %1, 16
      mova         [r0+16], m0        ; second vector of the row
      add               r0, t1        ; dst += stride
      add               r1, t1        ; src += stride
      dec               t0
      jnz .row
      REP_RET

  cglobal h264_biweight_16x8_10_%1
      mov               t0, 8
      jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
  %endmacro

  INIT_XMM
  BIWEIGHT_FUNC_DBL sse2
  BIWEIGHT_FUNC_DBL sse4
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_FUNC cpu
  ;   %1 = instruction-set suffix (sse2 / sse4)
  ;
  ; Emits the biweight functions that handle one full vector per row:
  ;   h264_biweight_8x16_10_%1 : 16 rows, owns the shared body
  ;   h264_biweight_8x8_10_%1  :  8 rows
  ;   h264_biweight_8x4_10_%1  :  4 rows
  ; The shorter variants only set t0 and jump to the 8x16 .prologue label.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_FUNC 1
  cglobal h264_biweight_8x16_10_%1
      BIWEIGHT_PROLOGUE 16
      BIWEIGHT_SETUP    %1
  .row:
      BIWEIGHT          %1, 0
      mova            [r0], m0
      add               r0, t1        ; dst += stride
      add               r1, t1        ; src += stride
      dec               t0
      jnz .row
      REP_RET

  cglobal h264_biweight_8x8_10_%1
      mov               t0, 8
      jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)

  cglobal h264_biweight_8x4_10_%1
      mov               t0, 4
      jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
  %endmacro

  INIT_XMM
  BIWEIGHT_FUNC sse2
  BIWEIGHT_FUNC sse4
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_FUNC_HALF cpu
  ;   %1 = instruction-set suffix (sse2 / sse4)
  ;
  ; Emits the biweight functions whose rows fill only half a vector.  Two rows
  ; (offsets 0 and t1) are packed into one vector per iteration, so t0 counts
  ; row pairs:
  ;   h264_biweight_4x8_10_%1 : 4 iterations, owns the shared body
  ;   h264_biweight_4x4_10_%1 : 2 iterations
  ;   h264_biweight_4x2_10_%1 : 1 iteration
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_FUNC_HALF 1
  cglobal h264_biweight_4x8_10_%1
      BIWEIGHT_PROLOGUE 4
      BIWEIGHT_SETUP    %1
      lea               r4, [t1*2]    ; r4 = 2*stride (weights already live in m4)
  .rowpair:
      BIWEIGHT          %1, 0, t1
      movh            [r0], m0        ; low half  -> first row
      movhps       [r0+t1], m0        ; high half -> second row
      add               r0, r4        ; dst += 2*stride
      add               r1, r4        ; src += 2*stride
      dec               t0
      jnz .rowpair
      REP_RET

  cglobal h264_biweight_4x4_10_%1
      mov               t0, 2
      jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)

  cglobal h264_biweight_4x2_10_%1
      mov               t0, 1
      jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
  %endmacro

  INIT_XMM
  BIWEIGHT_FUNC_HALF sse2
  BIWEIGHT_FUNC_HALF sse4
  283. BIWEIGHT_FUNC_HALF sse4