; *****************************************************************************
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of Libav.
; *
; * Libav is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * Libav is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with Libav; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; ******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
max_pixels_10: times 16 dw ((1 << 10)-1)

SECTION .text

; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
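;
; All of the functions below implement the HEVC add_residual operation: a
; block of 16-bit residuals is added to the pixels already in dst and the
; result is stored back with saturation, i.e. roughly (scalar sketch only;
; loop bounds and the clipping range depend on block size and bit depth):
;     dst[x] = clip(dst[x] + res[x])   ; [0, 255] for 8-bit, [0, 1023] for 10-bit
;
; The 8-bit macros keep the pixels packed as unsigned bytes: the residual is
; split into its positive part (added with paddusb) and its negated negative
; part (subtracted with psubusb), emulating a signed add with unsigned
; saturation.

; ADD_RES_MMX_4_8: adds two rows of a 4-pixel-wide residual block (r1) to the
; 8-bit pixels in dst (r0, stride r2).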
%macro ADD_RES_MMX_4_8 0
    mova        m0, [r1]
    mova        m2, [r1+8]
    pxor        m1, m1
    pxor        m3, m3
    psubw       m1, m0
    psubw       m3, m2
    packuswb    m0, m2
    packuswb    m1, m3

    movd        m2, [r0]
    movd        m3, [r0+r2]
    punpckldq   m2, m3
    paddusb     m0, m2
    psubusb     m0, m1
    movd        [r0], m0
    psrlq       m0, 32
    movd        [r0+r2], m0
%endmacro

INIT_MMX mmxext
; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_4_8, 3, 3, 6
    ADD_RES_MMX_4_8
    add         r1, 16
    lea         r0, [r0+r2*2]
    ADD_RES_MMX_4_8
    RET
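
; ADD_RES_SSE_8_8: adds four rows of an 8-pixel-wide residual block (r1) to
; dst (r0, stride r2); the caller must set r3 = 3*stride.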
%macro ADD_RES_SSE_8_8 0
    pxor        m3, m3
    mova        m4, [r1]
    mova        m6, [r1+16]
    mova        m0, [r1+32]
    mova        m2, [r1+48]
    psubw       m5, m3, m4
    psubw       m7, m3, m6
    psubw       m1, m3, m0
    packuswb    m4, m0
    packuswb    m5, m1
    psubw       m3, m2
    packuswb    m6, m2
    packuswb    m7, m3

    movq        m0, [r0]
    movq        m1, [r0+r2]
    movhps      m0, [r0+r2*2]
    movhps      m1, [r0+r3]
    paddusb     m0, m4
    paddusb     m1, m6
    psubusb     m0, m5
    psubusb     m1, m7
    movq        [r0], m0
    movq        [r0+r2], m1
    movhps      [r0+2*r2], m0
    movhps      [r0+r3], m1
%endmacro
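
; ADD_RES_SSE_16_32_8: %1 = byte offset into the residual buffer, %2 and %3 =
; the two destination addresses. Each invocation adds and stores two full
; registers of pixels: 2x16 bytes with SSE, 2x32 bytes with AVX2 (the extra
; residuals are pulled in with vinserti128).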
%macro ADD_RES_SSE_16_32_8 3
    mova        xm2, [r1+%1]
    mova        xm6, [r1+%1+16]
%if cpuflag(avx2)
    vinserti128 m2, m2, [r1+%1+32], 1
    vinserti128 m6, m6, [r1+%1+48], 1
%endif
    psubw       m1, m0, m2
    psubw       m5, m0, m6
    packuswb    m2, m6
    packuswb    m1, m5

    mova        xm4, [r1+%1+mmsize*2]
    mova        xm6, [r1+%1+mmsize*2+16]
%if cpuflag(avx2)
    vinserti128 m4, m4, [r1+%1+96], 1
    vinserti128 m6, m6, [r1+%1+112], 1
%endif
    psubw       m3, m0, m4
    psubw       m5, m0, m6
    packuswb    m4, m6
    packuswb    m3, m5

    paddusb     m2, [%2]
    paddusb     m4, [%3]
    psubusb     m2, m1
    psubusb     m4, m3
    mova        [%2], m2
    mova        [%3], m4
%endmacro
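
; TRANSFORM_ADD_8: instantiates the 8-bit 8x8, 16x16 and 32x32 add_residual
; functions for the instruction set selected by the preceding INIT_XMM.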
%macro TRANSFORM_ADD_8 0
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8
    lea         r3, [r2*3]
    ADD_RES_SSE_8_8
    add         r1, 64
    lea         r0, [r0+r2*4]
    ADD_RES_SSE_8_8
    RET

; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_16_8, 3, 5, 7
    pxor        m0, m0
    lea         r3, [r2*3]
    mov         r4d, 4
.loop:
    ADD_RES_SSE_16_32_8 0, r0, r0+r2
    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
    add         r1, 128
    lea         r0, [r0+r2*4]
    dec         r4d
    jg          .loop
    RET

; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor        m0, m0
    mov         r4d, 16
.loop:
    ADD_RES_SSE_16_32_8 0, r0, r0+16
    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16
    add         r1, 128
    lea         r0, [r0+r2*2]
    dec         r4d
    jg          .loop
    RET
%endmacro

INIT_XMM sse2
TRANSFORM_ADD_8
INIT_XMM avx
TRANSFORM_ADD_8

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    pxor        m0, m0
    lea         r3, [r2*3]
    mov         r4d, 8
.loop:
    ADD_RES_SSE_16_32_8 0, r0, r0+r2
    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
    add         r1, 256
    lea         r0, [r0+r2*4]
    dec         r4d
    jg          .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL
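
; The 10-bit macros work directly on 16-bit pixels: the residuals are added
; with paddw and the result is clamped with CLIPW against a zero register and
; max_pixels_10, i.e. roughly dst[x] = clip(dst[x] + res[x], 0, 1023).

; ADD_RES_SSE_8_10: %1 = dst, %2 = stride, %3 = 3*stride, %4 = residuals;
; adds four rows of 8 pixels per invocation.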
%macro ADD_RES_SSE_8_10 4
    mova        m0, [%4]
    mova        m1, [%4+16]
    mova        m2, [%4+32]
    mova        m3, [%4+48]
    paddw       m0, [%1+0]
    paddw       m1, [%1+%2]
    paddw       m2, [%1+%2*2]
    paddw       m3, [%1+%3]
    CLIPW       m0, m4, m5
    CLIPW       m1, m4, m5
    CLIPW       m2, m4, m5
    CLIPW       m3, m4, m5
    mova        [%1+0], m0
    mova        [%1+%2], m1
    mova        [%1+%2*2], m2
    mova        [%1+%3], m3
%endmacro
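
; ADD_RES_MMX_4_10: %1 = dst, %2 = stride, %3 = residuals; adds two rows of
; 4 pixels per invocation.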
%macro ADD_RES_MMX_4_10 3
    mova        m0, [%1+0]
    mova        m1, [%1+%2]
    paddw       m0, [%3]
    paddw       m1, [%3+8]
    CLIPW       m0, m2, m3
    CLIPW       m1, m2, m3
    mova        [%1+0], m0
    mova        [%1+%2], m1
%endmacro
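
; ADD_RES_SSE_16_10: %1 = dst, %2 = stride, %3 = residuals; adds two rows of
; 16 pixels per invocation.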
%macro ADD_RES_SSE_16_10 3
    mova        m0, [%3]
    mova        m1, [%3+16]
    mova        m2, [%3+32]
    mova        m3, [%3+48]
    paddw       m0, [%1]
    paddw       m1, [%1+16]
    paddw       m2, [%1+%2]
    paddw       m3, [%1+%2+16]
    CLIPW       m0, m4, m5
    CLIPW       m1, m4, m5
    CLIPW       m2, m4, m5
    CLIPW       m3, m4, m5
    mova        [%1], m0
    mova        [%1+16], m1
    mova        [%1+%2], m2
    mova        [%1+%2+16], m3
%endmacro
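
; ADD_RES_SSE_32_10: %1 = dst, %2 = residuals; adds one row of 32 pixels per
; invocation.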
%macro ADD_RES_SSE_32_10 2
    mova        m0, [%2]
    mova        m1, [%2+16]
    mova        m2, [%2+32]
    mova        m3, [%2+48]
    paddw       m0, [%1]
    paddw       m1, [%1+16]
    paddw       m2, [%1+32]
    paddw       m3, [%1+48]
    CLIPW       m0, m4, m5
    CLIPW       m1, m4, m5
    CLIPW       m2, m4, m5
    CLIPW       m3, m4, m5
    mova        [%1], m0
    mova        [%1+16], m1
    mova        [%1+32], m2
    mova        [%1+48], m3
%endmacro
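
; ADD_RES_AVX2_16_10: %1 = dst, %2 = stride, %3 = 3*stride, %4 = residuals;
; adds four rows of 16 pixels per invocation.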
%macro ADD_RES_AVX2_16_10 4
    mova        m0, [%4]
    mova        m1, [%4+32]
    mova        m2, [%4+64]
    mova        m3, [%4+96]
    paddw       m0, [%1+0]
    paddw       m1, [%1+%2]
    paddw       m2, [%1+%2*2]
    paddw       m3, [%1+%3]
    CLIPW       m0, m4, m5
    CLIPW       m1, m4, m5
    CLIPW       m2, m4, m5
    CLIPW       m3, m4, m5
    mova        [%1+0], m0
    mova        [%1+%2], m1
    mova        [%1+%2*2], m2
    mova        [%1+%3], m3
%endmacro
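
; ADD_RES_AVX2_32_10: %1 = dst, %2 = stride, %3 = residuals; adds two rows of
; 32 pixels per invocation.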
%macro ADD_RES_AVX2_32_10 3
    mova        m0, [%3]
    mova        m1, [%3+32]
    mova        m2, [%3+64]
    mova        m3, [%3+96]
    paddw       m0, [%1]
    paddw       m1, [%1+32]
    paddw       m2, [%1+%2]
    paddw       m3, [%1+%2+32]
    CLIPW       m0, m4, m5
    CLIPW       m1, m4, m5
    CLIPW       m2, m4, m5
    CLIPW       m3, m4, m5
    mova        [%1], m0
    mova        [%1+32], m1
    mova        [%1+%2], m2
    mova        [%1+%2+32], m3
%endmacro

; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
INIT_MMX mmxext
cglobal hevc_add_residual_4_10, 3, 3, 6
    pxor        m2, m2
    mova        m3, [max_pixels_10]
    ADD_RES_MMX_4_10 r0, r2, r1
    add         r1, 16
    lea         r0, [r0+2*r2]
    ADD_RES_MMX_4_10 r0, r2, r1
    RET

INIT_XMM sse2
cglobal hevc_add_residual_8_10, 3, 4, 6
    pxor        m4, m4
    mova        m5, [max_pixels_10]
    lea         r3, [r2*3]
    ADD_RES_SSE_8_10 r0, r2, r3, r1
    lea         r0, [r0+r2*4]
    add         r1, 64
    ADD_RES_SSE_8_10 r0, r2, r3, r1
    RET

cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor        m4, m4
    mova        m5, [max_pixels_10]
    mov         r4d, 8
.loop:
    ADD_RES_SSE_16_10 r0, r2, r1
    lea         r0, [r0+r2*2]
    add         r1, 64
    dec         r4d
    jg          .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor        m4, m4
    mova        m5, [max_pixels_10]
    mov         r4d, 32
.loop:
    ADD_RES_SSE_32_10 r0, r1
    lea         r0, [r0+r2]
    add         r1, 64
    dec         r4d
    jg          .loop
    RET

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
cglobal hevc_add_residual_16_10, 3, 5, 6
    pxor        m4, m4
    mova        m5, [max_pixels_10]
    lea         r3, [r2*3]
    mov         r4d, 4
.loop:
    ADD_RES_AVX2_16_10 r0, r2, r3, r1
    lea         r0, [r0+r2*4]
    add         r1, 128
    dec         r4d
    jg          .loop
    RET

cglobal hevc_add_residual_32_10, 3, 5, 6
    pxor        m4, m4
    mova        m5, [max_pixels_10]
    mov         r4d, 16
.loop:
    ADD_RES_AVX2_32_10 r0, r2, r1
    lea         r0, [r0+r2*2]
    add         r1, 128
    dec         r4d
    jg          .loop
    RET
%endif ;HAVE_AVX2_EXTERNAL