You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1153 lines
63KB

  1. /*
  2. * Copyright (c) 2013 Seppo Tomperi
  3. * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "config.h"
  22. #include "libavutil/cpu.h"
  23. #include "libavutil/mem_internal.h"
  24. #include "libavutil/x86/asm.h"
  25. #include "libavutil/x86/cpu.h"
  26. #include "libavcodec/get_bits.h" /* required for hevcdsp.h GetBitContext */
  27. #include "libavcodec/hevcdsp.h"
  28. #include "libavcodec/x86/hevcdsp.h"
  29. #define LFC_FUNC(DIR, DEPTH, OPT) \
  30. void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q);
  31. #define LFL_FUNC(DIR, DEPTH, OPT) \
  32. void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);
  33. #define LFC_FUNCS(type, depth, opt) \
  34. LFC_FUNC(h, depth, opt) \
  35. LFC_FUNC(v, depth, opt)
  36. #define LFL_FUNCS(type, depth, opt) \
  37. LFL_FUNC(h, depth, opt) \
  38. LFL_FUNC(v, depth, opt)
  39. LFC_FUNCS(uint8_t, 8, sse2)
  40. LFC_FUNCS(uint8_t, 10, sse2)
  41. LFC_FUNCS(uint8_t, 12, sse2)
  42. LFC_FUNCS(uint8_t, 8, avx)
  43. LFC_FUNCS(uint8_t, 10, avx)
  44. LFC_FUNCS(uint8_t, 12, avx)
  45. LFL_FUNCS(uint8_t, 8, sse2)
  46. LFL_FUNCS(uint8_t, 10, sse2)
  47. LFL_FUNCS(uint8_t, 12, sse2)
  48. LFL_FUNCS(uint8_t, 8, ssse3)
  49. LFL_FUNCS(uint8_t, 10, ssse3)
  50. LFL_FUNCS(uint8_t, 12, ssse3)
  51. LFL_FUNCS(uint8_t, 8, avx)
  52. LFL_FUNCS(uint8_t, 10, avx)
  53. LFL_FUNCS(uint8_t, 12, avx)
  54. #define IDCT_DC_FUNCS(W, opt) \
  55. void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
  56. void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
  57. void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
  58. IDCT_DC_FUNCS(4x4, mmxext);
  59. IDCT_DC_FUNCS(8x8, mmxext);
  60. IDCT_DC_FUNCS(8x8, sse2);
  61. IDCT_DC_FUNCS(16x16, sse2);
  62. IDCT_DC_FUNCS(32x32, sse2);
  63. IDCT_DC_FUNCS(16x16, avx2);
  64. IDCT_DC_FUNCS(32x32, avx2);
  65. #define IDCT_FUNCS(opt) \
  66. void ff_hevc_idct_4x4_8_ ## opt(int16_t *coeffs, int col_limit); \
  67. void ff_hevc_idct_4x4_10_ ## opt(int16_t *coeffs, int col_limit); \
  68. void ff_hevc_idct_8x8_8_ ## opt(int16_t *coeffs, int col_limit); \
  69. void ff_hevc_idct_8x8_10_ ## opt(int16_t *coeffs, int col_limit); \
  70. void ff_hevc_idct_16x16_8_ ## opt(int16_t *coeffs, int col_limit); \
  71. void ff_hevc_idct_16x16_10_ ## opt(int16_t *coeffs, int col_limit); \
  72. void ff_hevc_idct_32x32_8_ ## opt(int16_t *coeffs, int col_limit); \
  73. void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
  74. IDCT_FUNCS(sse2)
  75. IDCT_FUNCS(avx)
  76. #define mc_rep_func(name, bitd, step, W, opt) \
  77. void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, \
  78. uint8_t *_src, ptrdiff_t _srcstride, int height, \
  79. intptr_t mx, intptr_t my, int width) \
  80. { \
  81. int i; \
  82. uint8_t *src; \
  83. int16_t *dst; \
  84. for (i = 0; i < W; i += step) { \
  85. src = _src + (i * ((bitd + 7) / 8)); \
  86. dst = _dst + i; \
  87. ff_hevc_put_hevc_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
  88. } \
  89. }
  90. #define mc_rep_uni_func(name, bitd, step, W, opt) \
  91. void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
  92. uint8_t *_src, ptrdiff_t _srcstride, int height, \
  93. intptr_t mx, intptr_t my, int width) \
  94. { \
  95. int i; \
  96. uint8_t *src; \
  97. uint8_t *dst; \
  98. for (i = 0; i < W; i += step) { \
  99. src = _src + (i * ((bitd + 7) / 8)); \
  100. dst = _dst + (i * ((bitd + 7) / 8)); \
  101. ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, \
  102. height, mx, my, width); \
  103. } \
  104. }
  105. #define mc_rep_bi_func(name, bitd, step, W, opt) \
  106. void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, uint8_t *_src, \
  107. ptrdiff_t _srcstride, int16_t* _src2, \
  108. int height, intptr_t mx, intptr_t my, int width) \
  109. { \
  110. int i; \
  111. uint8_t *src; \
  112. uint8_t *dst; \
  113. int16_t *src2; \
  114. for (i = 0; i < W ; i += step) { \
  115. src = _src + (i * ((bitd + 7) / 8)); \
  116. dst = _dst + (i * ((bitd + 7) / 8)); \
  117. src2 = _src2 + i; \
  118. ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, \
  119. height, mx, my, width); \
  120. } \
  121. }
  122. #define mc_rep_funcs(name, bitd, step, W, opt) \
  123. mc_rep_func(name, bitd, step, W, opt) \
  124. mc_rep_uni_func(name, bitd, step, W, opt) \
  125. mc_rep_bi_func(name, bitd, step, W, opt)
  126. #define mc_rep_func2(name, bitd, step1, step2, W, opt) \
  127. void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst, \
  128. uint8_t *src, ptrdiff_t _srcstride, int height, \
  129. intptr_t mx, intptr_t my, int width) \
  130. { \
  131. ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
  132. ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)), \
  133. _srcstride, height, mx, my, width); \
  134. }
  135. #define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
  136. void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \
  137. uint8_t *src, ptrdiff_t _srcstride, int height, \
  138. intptr_t mx, intptr_t my, int width) \
  139. { \
  140. ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\
  141. ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
  142. src + (step1 * ((bitd + 7) / 8)), _srcstride, \
  143. height, mx, my, width); \
  144. }
  145. #define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
  146. void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
  147. ptrdiff_t _srcstride, int16_t* src2, \
  148. int height, intptr_t mx, intptr_t my, int width) \
  149. { \
  150. ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
  151. ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
  152. src + (step1 * ((bitd + 7) / 8)), _srcstride, \
  153. src2 + step1, height, mx, my, width); \
  154. }
  155. #define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
  156. mc_rep_func2(name, bitd, step1, step2, W, opt) \
  157. mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
  158. mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
  159. #if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
  160. #define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
  161. void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
  162. int height, intptr_t mx, intptr_t my, int width) \
  163. \
  164. { \
  165. ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width); \
  166. ff_hevc_put_hevc_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
  167. }
  168. #define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
  169. void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
  170. ptrdiff_t _srcstride, int16_t *src2, \
  171. int height, intptr_t mx, intptr_t my, int width) \
  172. { \
  173. ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2, \
  174. height, mx, my, width); \
  175. ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2,\
  176. height, mx, my, width); \
  177. }
  178. #define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
  179. void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \
  180. uint8_t *src, ptrdiff_t _srcstride, int height, \
  181. intptr_t mx, intptr_t my, int width) \
  182. { \
  183. ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, \
  184. height, mx, my, width); \
  185. ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, \
  186. height, mx, my, width); \
  187. }
  188. #define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4) \
  189. mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
  190. mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
  191. mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
  192. #define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
  193. void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, ptrdiff_t _srcstride, \
  194. int height, intptr_t mx, intptr_t my, int width) \
  195. \
  196. { \
  197. ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width); \
  198. ff_hevc_put_hevc_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width); \
  199. }
  200. #define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
  201. void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
  202. ptrdiff_t _srcstride, int16_t* src2, \
  203. int height, intptr_t mx, intptr_t my, int width) \
  204. { \
  205. ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
  206. src2, height, mx, my, width); \
  207. ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
  208. src2+width2, height, mx, my, width); \
  209. }
  210. #define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
  211. void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \
  212. uint8_t *src, ptrdiff_t _srcstride, int height, \
  213. intptr_t mx, intptr_t my, int width) \
  214. { \
  215. ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
  216. height, mx, my, width); \
  217. ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
  218. height, mx, my, width); \
  219. }
  220. #define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2) \
  221. mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
  222. mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
  223. mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
  224. #if HAVE_AVX2_EXTERNAL
  225. mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4)
  226. mc_rep_mixs_8(epel_hv, 48, 32, 16, avx2, sse4)
  227. mc_rep_mixs_8(epel_h , 48, 32, 16, avx2, sse4)
  228. mc_rep_mixs_8(epel_v , 48, 32, 16, avx2, sse4)
  229. mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32)
  230. mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32)
  231. mc_rep_mixs_10(epel_hv, 24, 16, 8, avx2, sse4, 32)
  232. mc_rep_mixs_10(epel_h , 24, 16, 8, avx2, sse4, 32)
  233. mc_rep_mixs_10(epel_v , 24, 16, 8, avx2, sse4, 32)
  234. mc_rep_mixs_10(qpel_h , 24, 16, 8, avx2, sse4, 32)
  235. mc_rep_mixs_10(qpel_v , 24, 16, 8, avx2, sse4, 32)
  236. mc_rep_mixs_10(qpel_hv, 24, 16, 8, avx2, sse4, 32)
  237. mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2)//used for 10bit
  238. mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2) //used for 10bit
  239. mc_rep_funcs(pel_pixels, 8, 32, 64, avx2)
  240. mc_rep_func(pel_pixels, 10, 16, 32, avx2)
  241. mc_rep_func(pel_pixels, 10, 16, 48, avx2)
  242. mc_rep_func(pel_pixels, 10, 32, 64, avx2)
  243. mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2)
  244. mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2)
  245. mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2)
  246. mc_rep_funcs(epel_h, 8, 32, 64, avx2)
  247. mc_rep_funcs(epel_v, 8, 32, 64, avx2)
  248. mc_rep_funcs(epel_h, 10, 16, 32, avx2)
  249. mc_rep_funcs(epel_h, 10, 16, 48, avx2)
  250. mc_rep_funcs(epel_h, 10, 32, 64, avx2)
  251. mc_rep_funcs(epel_v, 10, 16, 32, avx2)
  252. mc_rep_funcs(epel_v, 10, 16, 48, avx2)
  253. mc_rep_funcs(epel_v, 10, 32, 64, avx2)
  254. mc_rep_funcs(epel_hv, 8, 32, 64, avx2)
  255. mc_rep_funcs(epel_hv, 10, 16, 32, avx2)
  256. mc_rep_funcs(epel_hv, 10, 16, 48, avx2)
  257. mc_rep_funcs(epel_hv, 10, 32, 64, avx2)
  258. mc_rep_funcs(qpel_h, 8, 32, 64, avx2)
  259. mc_rep_mixs_8(qpel_h , 48, 32, 16, avx2, sse4)
  260. mc_rep_funcs(qpel_v, 8, 32, 64, avx2)
  261. mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4)
  262. mc_rep_funcs(qpel_h, 10, 16, 32, avx2)
  263. mc_rep_funcs(qpel_h, 10, 16, 48, avx2)
  264. mc_rep_funcs(qpel_h, 10, 32, 64, avx2)
  265. mc_rep_funcs(qpel_v, 10, 16, 32, avx2)
  266. mc_rep_funcs(qpel_v, 10, 16, 48, avx2)
  267. mc_rep_funcs(qpel_v, 10, 32, 64, avx2)
  268. mc_rep_funcs(qpel_hv, 10, 16, 32, avx2)
  269. mc_rep_funcs(qpel_hv, 10, 16, 48, avx2)
  270. mc_rep_funcs(qpel_hv, 10, 32, 64, avx2)
  271. #endif //AVX2
  272. mc_rep_funcs(pel_pixels, 8, 16, 64, sse4)
  273. mc_rep_funcs(pel_pixels, 8, 16, 48, sse4)
  274. mc_rep_funcs(pel_pixels, 8, 16, 32, sse4)
  275. mc_rep_funcs(pel_pixels, 8, 8, 24, sse4)
  276. mc_rep_funcs(pel_pixels,10, 8, 64, sse4)
  277. mc_rep_funcs(pel_pixels,10, 8, 48, sse4)
  278. mc_rep_funcs(pel_pixels,10, 8, 32, sse4)
  279. mc_rep_funcs(pel_pixels,10, 8, 24, sse4)
  280. mc_rep_funcs(pel_pixels,10, 8, 16, sse4)
  281. mc_rep_funcs(pel_pixels,10, 4, 12, sse4)
  282. mc_rep_funcs(pel_pixels,12, 8, 64, sse4)
  283. mc_rep_funcs(pel_pixels,12, 8, 48, sse4)
  284. mc_rep_funcs(pel_pixels,12, 8, 32, sse4)
  285. mc_rep_funcs(pel_pixels,12, 8, 24, sse4)
  286. mc_rep_funcs(pel_pixels,12, 8, 16, sse4)
  287. mc_rep_funcs(pel_pixels,12, 4, 12, sse4)
  288. mc_rep_funcs(epel_h, 8, 16, 64, sse4)
  289. mc_rep_funcs(epel_h, 8, 16, 48, sse4)
  290. mc_rep_funcs(epel_h, 8, 16, 32, sse4)
  291. mc_rep_funcs(epel_h, 8, 8, 24, sse4)
  292. mc_rep_funcs(epel_h,10, 8, 64, sse4)
  293. mc_rep_funcs(epel_h,10, 8, 48, sse4)
  294. mc_rep_funcs(epel_h,10, 8, 32, sse4)
  295. mc_rep_funcs(epel_h,10, 8, 24, sse4)
  296. mc_rep_funcs(epel_h,10, 8, 16, sse4)
  297. mc_rep_funcs(epel_h,10, 4, 12, sse4)
  298. mc_rep_funcs(epel_h,12, 8, 64, sse4)
  299. mc_rep_funcs(epel_h,12, 8, 48, sse4)
  300. mc_rep_funcs(epel_h,12, 8, 32, sse4)
  301. mc_rep_funcs(epel_h,12, 8, 24, sse4)
  302. mc_rep_funcs(epel_h,12, 8, 16, sse4)
  303. mc_rep_funcs(epel_h,12, 4, 12, sse4)
  304. mc_rep_funcs(epel_v, 8, 16, 64, sse4)
  305. mc_rep_funcs(epel_v, 8, 16, 48, sse4)
  306. mc_rep_funcs(epel_v, 8, 16, 32, sse4)
  307. mc_rep_funcs(epel_v, 8, 8, 24, sse4)
  308. mc_rep_funcs(epel_v,10, 8, 64, sse4)
  309. mc_rep_funcs(epel_v,10, 8, 48, sse4)
  310. mc_rep_funcs(epel_v,10, 8, 32, sse4)
  311. mc_rep_funcs(epel_v,10, 8, 24, sse4)
  312. mc_rep_funcs(epel_v,10, 8, 16, sse4)
  313. mc_rep_funcs(epel_v,10, 4, 12, sse4)
  314. mc_rep_funcs(epel_v,12, 8, 64, sse4)
  315. mc_rep_funcs(epel_v,12, 8, 48, sse4)
  316. mc_rep_funcs(epel_v,12, 8, 32, sse4)
  317. mc_rep_funcs(epel_v,12, 8, 24, sse4)
  318. mc_rep_funcs(epel_v,12, 8, 16, sse4)
  319. mc_rep_funcs(epel_v,12, 4, 12, sse4)
  320. mc_rep_funcs(epel_hv, 8, 16, 64, sse4)
  321. mc_rep_funcs(epel_hv, 8, 16, 48, sse4)
  322. mc_rep_funcs(epel_hv, 8, 16, 32, sse4)
  323. mc_rep_funcs(epel_hv, 8, 8, 24, sse4)
  324. mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4)
  325. mc_rep_funcs(epel_hv,10, 8, 64, sse4)
  326. mc_rep_funcs(epel_hv,10, 8, 48, sse4)
  327. mc_rep_funcs(epel_hv,10, 8, 32, sse4)
  328. mc_rep_funcs(epel_hv,10, 8, 24, sse4)
  329. mc_rep_funcs(epel_hv,10, 8, 16, sse4)
  330. mc_rep_funcs(epel_hv,10, 4, 12, sse4)
  331. mc_rep_funcs(epel_hv,12, 8, 64, sse4)
  332. mc_rep_funcs(epel_hv,12, 8, 48, sse4)
  333. mc_rep_funcs(epel_hv,12, 8, 32, sse4)
  334. mc_rep_funcs(epel_hv,12, 8, 24, sse4)
  335. mc_rep_funcs(epel_hv,12, 8, 16, sse4)
  336. mc_rep_funcs(epel_hv,12, 4, 12, sse4)
  337. mc_rep_funcs(qpel_h, 8, 16, 64, sse4)
  338. mc_rep_funcs(qpel_h, 8, 16, 48, sse4)
  339. mc_rep_funcs(qpel_h, 8, 16, 32, sse4)
  340. mc_rep_funcs(qpel_h, 8, 8, 24, sse4)
  341. mc_rep_funcs(qpel_h,10, 8, 64, sse4)
  342. mc_rep_funcs(qpel_h,10, 8, 48, sse4)
  343. mc_rep_funcs(qpel_h,10, 8, 32, sse4)
  344. mc_rep_funcs(qpel_h,10, 8, 24, sse4)
  345. mc_rep_funcs(qpel_h,10, 8, 16, sse4)
  346. mc_rep_funcs(qpel_h,10, 4, 12, sse4)
  347. mc_rep_funcs(qpel_h,12, 8, 64, sse4)
  348. mc_rep_funcs(qpel_h,12, 8, 48, sse4)
  349. mc_rep_funcs(qpel_h,12, 8, 32, sse4)
  350. mc_rep_funcs(qpel_h,12, 8, 24, sse4)
  351. mc_rep_funcs(qpel_h,12, 8, 16, sse4)
  352. mc_rep_funcs(qpel_h,12, 4, 12, sse4)
  353. mc_rep_funcs(qpel_v, 8, 16, 64, sse4)
  354. mc_rep_funcs(qpel_v, 8, 16, 48, sse4)
  355. mc_rep_funcs(qpel_v, 8, 16, 32, sse4)
  356. mc_rep_funcs(qpel_v, 8, 8, 24, sse4)
  357. mc_rep_funcs(qpel_v,10, 8, 64, sse4)
  358. mc_rep_funcs(qpel_v,10, 8, 48, sse4)
  359. mc_rep_funcs(qpel_v,10, 8, 32, sse4)
  360. mc_rep_funcs(qpel_v,10, 8, 24, sse4)
  361. mc_rep_funcs(qpel_v,10, 8, 16, sse4)
  362. mc_rep_funcs(qpel_v,10, 4, 12, sse4)
  363. mc_rep_funcs(qpel_v,12, 8, 64, sse4)
  364. mc_rep_funcs(qpel_v,12, 8, 48, sse4)
  365. mc_rep_funcs(qpel_v,12, 8, 32, sse4)
  366. mc_rep_funcs(qpel_v,12, 8, 24, sse4)
  367. mc_rep_funcs(qpel_v,12, 8, 16, sse4)
  368. mc_rep_funcs(qpel_v,12, 4, 12, sse4)
  369. mc_rep_funcs(qpel_hv, 8, 8, 64, sse4)
  370. mc_rep_funcs(qpel_hv, 8, 8, 48, sse4)
  371. mc_rep_funcs(qpel_hv, 8, 8, 32, sse4)
  372. mc_rep_funcs(qpel_hv, 8, 8, 24, sse4)
  373. mc_rep_funcs(qpel_hv, 8, 8, 16, sse4)
  374. mc_rep_funcs2(qpel_hv,8, 8, 4, 12, sse4)
  375. mc_rep_funcs(qpel_hv,10, 8, 64, sse4)
  376. mc_rep_funcs(qpel_hv,10, 8, 48, sse4)
  377. mc_rep_funcs(qpel_hv,10, 8, 32, sse4)
  378. mc_rep_funcs(qpel_hv,10, 8, 24, sse4)
  379. mc_rep_funcs(qpel_hv,10, 8, 16, sse4)
  380. mc_rep_funcs(qpel_hv,10, 4, 12, sse4)
  381. mc_rep_funcs(qpel_hv,12, 8, 64, sse4)
  382. mc_rep_funcs(qpel_hv,12, 8, 48, sse4)
  383. mc_rep_funcs(qpel_hv,12, 8, 32, sse4)
  384. mc_rep_funcs(qpel_hv,12, 8, 24, sse4)
  385. mc_rep_funcs(qpel_hv,12, 8, 16, sse4)
  386. mc_rep_funcs(qpel_hv,12, 4, 12, sse4)
  387. #define mc_rep_uni_w(bitd, step, W, opt) \
  388. void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
  389. int height, int denom, int _wx, int _ox) \
  390. { \
  391. int i; \
  392. int16_t *src; \
  393. uint8_t *dst; \
  394. for (i = 0; i < W; i += step) { \
  395. src= _src + i; \
  396. dst= _dst + (i * ((bitd + 7) / 8)); \
  397. ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, \
  398. height, denom, _wx, _ox); \
  399. } \
  400. }
  401. mc_rep_uni_w(8, 6, 12, sse4)
  402. mc_rep_uni_w(8, 8, 16, sse4)
  403. mc_rep_uni_w(8, 8, 24, sse4)
  404. mc_rep_uni_w(8, 8, 32, sse4)
  405. mc_rep_uni_w(8, 8, 48, sse4)
  406. mc_rep_uni_w(8, 8, 64, sse4)
  407. mc_rep_uni_w(10, 6, 12, sse4)
  408. mc_rep_uni_w(10, 8, 16, sse4)
  409. mc_rep_uni_w(10, 8, 24, sse4)
  410. mc_rep_uni_w(10, 8, 32, sse4)
  411. mc_rep_uni_w(10, 8, 48, sse4)
  412. mc_rep_uni_w(10, 8, 64, sse4)
  413. mc_rep_uni_w(12, 6, 12, sse4)
  414. mc_rep_uni_w(12, 8, 16, sse4)
  415. mc_rep_uni_w(12, 8, 24, sse4)
  416. mc_rep_uni_w(12, 8, 32, sse4)
  417. mc_rep_uni_w(12, 8, 48, sse4)
  418. mc_rep_uni_w(12, 8, 64, sse4)
  419. #define mc_rep_bi_w(bitd, step, W, opt) \
  420. void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \
  421. int16_t *_src2, int height, \
  422. int denom, int _wx0, int _wx1, int _ox0, int _ox1) \
  423. { \
  424. int i; \
  425. int16_t *src; \
  426. int16_t *src2; \
  427. uint8_t *dst; \
  428. for (i = 0; i < W; i += step) { \
  429. src = _src + i; \
  430. src2 = _src2 + i; \
  431. dst = _dst + (i * ((bitd + 7) / 8)); \
  432. ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, src2, \
  433. height, denom, _wx0, _wx1, _ox0, _ox1); \
  434. } \
  435. }
  436. mc_rep_bi_w(8, 6, 12, sse4)
  437. mc_rep_bi_w(8, 8, 16, sse4)
  438. mc_rep_bi_w(8, 8, 24, sse4)
  439. mc_rep_bi_w(8, 8, 32, sse4)
  440. mc_rep_bi_w(8, 8, 48, sse4)
  441. mc_rep_bi_w(8, 8, 64, sse4)
  442. mc_rep_bi_w(10, 6, 12, sse4)
  443. mc_rep_bi_w(10, 8, 16, sse4)
  444. mc_rep_bi_w(10, 8, 24, sse4)
  445. mc_rep_bi_w(10, 8, 32, sse4)
  446. mc_rep_bi_w(10, 8, 48, sse4)
  447. mc_rep_bi_w(10, 8, 64, sse4)
  448. mc_rep_bi_w(12, 6, 12, sse4)
  449. mc_rep_bi_w(12, 8, 16, sse4)
  450. mc_rep_bi_w(12, 8, 24, sse4)
  451. mc_rep_bi_w(12, 8, 32, sse4)
  452. mc_rep_bi_w(12, 8, 48, sse4)
  453. mc_rep_bi_w(12, 8, 64, sse4)
  454. #define mc_uni_w_func(name, bitd, W, opt) \
  455. void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
  456. uint8_t *_src, ptrdiff_t _srcstride, \
  457. int height, int denom, \
  458. int _wx, int _ox, \
  459. intptr_t mx, intptr_t my, int width) \
  460. { \
  461. LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
  462. ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
  463. ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox);\
  464. }
  465. #define mc_uni_w_funcs(name, bitd, opt) \
  466. mc_uni_w_func(name, bitd, 4, opt) \
  467. mc_uni_w_func(name, bitd, 8, opt) \
  468. mc_uni_w_func(name, bitd, 12, opt) \
  469. mc_uni_w_func(name, bitd, 16, opt) \
  470. mc_uni_w_func(name, bitd, 24, opt) \
  471. mc_uni_w_func(name, bitd, 32, opt) \
  472. mc_uni_w_func(name, bitd, 48, opt) \
  473. mc_uni_w_func(name, bitd, 64, opt)
  474. mc_uni_w_funcs(pel_pixels, 8, sse4)
  475. mc_uni_w_func(pel_pixels, 8, 6, sse4)
  476. mc_uni_w_funcs(epel_h, 8, sse4)
  477. mc_uni_w_func(epel_h, 8, 6, sse4)
  478. mc_uni_w_funcs(epel_v, 8, sse4)
  479. mc_uni_w_func(epel_v, 8, 6, sse4)
  480. mc_uni_w_funcs(epel_hv, 8, sse4)
  481. mc_uni_w_func(epel_hv, 8, 6, sse4)
  482. mc_uni_w_funcs(qpel_h, 8, sse4)
  483. mc_uni_w_funcs(qpel_v, 8, sse4)
  484. mc_uni_w_funcs(qpel_hv, 8, sse4)
  485. mc_uni_w_funcs(pel_pixels, 10, sse4)
  486. mc_uni_w_func(pel_pixels, 10, 6, sse4)
  487. mc_uni_w_funcs(epel_h, 10, sse4)
  488. mc_uni_w_func(epel_h, 10, 6, sse4)
  489. mc_uni_w_funcs(epel_v, 10, sse4)
  490. mc_uni_w_func(epel_v, 10, 6, sse4)
  491. mc_uni_w_funcs(epel_hv, 10, sse4)
  492. mc_uni_w_func(epel_hv, 10, 6, sse4)
  493. mc_uni_w_funcs(qpel_h, 10, sse4)
  494. mc_uni_w_funcs(qpel_v, 10, sse4)
  495. mc_uni_w_funcs(qpel_hv, 10, sse4)
  496. mc_uni_w_funcs(pel_pixels, 12, sse4)
  497. mc_uni_w_func(pel_pixels, 12, 6, sse4)
  498. mc_uni_w_funcs(epel_h, 12, sse4)
  499. mc_uni_w_func(epel_h, 12, 6, sse4)
  500. mc_uni_w_funcs(epel_v, 12, sse4)
  501. mc_uni_w_func(epel_v, 12, 6, sse4)
  502. mc_uni_w_funcs(epel_hv, 12, sse4)
  503. mc_uni_w_func(epel_hv, 12, 6, sse4)
  504. mc_uni_w_funcs(qpel_h, 12, sse4)
  505. mc_uni_w_funcs(qpel_v, 12, sse4)
  506. mc_uni_w_funcs(qpel_hv, 12, sse4)
  507. #define mc_bi_w_func(name, bitd, W, opt) \
  508. void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
  509. uint8_t *_src, ptrdiff_t _srcstride, \
  510. int16_t *_src2, \
  511. int height, int denom, \
  512. int _wx0, int _wx1, int _ox0, int _ox1, \
  513. intptr_t mx, intptr_t my, int width) \
  514. { \
  515. LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
  516. ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
  517. ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2, \
  518. height, denom, _wx0, _wx1, _ox0, _ox1); \
  519. }
  520. #define mc_bi_w_funcs(name, bitd, opt) \
  521. mc_bi_w_func(name, bitd, 4, opt) \
  522. mc_bi_w_func(name, bitd, 8, opt) \
  523. mc_bi_w_func(name, bitd, 12, opt) \
  524. mc_bi_w_func(name, bitd, 16, opt) \
  525. mc_bi_w_func(name, bitd, 24, opt) \
  526. mc_bi_w_func(name, bitd, 32, opt) \
  527. mc_bi_w_func(name, bitd, 48, opt) \
  528. mc_bi_w_func(name, bitd, 64, opt)
  529. mc_bi_w_funcs(pel_pixels, 8, sse4)
  530. mc_bi_w_func(pel_pixels, 8, 6, sse4)
  531. mc_bi_w_funcs(epel_h, 8, sse4)
  532. mc_bi_w_func(epel_h, 8, 6, sse4)
  533. mc_bi_w_funcs(epel_v, 8, sse4)
  534. mc_bi_w_func(epel_v, 8, 6, sse4)
  535. mc_bi_w_funcs(epel_hv, 8, sse4)
  536. mc_bi_w_func(epel_hv, 8, 6, sse4)
  537. mc_bi_w_funcs(qpel_h, 8, sse4)
  538. mc_bi_w_funcs(qpel_v, 8, sse4)
  539. mc_bi_w_funcs(qpel_hv, 8, sse4)
  540. mc_bi_w_funcs(pel_pixels, 10, sse4)
  541. mc_bi_w_func(pel_pixels, 10, 6, sse4)
  542. mc_bi_w_funcs(epel_h, 10, sse4)
  543. mc_bi_w_func(epel_h, 10, 6, sse4)
  544. mc_bi_w_funcs(epel_v, 10, sse4)
  545. mc_bi_w_func(epel_v, 10, 6, sse4)
  546. mc_bi_w_funcs(epel_hv, 10, sse4)
  547. mc_bi_w_func(epel_hv, 10, 6, sse4)
  548. mc_bi_w_funcs(qpel_h, 10, sse4)
  549. mc_bi_w_funcs(qpel_v, 10, sse4)
  550. mc_bi_w_funcs(qpel_hv, 10, sse4)
  551. mc_bi_w_funcs(pel_pixels, 12, sse4)
  552. mc_bi_w_func(pel_pixels, 12, 6, sse4)
  553. mc_bi_w_funcs(epel_h, 12, sse4)
  554. mc_bi_w_func(epel_h, 12, 6, sse4)
  555. mc_bi_w_funcs(epel_v, 12, sse4)
  556. mc_bi_w_func(epel_v, 12, 6, sse4)
  557. mc_bi_w_funcs(epel_hv, 12, sse4)
  558. mc_bi_w_func(epel_hv, 12, 6, sse4)
  559. mc_bi_w_funcs(qpel_h, 12, sse4)
  560. mc_bi_w_funcs(qpel_v, 12, sse4)
  561. mc_bi_w_funcs(qpel_hv, 12, sse4)
  562. #endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
  563. #define SAO_BAND_FILTER_FUNCS(bitd, opt) \
  564. void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
  565. int16_t *sao_offset_val, int sao_left_class, int width, int height); \
  566. void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
  567. int16_t *sao_offset_val, int sao_left_class, int width, int height); \
  568. void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
  569. int16_t *sao_offset_val, int sao_left_class, int width, int height); \
  570. void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
  571. int16_t *sao_offset_val, int sao_left_class, int width, int height); \
  572. void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
  573. int16_t *sao_offset_val, int sao_left_class, int width, int height);
  574. SAO_BAND_FILTER_FUNCS(8, sse2)
  575. SAO_BAND_FILTER_FUNCS(10, sse2)
  576. SAO_BAND_FILTER_FUNCS(12, sse2)
  577. SAO_BAND_FILTER_FUNCS(8, avx)
  578. SAO_BAND_FILTER_FUNCS(10, avx)
  579. SAO_BAND_FILTER_FUNCS(12, avx)
  580. SAO_BAND_FILTER_FUNCS(8, avx2)
  581. SAO_BAND_FILTER_FUNCS(10, avx2)
  582. SAO_BAND_FILTER_FUNCS(12, avx2)
  583. #define SAO_BAND_INIT(bitd, opt) do { \
  584. c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_##bitd##_##opt; \
  585. c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
  586. c->sao_band_filter[2] = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
  587. c->sao_band_filter[3] = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
  588. c->sao_band_filter[4] = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
  589. } while (0)
  590. #define SAO_EDGE_FILTER_FUNCS(bitd, opt) \
  591. void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
  592. int eo, int width, int height); \
  593. void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
  594. int eo, int width, int height); \
  595. void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
  596. int eo, int width, int height); \
  597. void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
  598. int eo, int width, int height); \
  599. void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, \
  600. int eo, int width, int height); \
  601. SAO_EDGE_FILTER_FUNCS(8, ssse3)
  602. SAO_EDGE_FILTER_FUNCS(8, avx2)
  603. SAO_EDGE_FILTER_FUNCS(10, sse2)
  604. SAO_EDGE_FILTER_FUNCS(10, avx2)
  605. SAO_EDGE_FILTER_FUNCS(12, sse2)
  606. SAO_EDGE_FILTER_FUNCS(12, avx2)
  607. #define SAO_EDGE_INIT(bitd, opt) do { \
  608. c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_##bitd##_##opt; \
  609. c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_16_##bitd##_##opt; \
  610. c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_##bitd##_##opt; \
  611. c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_##bitd##_##opt; \
  612. c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_##bitd##_##opt; \
  613. } while (0)
  614. #define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) \
  615. PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
  616. PEL_LINK(pointer, 2, my , mx , fname##6 , bitd, opt ); \
  617. PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
  618. PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \
  619. PEL_LINK(pointer, 5, my , mx , fname##16, bitd, opt ); \
  620. PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \
  621. PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \
  622. PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
  623. PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
  624. #define QPEL_LINKS(pointer, my, mx, fname, bitd, opt) \
  625. PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
  626. PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
  627. PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \
  628. PEL_LINK(pointer, 5, my , mx , fname##16, bitd, opt ); \
  629. PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \
  630. PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \
  631. PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
  632. PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
  /*
   * Install x86 SIMD implementations into the function-pointer fields of an
   * HEVCDSPContext.
   *
   * CPU capabilities are read once through av_get_cpu_flags(). The function
   * then handles bit_depth 8, 10 and 12 in separate branches; any other
   * value matches no branch and leaves *c unmodified.
   *
   * Inside each branch the ISA checks are independent `if` statements, in the
   * order MMXEXT, SSE2, SSSE3, SSE4, AVX, AVX2, AVX2_FAST. An entry assigned
   * under an earlier check is therefore overwritten whenever a later check
   * also passes and assigns the same entry (e.g. idct_dc[1] is set under
   * MMXEXT and again under SSE2). The order of the checks must be kept.
   *
   * Several assignments are additionally gated on ARCH_X86_64.
   *
   * c          context whose function pointers are filled in
   * bit_depth  bit depth to select implementations for (8, 10 or 12)
   */
  633. void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
  634. {
  635. int cpu_flags = av_get_cpu_flags();
  /* ---- 8-bit ---- */
  636. if (bit_depth == 8) {
  637. if (EXTERNAL_MMXEXT(cpu_flags)) {
  638. c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
  639. c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
  640. c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
  641. }
  642. if (EXTERNAL_SSE2(cpu_flags)) {
  643. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
  644. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
  645. if (ARCH_X86_64) {
  646. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
  647. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
  648. c->idct[2] = ff_hevc_idct_16x16_8_sse2;
  649. c->idct[3] = ff_hevc_idct_32x32_8_sse2;
  650. }
  651. SAO_BAND_INIT(8, sse2);
  /* idct_dc[1] replaces the mmxext version assigned above. */
  652. c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
  653. c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
  654. c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
  655. c->idct[0] = ff_hevc_idct_4x4_8_sse2;
  656. c->idct[1] = ff_hevc_idct_8x8_8_sse2;
  657. c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
  658. c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
  659. c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
  660. }
  661. if (EXTERNAL_SSSE3(cpu_flags)) {
  662. if(ARCH_X86_64) {
  663. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
  664. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
  665. }
  666. SAO_EDGE_INIT(8, ssse3);
  667. }
  /* The SSE4 put_hevc links are only installed on x86-64. */
  668. if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
  669. EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
  670. EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4);
  671. EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4);
  672. EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4);
  673. QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
  674. QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4);
  675. QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
  676. QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
  677. }
  678. if (EXTERNAL_AVX(cpu_flags)) {
  679. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
  680. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
  681. if (ARCH_X86_64) {
  682. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
  683. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
  684. c->idct[2] = ff_hevc_idct_16x16_8_avx;
  685. c->idct[3] = ff_hevc_idct_32x32_8_avx;
  686. }
  687. SAO_BAND_INIT(8, avx);
  688. c->idct[0] = ff_hevc_idct_4x4_8_avx;
  689. c->idct[1] = ff_hevc_idct_8x8_8_avx;
  690. c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
  691. c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
  692. c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
  693. }
  694. if (EXTERNAL_AVX2(cpu_flags)) {
  695. c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
  696. c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
  697. }
  698. if (EXTERNAL_AVX2_FAST(cpu_flags)) {
  699. c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
  700. c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
  701. if (ARCH_X86_64) {
  /*
   * Only table rows 7, 8 and 9 are overridden here; they are paired with
   * the 32/48/64 symbols. The last two indices select the variant:
   * [0][0] pel_pixels, [0][1] *_h, [1][0] *_v, [1][1] *_hv.
   * No qpel *_hv entries are assigned in this 8-bit AVX2 block.
   */
  702. c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
  703. c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
  704. c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
  705. c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
  706. c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
  707. c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
  708. c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
  709. c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
  710. c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
  711. c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
  712. c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
  713. c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
  714. c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
  715. c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
  716. c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
  717. c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
  718. c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
  719. c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
  720. c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
  721. c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
  722. c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
  723. c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
  724. c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
  725. c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
  726. c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
  727. c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
  728. c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
  729. c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
  730. c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
  731. c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
  732. c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
  733. c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
  734. c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
  735. c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
  736. c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
  737. c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
  738. c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
  739. c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
  740. c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
  741. c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
  742. c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
  743. c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
  744. c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
  745. c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
  746. c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
  747. c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
  748. c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
  749. c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
  750. c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
  751. c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
  752. c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
  753. c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
  754. c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
  755. c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
  756. c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
  757. c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
  758. c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
  759. c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
  760. c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
  761. c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
  762. c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
  763. c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
  764. c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
  765. }
  766. SAO_BAND_INIT(8, avx2);
  767. c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
  768. c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
  769. c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
  770. c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
  771. }
  /* ---- 10-bit ---- */
  772. } else if (bit_depth == 10) {
  773. if (EXTERNAL_MMXEXT(cpu_flags)) {
  774. c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
  775. c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
  776. c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
  777. }
  778. if (EXTERNAL_SSE2(cpu_flags)) {
  779. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
  780. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
  781. if (ARCH_X86_64) {
  782. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
  783. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
  784. c->idct[2] = ff_hevc_idct_16x16_10_sse2;
  785. c->idct[3] = ff_hevc_idct_32x32_10_sse2;
  786. }
  787. SAO_BAND_INIT(10, sse2);
  788. SAO_EDGE_INIT(10, sse2);
  /* idct_dc[1] replaces the mmxext version assigned above. */
  789. c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
  790. c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
  791. c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
  792. c->idct[0] = ff_hevc_idct_4x4_10_sse2;
  793. c->idct[1] = ff_hevc_idct_8x8_10_sse2;
  794. c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
  795. c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
  796. c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
  797. }
  798. if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
  799. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
  800. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
  801. }
  802. if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
  803. EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
  804. EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4);
  805. EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4);
  806. EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4);
  807. QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
  808. QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4);
  809. QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4);
  810. QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4);
  811. }
  /* Unlike the 8-bit AVX block, no add_residual entries are assigned here. */
  812. if (EXTERNAL_AVX(cpu_flags)) {
  813. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
  814. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
  815. if (ARCH_X86_64) {
  816. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
  817. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
  818. c->idct[2] = ff_hevc_idct_16x16_10_avx;
  819. c->idct[3] = ff_hevc_idct_32x32_10_avx;
  820. }
  821. c->idct[0] = ff_hevc_idct_4x4_10_avx;
  822. c->idct[1] = ff_hevc_idct_8x8_10_avx;
  823. SAO_BAND_INIT(10, avx);
  824. }
  825. if (EXTERNAL_AVX2(cpu_flags)) {
  826. c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
  827. }
  828. if (EXTERNAL_AVX2_FAST(cpu_flags)) {
  829. c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
  830. c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
  831. if (ARCH_X86_64) {
  /*
   * Table rows 5..9 are overridden here and are paired with the
   * 16/24/32/48/64 symbols; the last two indices select the variant as in
   * the 8-bit block ([0][0] pel_pixels, [0][1] *_h, [1][0] *_v, [1][1] *_hv).
   */
  832. c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
  833. c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
  834. c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
  835. c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
  836. c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
  837. c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
  838. c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
  839. c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
  840. c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
  841. c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
  /*
   * NOTE(review): the ten uni [..][0][0] slots below point at *_8_avx2
   * symbols whose width is double the row's usual width (32..128 instead of
   * 16..64). Presumably a plain copy of N two-byte samples is the same as a
   * copy of 2N bytes, so the 8-bit copy routines are reused -- confirm
   * against the asm before "correcting" these names to *_10_avx2.
   */
  842. c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
  843. c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
  844. c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
  845. c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
  846. c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
  847. c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
  848. c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
  849. c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
  850. c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
  851. c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
  852. c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
  853. c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
  854. c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
  855. c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
  856. c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
  857. c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
  858. c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
  859. c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
  860. c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
  861. c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
  862. c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
  863. c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
  864. c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
  865. c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
  866. c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
  867. c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
  868. c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
  869. c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
  870. c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
  871. c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
  872. c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
  873. c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
  874. c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
  875. c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
  876. c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
  877. c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
  878. c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
  879. c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
  880. c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
  881. c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
  882. c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
  883. c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
  884. c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
  885. c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
  886. c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
  887. c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
  888. c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
  889. c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
  890. c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
  891. c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
  892. c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
  893. c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
  894. c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
  895. c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
  896. c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
  897. c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
  898. c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
  899. c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
  900. c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
  901. c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
  902. c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
  903. c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
  904. c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
  905. c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
  906. c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
  907. c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
  908. c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
  909. c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
  910. c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
  911. c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
  912. c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
  913. c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
  914. c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
  915. c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
  916. c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
  917. c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
  918. c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
  919. c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
  920. c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
  921. c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
  922. c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
  923. c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
  924. c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
  925. c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
  926. c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
  927. c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
  928. c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
  929. c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
  930. c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
  931. c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
  932. c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
  933. c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
  934. c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
  935. c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
  936. c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
  937. c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
  938. c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
  939. c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
  940. c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
  941. c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
  942. c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
  943. c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
  944. c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
  945. c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
  946. c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
  947. c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
  948. c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
  949. c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
  950. c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
  951. c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
  952. }
  953. SAO_BAND_INIT(10, avx2);
  954. SAO_EDGE_INIT(10, avx2);
  955. c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
  956. c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
  957. }
  /*
   * ---- 12-bit ----
   * This branch assigns idct_dc, loop-filter, SAO and SSE4 put_hevc entries
   * only; it sets no idct[] or add_residual[] entries and has no AVX2
   * put_hevc overrides.
   */
  958. } else if (bit_depth == 12) {
  959. if (EXTERNAL_MMXEXT(cpu_flags)) {
  960. c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
  961. c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_mmxext;
  962. }
  963. if (EXTERNAL_SSE2(cpu_flags)) {
  964. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
  965. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
  966. if (ARCH_X86_64) {
  967. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
  968. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
  969. }
  970. SAO_BAND_INIT(12, sse2);
  971. SAO_EDGE_INIT(12, sse2);
  /* idct_dc[1] replaces the mmxext version assigned above. */
  972. c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
  973. c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
  974. c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
  975. }
  976. if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
  977. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
  978. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
  979. }
  980. if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
  981. EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
  982. EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
  983. EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4);
  984. EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4);
  985. QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
  986. QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4);
  987. QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4);
  988. QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4);
  989. }
  990. if (EXTERNAL_AVX(cpu_flags)) {
  991. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
  992. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
  993. if (ARCH_X86_64) {
  994. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
  995. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
  996. }
  997. SAO_BAND_INIT(12, avx);
  998. }
  999. if (EXTERNAL_AVX2(cpu_flags)) {
  1000. c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
  1001. }
  1002. if (EXTERNAL_AVX2_FAST(cpu_flags)) {
  1003. c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
  1004. c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
  1005. SAO_BAND_INIT(12, avx2);
  1006. SAO_EDGE_INIT(12, avx2);
  1007. }
  1008. }
  /* No else branch: other bit depths fall through with *c untouched. */
  1009. }