/*
 * Copyright (c) 2013 Seppo Tomperi
 * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
  21. #include "config.h"
  22. #include "libavutil/cpu.h"
  23. #include "libavutil/x86/cpu.h"
  24. #include "libavcodec/hevcdsp.h"
  25. #define LFC_FUNC(DIR, DEPTH, OPT) \
  26. void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q);
  27. #define LFL_FUNC(DIR, DEPTH, OPT) \
  28. void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);
  29. #define LFC_FUNCS(type, depth) \
  30. LFC_FUNC(h, depth, sse2) \
  31. LFC_FUNC(v, depth, sse2)
  32. #define LFL_FUNCS(type, depth) \
  33. LFL_FUNC(h, depth, ssse3) \
  34. LFL_FUNC(v, depth, ssse3)
  35. LFC_FUNCS(uint8_t, 8)
  36. LFC_FUNCS(uint8_t, 10)
  37. LFL_FUNCS(uint8_t, 8)
  38. LFL_FUNCS(uint8_t, 10)
  39. #define idct_dc_proto(size, bitd, opt) \
  40. void ff_hevc_idct_ ## size ## _dc_add_ ## bitd ## _ ## opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
  41. idct_dc_proto(4, 8,mmxext);
  42. idct_dc_proto(8, 8,mmxext);
  43. idct_dc_proto(16,8, sse2);
  44. idct_dc_proto(32,8, sse2);
  45. idct_dc_proto(32,8, avx2);
  46. idct_dc_proto(4, 10,mmxext);
  47. idct_dc_proto(8, 10, sse2);
  48. idct_dc_proto(16,10, sse2);
  49. idct_dc_proto(32,10, sse2);
  50. idct_dc_proto(8, 10, avx);
  51. idct_dc_proto(16,10, avx);
  52. idct_dc_proto(32,10, avx);
  53. idct_dc_proto(16,10, avx2);
  54. idct_dc_proto(32,10, avx2);
  55. #define IDCT_DC_FUNCS(W, opt) \
  56. void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
  57. void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs)
  58. IDCT_DC_FUNCS(4x4, mmxext);
  59. IDCT_DC_FUNCS(8x8, mmxext);
  60. IDCT_DC_FUNCS(8x8, sse2);
  61. IDCT_DC_FUNCS(16x16, sse2);
  62. IDCT_DC_FUNCS(32x32, sse2);
  63. IDCT_DC_FUNCS(16x16, avx2);
  64. IDCT_DC_FUNCS(32x32, avx2);
  65. #define IDCT_FUNCS(opt) \
  66. void ff_hevc_idct_4x4_8_ ## opt(int16_t *coeffs, int col_limit); \
  67. void ff_hevc_idct_4x4_10_ ## opt(int16_t *coeffs, int col_limit); \
  68. void ff_hevc_idct_8x8_8_ ## opt(int16_t *coeffs, int col_limit); \
  69. void ff_hevc_idct_8x8_10_ ## opt(int16_t *coeffs, int col_limit); \
  70. void ff_hevc_idct_16x16_8_ ## opt(int16_t *coeffs, int col_limit); \
  71. void ff_hevc_idct_16x16_10_ ## opt(int16_t *coeffs, int col_limit); \
  72. void ff_hevc_idct_32x32_8_ ## opt(int16_t *coeffs, int col_limit); \
  73. void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
  74. IDCT_FUNCS(sse2)
  75. IDCT_FUNCS(avx)
  76. void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  77. void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  78. void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  79. void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  80. void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  81. void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  82. void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  83. void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  84. void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  85. void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  86. void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  87. void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  88. void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  89. void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride);
  90. #define GET_PIXELS(width, depth, cf) \
  91. void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
  92. uint8_t *src, ptrdiff_t srcstride, \
  93. int height, int mx, int my, int16_t *mcbuffer);
  94. GET_PIXELS(4, 8, sse2)
  95. GET_PIXELS(8, 8, sse2)
  96. GET_PIXELS(12, 8, sse2)
  97. GET_PIXELS(16, 8, sse2)
  98. GET_PIXELS(24, 8, sse2)
  99. GET_PIXELS(32, 8, sse2)
  100. GET_PIXELS(48, 8, sse2)
  101. GET_PIXELS(64, 8, sse2)
  102. GET_PIXELS(4, 10, sse2)
  103. GET_PIXELS(8, 10, sse2)
  104. GET_PIXELS(12, 10, sse2)
  105. GET_PIXELS(16, 10, sse2)
  106. GET_PIXELS(24, 10, sse2)
  107. GET_PIXELS(32, 10, sse2)
  108. GET_PIXELS(48, 10, sse2)
  109. GET_PIXELS(64, 10, sse2)
  110. /* those are independent of the bit depth, so declared separately */
  111. #define INTERP_HV_FUNC(width, cf) \
  112. void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
  113. int16_t *src, ptrdiff_t srcstride, \
  114. int height, int mx, int my, int16_t *mcbuffer); \
  115. void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
  116. int16_t *src, ptrdiff_t srcstride, \
  117. int height, int mx, int my, int16_t *mcbuffer);
  118. INTERP_HV_FUNC(4, avx)
  119. INTERP_HV_FUNC(8, avx)
  120. INTERP_HV_FUNC(12, avx)
  121. INTERP_HV_FUNC(16, avx)
  122. INTERP_HV_FUNC(24, avx)
  123. INTERP_HV_FUNC(32, avx)
  124. INTERP_HV_FUNC(48, avx)
  125. INTERP_HV_FUNC(64, avx)
  126. #if ARCH_X86_64 && HAVE_AVX_EXTERNAL
  127. #define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
  128. static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
  129. uint8_t *src, ptrdiff_t srcstride, \
  130. int height, int mx, int my, int16_t *mcbuffer) \
  131. { \
  132. const ptrdiff_t stride = FFALIGN(width + 7, 8); \
  133. ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \
  134. height + 7, mx, my, mcbuffer); \
  135. ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride, \
  136. height, mx, my, mcbuffer); \
  137. }
  138. #else
  139. #define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
  140. #endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
  141. #define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
  142. void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
  143. uint8_t *src, ptrdiff_t srcstride, \
  144. int height, int mx, int my, int16_t *mcbuffer); \
  145. void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
  146. uint8_t *src, ptrdiff_t srcstride, \
  147. int height, int mx, int my, int16_t *mcbuffer); \
  148. QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
  149. QPEL_FUNCS(4, 8, ssse3, ssse3, avx)
  150. QPEL_FUNCS(8, 8, ssse3, ssse3, avx)
  151. QPEL_FUNCS(12, 8, ssse3, ssse3, avx)
  152. QPEL_FUNCS(16, 8, ssse3, ssse3, avx)
  153. QPEL_FUNCS(24, 8, ssse3, ssse3, avx)
  154. QPEL_FUNCS(32, 8, ssse3, ssse3, avx)
  155. QPEL_FUNCS(48, 8, ssse3, ssse3, avx)
  156. QPEL_FUNCS(64, 8, ssse3, ssse3, avx)
  157. QPEL_FUNCS(4, 10, avx, avx, avx)
  158. QPEL_FUNCS(8, 10, avx, avx, avx)
  159. QPEL_FUNCS(12, 10, avx, avx, avx)
  160. QPEL_FUNCS(16, 10, avx, avx, avx)
  161. QPEL_FUNCS(24, 10, avx, avx, avx)
  162. QPEL_FUNCS(32, 10, avx, avx, avx)
  163. QPEL_FUNCS(48, 10, avx, avx, avx)
  164. QPEL_FUNCS(64, 10, avx, avx, avx)
  165. #if ARCH_X86_64 && HAVE_AVX_EXTERNAL
  166. #define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv) \
  167. static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride, \
  168. uint8_t *src, ptrdiff_t srcstride, \
  169. int height, int mx, int my, int16_t *mcbuffer) \
  170. { \
  171. const ptrdiff_t stride = FFALIGN(width + 3, 8); \
  172. ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride, \
  173. height + 3, mx, my, mcbuffer); \
  174. ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride, \
  175. height, mx, my, mcbuffer); \
  176. }
  177. #else
  178. #define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
  179. #endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */
  180. #define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv) \
  181. void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride, \
  182. uint8_t *src, ptrdiff_t srcstride, \
  183. int height, int mx, int my, int16_t *mcbuffer); \
  184. void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride, \
  185. uint8_t *src, ptrdiff_t srcstride, \
  186. int height, int mx, int my, int16_t *mcbuffer); \
  187. EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
  188. EPEL_FUNCS(4, 8, ssse3, ssse3, avx)
  189. EPEL_FUNCS(8, 8, ssse3, ssse3, avx)
  190. EPEL_FUNCS(12, 8, ssse3, ssse3, avx)
  191. EPEL_FUNCS(16, 8, ssse3, ssse3, avx)
  192. EPEL_FUNCS(24, 8, ssse3, ssse3, avx)
  193. EPEL_FUNCS(32, 8, ssse3, ssse3, avx)
  194. EPEL_FUNCS(4, 10, avx, avx, avx)
  195. EPEL_FUNCS(8, 10, avx, avx, avx)
  196. EPEL_FUNCS(12, 10, avx, avx, avx)
  197. EPEL_FUNCS(16, 10, avx, avx, avx)
  198. EPEL_FUNCS(24, 10, avx, avx, avx)
  199. EPEL_FUNCS(32, 10, avx, avx, avx)
  200. #define PUT_PRED(width, depth, cf_uw, cf_w) \
  201. void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
  202. int16_t *src, ptrdiff_t srcstride, \
  203. int height); \
  204. void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride, \
  205. int16_t *src1, int16_t *src2, \
  206. ptrdiff_t srcstride, int height); \
  207. void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset, \
  208. uint8_t *dst, ptrdiff_t dststride, \
  209. int16_t *src, ptrdiff_t srcstride, \
  210. int height); \
  211. void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1, \
  212. int16_t offset0, int16_t offset1, \
  213. uint8_t *dst, ptrdiff_t dststride, \
  214. int16_t *src0, int16_t *src1, ptrdiff_t srcstride, \
  215. int height);
  216. PUT_PRED(4, 8, sse2, sse4)
  217. PUT_PRED(8, 8, sse2, sse4)
  218. PUT_PRED(12, 8, sse2, sse4)
  219. PUT_PRED(16, 8, sse2, sse4)
  220. PUT_PRED(24, 8, sse2, sse4)
  221. PUT_PRED(32, 8, sse2, sse4)
  222. PUT_PRED(48, 8, sse2, sse4)
  223. PUT_PRED(64, 8, sse2, sse4)
  224. PUT_PRED(4, 10, sse2, sse4)
  225. PUT_PRED(8, 10, sse2, sse4)
  226. PUT_PRED(12, 10, sse2, sse4)
  227. PUT_PRED(16, 10, sse2, sse4)
  228. PUT_PRED(24, 10, sse2, sse4)
  229. PUT_PRED(32, 10, sse2, sse4)
  230. PUT_PRED(48, 10, sse2, sse4)
  231. PUT_PRED(64, 10, sse2, sse4)
  232. void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
  233. {
  234. int cpu_flags = av_get_cpu_flags();
  235. #define SET_LUMA_FUNCS(tabname, funcname, depth, cf) \
  236. c->tabname[0] = funcname ## _4_ ## depth ## _ ## cf; \
  237. c->tabname[1] = funcname ## _8_ ## depth ## _ ## cf; \
  238. c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
  239. c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
  240. c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
  241. c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
  242. c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
  243. c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
  244. #define SET_CHROMA_FUNCS(tabname, funcname, depth, cf) \
  245. c->tabname[1] = funcname ## _4_ ## depth ## _ ## cf; \
  246. c->tabname[3] = funcname ## _8_ ## depth ## _ ## cf; \
  247. c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
  248. c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
  249. c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
  250. c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
  251. #define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS (put_hevc_qpel[v][h], name, depth, cf)
  252. #define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
  253. if (bit_depth == 8) {
  254. if (EXTERNAL_MMXEXT(cpu_flags)) {
  255. c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
  256. c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
  257. c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
  258. }
  259. if (EXTERNAL_SSE2(cpu_flags)) {
  260. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
  261. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
  262. c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
  263. c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
  264. c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
  265. c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
  266. c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
  267. c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
  268. c->idct[0] = ff_hevc_idct_4x4_8_sse2;
  269. c->idct[1] = ff_hevc_idct_8x8_8_sse2;
  270. SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
  271. SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
  272. SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 8, sse2);
  273. SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 8, sse2);
  274. SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 8, sse2);
  275. SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
  276. }
  277. if (EXTERNAL_SSSE3(cpu_flags)) {
  278. SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
  279. SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
  280. SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
  281. SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
  282. }
  283. if (EXTERNAL_AVX(cpu_flags)) {
  284. c->idct[0] = ff_hevc_idct_4x4_8_avx;
  285. c->idct[1] = ff_hevc_idct_8x8_8_avx;
  286. c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
  287. c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
  288. c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
  289. }
  290. if (EXTERNAL_AVX2(cpu_flags)) {
  291. c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
  292. }
  293. } else if (bit_depth == 10) {
  294. if (EXTERNAL_MMXEXT(cpu_flags)) {
  295. c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
  296. c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
  297. c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
  298. }
  299. if (EXTERNAL_SSE2(cpu_flags)) {
  300. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
  301. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
  302. c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
  303. c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
  304. c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
  305. c->idct[0] = ff_hevc_idct_4x4_10_sse2;
  306. c->idct[1] = ff_hevc_idct_8x8_10_sse2;
  307. SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
  308. SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
  309. SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 10, sse2);
  310. SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 10, sse2);
  311. SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 10, sse2);
  312. SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
  313. c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
  314. c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
  315. c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
  316. }
  317. if (EXTERNAL_AVX(cpu_flags)) {
  318. c->idct[0] = ff_hevc_idct_4x4_10_avx;
  319. c->idct[1] = ff_hevc_idct_8x8_10_avx;
  320. }
  321. if (EXTERNAL_AVX2(cpu_flags)) {
  322. c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
  323. c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
  324. }
  325. }
  326. #if ARCH_X86_64
  327. if (bit_depth == 8) {
  328. if (EXTERNAL_SSE2(cpu_flags)) {
  329. c->idct[2] = ff_hevc_idct_16x16_8_sse2;
  330. c->idct[3] = ff_hevc_idct_32x32_8_sse2;
  331. }
  332. if (EXTERNAL_SSSE3(cpu_flags)) {
  333. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
  334. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
  335. }
  336. if (EXTERNAL_SSE4(cpu_flags)) {
  337. SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 8, sse4);
  338. SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 8, sse4);
  339. SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 8, sse4);
  340. SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
  341. }
  342. if (EXTERNAL_AVX(cpu_flags)) {
  343. #if HAVE_AVX_EXTERNAL
  344. SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
  345. SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
  346. #endif /* HAVE_AVX_EXTERNAL */
  347. c->idct[2] = ff_hevc_idct_16x16_8_avx;
  348. c->idct[3] = ff_hevc_idct_32x32_8_avx;
  349. }
  350. if (EXTERNAL_AVX2(cpu_flags)) {
  351. c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
  352. c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
  353. }
  354. } else if (bit_depth == 10) {
  355. if (EXTERNAL_SSE2(cpu_flags)) {
  356. c->idct[2] = ff_hevc_idct_16x16_10_sse2;
  357. c->idct[3] = ff_hevc_idct_32x32_10_sse2;
  358. }
  359. if (EXTERNAL_SSSE3(cpu_flags)) {
  360. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
  361. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
  362. }
  363. if (EXTERNAL_SSE4(cpu_flags)) {
  364. SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 10, sse4);
  365. SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 10, sse4);
  366. SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 10, sse4);
  367. SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
  368. }
  369. if (EXTERNAL_AVX(cpu_flags)) {
  370. #if HAVE_AVX_EXTERNAL
  371. SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
  372. SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
  373. SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
  374. SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
  375. SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
  376. SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
  377. #endif /* HAVE_AVX_EXTERNAL */
  378. c->idct[2] = ff_hevc_idct_16x16_10_avx;
  379. c->idct[3] = ff_hevc_idct_32x32_10_avx;
  380. }
  381. if (EXTERNAL_AVX2(cpu_flags)) {
  382. c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
  383. c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
  384. }
  385. }
  386. #endif /* ARCH_X86_64 */
  387. }