You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

366 lines
18KB

  /*
   * Copyright (c) 2013 Seppo Tomperi
   * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
   *
   * This file is part of Libav.
   *
   * Libav is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
   * version 2.1 of the License, or (at your option) any later version.
   *
   * Libav is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with Libav; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  21. #include "config.h"
  22. #include "libavutil/cpu.h"
  23. #include "libavutil/x86/cpu.h"
  24. #include "libavcodec/hevcdsp.h"
  /*
   * Prototypes for the HEVC loop-filter routines. Only declarations are
   * produced here; no definition of these functions is visible in this file.
   *
   * LFC_FUNC declares one chroma filter and LFL_FUNC one luma filter for a
   * direction (h or v), a bit depth and an instruction-set suffix. The luma
   * prototype takes an additional beta argument.
   */
  #define LFC_FUNC(DIR, DEPTH, OPT) \
  void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int *tc, uint8_t *no_p, uint8_t *no_q);

  #define LFL_FUNC(DIR, DEPTH, OPT) \
  void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, int *tc, uint8_t *no_p, uint8_t *no_q);

  /*
   * Declare the h and v variants for one bit depth: chroma with the sse2
   * suffix, luma with the ssse3 suffix. The `type` argument is accepted but
   * not used by either macro.
   */
  #define LFC_FUNCS(type, depth) \
      LFC_FUNC(h, depth, sse2)   \
      LFC_FUNC(v, depth, sse2)

  #define LFL_FUNCS(type, depth) \
      LFL_FUNC(h, depth, ssse3)  \
      LFL_FUNC(v, depth, ssse3)

  LFC_FUNCS(uint8_t, 8)
  LFC_FUNCS(uint8_t, 10)
  LFL_FUNCS(uint8_t, 8)
  LFL_FUNCS(uint8_t, 10)
  /*
   * Prototypes of the form ff_hevc_idct_<size>_dc_add_<bitd>_<opt>().
   * The macro leaves out the trailing semicolon, so every use supplies it.
   * NOTE(review): none of these _dc_add_ functions is referenced by the code
   * visible in this file - confirm they are still needed.
   */
  #define idct_dc_proto(size, bitd, opt) \
  void ff_hevc_idct_ ## size ## _dc_add_ ## bitd ## _ ## opt(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)

  /* 8-bit variants */
  idct_dc_proto(4,  8, mmxext);
  idct_dc_proto(8,  8, mmxext);
  idct_dc_proto(16, 8, sse2);
  idct_dc_proto(32, 8, sse2);

  idct_dc_proto(32, 8, avx2);

  /* 10-bit variants */
  idct_dc_proto(4,  10, mmxext);
  idct_dc_proto(8,  10, sse2);
  idct_dc_proto(16, 10, sse2);
  idct_dc_proto(32, 10, sse2);
  idct_dc_proto(8,  10, avx);
  idct_dc_proto(16, 10, avx);
  idct_dc_proto(32, 10, avx);

  idct_dc_proto(16, 10, avx2);
  idct_dc_proto(32, 10, avx2);
  /*
   * Prototypes of the DC-only IDCT routines. One IDCT_FUNCS() use declares
   * both the 8-bit and the 10-bit function for a transform size W and an
   * instruction-set suffix. The macro leaves out the final semicolon, so
   * every use supplies it.
   */
  #define IDCT_FUNCS(W, opt) \
  void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
  void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs)

  IDCT_FUNCS(4x4,   mmxext);
  IDCT_FUNCS(8x8,   mmxext);
  IDCT_FUNCS(8x8,   sse2);
  IDCT_FUNCS(16x16, sse2);
  IDCT_FUNCS(32x32, sse2);
  IDCT_FUNCS(16x16, avx2);
  IDCT_FUNCS(32x32, avx2);
  /*
   * Prototypes of ff_hevc_get_pixels_<width>_<depth>_<cf>(), one per block
   * width and bit depth. The init function below installs them as the [0][0]
   * entries of the qpel/epel tables.
   */
  #define GET_PIXELS(width, depth, cf)                                                      \
  void ff_hevc_get_pixels_ ## width ## _ ## depth ## _ ## cf(int16_t *dst, ptrdiff_t dststride, \
                                                             uint8_t *src, ptrdiff_t srcstride, \
                                                             int height, int mx, int my, int16_t *mcbuffer);

  /* 8-bit */
  GET_PIXELS(4,  8, sse2)
  GET_PIXELS(8,  8, sse2)
  GET_PIXELS(12, 8, sse2)
  GET_PIXELS(16, 8, sse2)
  GET_PIXELS(24, 8, sse2)
  GET_PIXELS(32, 8, sse2)
  GET_PIXELS(48, 8, sse2)
  GET_PIXELS(64, 8, sse2)

  /* 10-bit */
  GET_PIXELS(4,  10, sse2)
  GET_PIXELS(8,  10, sse2)
  GET_PIXELS(12, 10, sse2)
  GET_PIXELS(16, 10, sse2)
  GET_PIXELS(24, 10, sse2)
  GET_PIXELS(32, 10, sse2)
  GET_PIXELS(48, 10, sse2)
  GET_PIXELS(64, 10, sse2)
  /*
   * These are independent of the bit depth, so they are declared separately.
   *
   * Each INTERP_HV_FUNC() use declares the qpel and the epel "hv" routine for
   * one block width. Unlike the h/v prototypes, their source is an int16_t
   * buffer; the static hv wrappers further down pass mcbuffer as that source.
   */
  #define INTERP_HV_FUNC(width, cf)                                                         \
  void ff_hevc_qpel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,              \
                                            int16_t *src, ptrdiff_t srcstride,              \
                                            int height, int mx, int my, int16_t *mcbuffer); \
  void ff_hevc_epel_hv_ ## width ## _ ## cf(int16_t *dst, ptrdiff_t dststride,              \
                                            int16_t *src, ptrdiff_t srcstride,              \
                                            int height, int mx, int my, int16_t *mcbuffer);

  INTERP_HV_FUNC(4,  avx)
  INTERP_HV_FUNC(8,  avx)
  INTERP_HV_FUNC(12, avx)
  INTERP_HV_FUNC(16, avx)
  INTERP_HV_FUNC(24, avx)
  INTERP_HV_FUNC(32, avx)
  INTERP_HV_FUNC(48, avx)
  INTERP_HV_FUNC(64, avx)
  /*
   * qpel interpolation.
   *
   * QPEL_FUNCS() declares the h and v routines for one block width and bit
   * depth and then expands QPEL_FUNC_HV(). On x86-64 builds with external AVX
   * that defines a static hevc_qpel_hv_<width>_<depth>_<cf_hv>() wrapper;
   * otherwise it expands to nothing.
   *
   * The wrapper works in two passes:
   *   1. the h routine filters height + 7 rows, starting 3 rows above src,
   *      into mcbuffer;
   *   2. the depth-independent ff_hevc_qpel_hv_<width>_<cf_hv>() routine reads
   *      that buffer starting at mcbuffer + 3 * stride and writes dst.
   * stride is width + 7 rounded up to a multiple of 8 and is passed on as
   * 2 * stride - presumably a byte stride for the int16_t buffer; confirm
   * against the asm. cf_v is accepted but not used by the wrapper.
   */
  #if ARCH_X86_64 && HAVE_AVX_EXTERNAL
  #define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)                                                         \
  static void hevc_qpel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,             \
                                                                 uint8_t *src, ptrdiff_t srcstride,             \
                                                                 int height, int mx, int my, int16_t *mcbuffer) \
  {                                                                                                             \
      const ptrdiff_t stride = FFALIGN(width + 7, 8);                                                           \
      ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - 3 * srcstride, srcstride, \
                                                          height + 7, mx, my, mcbuffer);                        \
      ff_hevc_qpel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + 3 * stride, 2 * stride,                \
                                              height, mx, my, mcbuffer);                                        \
  }
  #else
  #define QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
  #endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */

  #define QPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                                           \
  void ff_hevc_qpel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride,                   \
                                                           uint8_t *src, ptrdiff_t srcstride,                   \
                                                           int height, int mx, int my, int16_t *mcbuffer);      \
  void ff_hevc_qpel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride,                   \
                                                           uint8_t *src, ptrdiff_t srcstride,                   \
                                                           int height, int mx, int my, int16_t *mcbuffer);      \
  QPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)

  /* 8-bit: h and v use the ssse3 suffix, the hv wrapper the avx suffix */
  QPEL_FUNCS(4,  8, ssse3, ssse3, avx)
  QPEL_FUNCS(8,  8, ssse3, ssse3, avx)
  QPEL_FUNCS(12, 8, ssse3, ssse3, avx)
  QPEL_FUNCS(16, 8, ssse3, ssse3, avx)
  QPEL_FUNCS(24, 8, ssse3, ssse3, avx)
  QPEL_FUNCS(32, 8, ssse3, ssse3, avx)
  QPEL_FUNCS(48, 8, ssse3, ssse3, avx)
  QPEL_FUNCS(64, 8, ssse3, ssse3, avx)

  /* 10-bit: everything uses the avx suffix */
  QPEL_FUNCS(4,  10, avx, avx, avx)
  QPEL_FUNCS(8,  10, avx, avx, avx)
  QPEL_FUNCS(12, 10, avx, avx, avx)
  QPEL_FUNCS(16, 10, avx, avx, avx)
  QPEL_FUNCS(24, 10, avx, avx, avx)
  QPEL_FUNCS(32, 10, avx, avx, avx)
  QPEL_FUNCS(48, 10, avx, avx, avx)
  QPEL_FUNCS(64, 10, avx, avx, avx)
  /*
   * epel interpolation.
   *
   * EPEL_FUNCS() declares the h and v routines for one block width and bit
   * depth and then expands EPEL_FUNC_HV(). On x86-64 builds with external AVX
   * that defines a static hevc_epel_hv_<width>_<depth>_<cf_hv>() wrapper;
   * otherwise it expands to nothing.
   *
   * The wrapper works in two passes:
   *   1. the h routine filters height + 3 rows, starting 1 row above src,
   *      into mcbuffer;
   *   2. the depth-independent ff_hevc_epel_hv_<width>_<cf_hv>() routine reads
   *      that buffer starting at mcbuffer + stride and writes dst.
   * stride is width + 3 rounded up to a multiple of 8 and is passed on as
   * 2 * stride - presumably a byte stride for the int16_t buffer; confirm
   * against the asm. cf_v is accepted but not used by the wrapper.
   */
  #if ARCH_X86_64 && HAVE_AVX_EXTERNAL
  #define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)                                                         \
  static void hevc_epel_hv_ ## width ## _ ## depth ## _ ## cf_hv(int16_t *dst, ptrdiff_t dststride,             \
                                                                 uint8_t *src, ptrdiff_t srcstride,             \
                                                                 int height, int mx, int my, int16_t *mcbuffer) \
  {                                                                                                             \
      const ptrdiff_t stride = FFALIGN(width + 3, 8);                                                           \
      ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(mcbuffer, 2 * stride, src - srcstride, srcstride,     \
                                                          height + 3, mx, my, mcbuffer);                        \
      ff_hevc_epel_hv_ ## width ## _ ## cf_hv(dst, dststride, mcbuffer + stride, 2 * stride,                    \
                                              height, mx, my, mcbuffer);                                        \
  }
  #else
  #define EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)
  #endif /* ARCH_X86_64 && HAVE_AVX_EXTERNAL */

  #define EPEL_FUNCS(width, depth, cf_h, cf_v, cf_hv)                                                           \
  void ff_hevc_epel_h_ ## width ## _ ## depth ## _ ## cf_h(int16_t *dst, ptrdiff_t dststride,                   \
                                                           uint8_t *src, ptrdiff_t srcstride,                   \
                                                           int height, int mx, int my, int16_t *mcbuffer);      \
  void ff_hevc_epel_v_ ## width ## _ ## depth ## _ ## cf_v(int16_t *dst, ptrdiff_t dststride,                   \
                                                           uint8_t *src, ptrdiff_t srcstride,                   \
                                                           int height, int mx, int my, int16_t *mcbuffer);      \
  EPEL_FUNC_HV(width, depth, cf_h, cf_v, cf_hv)

  /* 8-bit: h and v use the ssse3 suffix, the hv wrapper the avx suffix */
  EPEL_FUNCS(4,  8, ssse3, ssse3, avx)
  EPEL_FUNCS(8,  8, ssse3, ssse3, avx)
  EPEL_FUNCS(12, 8, ssse3, ssse3, avx)
  EPEL_FUNCS(16, 8, ssse3, ssse3, avx)
  EPEL_FUNCS(24, 8, ssse3, ssse3, avx)
  EPEL_FUNCS(32, 8, ssse3, ssse3, avx)

  /* 10-bit: everything uses the avx suffix */
  EPEL_FUNCS(4,  10, avx, avx, avx)
  EPEL_FUNCS(8,  10, avx, avx, avx)
  EPEL_FUNCS(12, 10, avx, avx, avx)
  EPEL_FUNCS(16, 10, avx, avx, avx)
  EPEL_FUNCS(24, 10, avx, avx, avx)
  EPEL_FUNCS(32, 10, avx, avx, avx)
  /*
   * Prototypes of the prediction output routines. One PUT_PRED() use declares
   * four functions for a block width and bit depth:
   *   - put_unweighted_pred and put_unweighted_pred_avg, suffixed with cf_uw;
   *   - put_weighted_pred and put_weighted_pred_avg, suffixed with cf_w.
   * The weighted variants take denom/weight/offset arguments ahead of the
   * destination; the _avg variants take two source buffers.
   */
  #define PUT_PRED(width, depth, cf_uw, cf_w)                                                                            \
  void ff_hevc_put_unweighted_pred_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,              \
                                                                         int16_t *src, ptrdiff_t srcstride,              \
                                                                         int height);                                    \
  void ff_hevc_put_unweighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_uw(uint8_t *dst, ptrdiff_t dststride,          \
                                                                             int16_t *src1, int16_t *src2,               \
                                                                             ptrdiff_t srcstride, int height);           \
  void ff_hevc_put_weighted_pred_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight, int16_t offset,     \
                                                                      uint8_t *dst, ptrdiff_t dststride,                 \
                                                                      int16_t *src, ptrdiff_t srcstride,                 \
                                                                      int height);                                       \
  void ff_hevc_put_weighted_pred_avg_ ## width ## _ ## depth ## _ ## cf_w(uint8_t denom, int16_t weight0, int16_t weight1, \
                                                                          int16_t offset0, int16_t offset1,              \
                                                                          uint8_t *dst, ptrdiff_t dststride,             \
                                                                          int16_t *src0, int16_t *src1, ptrdiff_t srcstride, \
                                                                          int height);

  /* 8-bit */
  PUT_PRED(4,  8, sse2, sse4)
  PUT_PRED(8,  8, sse2, sse4)
  PUT_PRED(12, 8, sse2, sse4)
  PUT_PRED(16, 8, sse2, sse4)
  PUT_PRED(24, 8, sse2, sse4)
  PUT_PRED(32, 8, sse2, sse4)
  PUT_PRED(48, 8, sse2, sse4)
  PUT_PRED(64, 8, sse2, sse4)

  /* 10-bit */
  PUT_PRED(4,  10, sse2, sse4)
  PUT_PRED(8,  10, sse2, sse4)
  PUT_PRED(12, 10, sse2, sse4)
  PUT_PRED(16, 10, sse2, sse4)
  PUT_PRED(24, 10, sse2, sse4)
  PUT_PRED(32, 10, sse2, sse4)
  PUT_PRED(48, 10, sse2, sse4)
  PUT_PRED(64, 10, sse2, sse4)
  207. void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
  208. {
  209. int cpu_flags = av_get_cpu_flags();
  210. #define SET_LUMA_FUNCS(tabname, funcname, depth, cf) \
  211. c->tabname[0] = funcname ## _4_ ## depth ## _ ## cf; \
  212. c->tabname[1] = funcname ## _8_ ## depth ## _ ## cf; \
  213. c->tabname[2] = funcname ## _12_ ## depth ## _ ## cf; \
  214. c->tabname[3] = funcname ## _16_ ## depth ## _ ## cf; \
  215. c->tabname[4] = funcname ## _24_ ## depth ## _ ## cf; \
  216. c->tabname[5] = funcname ## _32_ ## depth ## _ ## cf; \
  217. c->tabname[6] = funcname ## _48_ ## depth ## _ ## cf; \
  218. c->tabname[7] = funcname ## _64_ ## depth ## _ ## cf;
  219. #define SET_CHROMA_FUNCS(tabname, funcname, depth, cf) \
  220. c->tabname[1] = funcname ## _4_ ## depth ## _ ## cf; \
  221. c->tabname[3] = funcname ## _8_ ## depth ## _ ## cf; \
  222. c->tabname[4] = funcname ## _12_ ## depth ## _ ## cf; \
  223. c->tabname[5] = funcname ## _16_ ## depth ## _ ## cf; \
  224. c->tabname[6] = funcname ## _24_ ## depth ## _ ## cf; \
  225. c->tabname[7] = funcname ## _32_ ## depth ## _ ## cf;
  226. #define SET_QPEL_FUNCS(v, h, depth, cf, name) SET_LUMA_FUNCS (put_hevc_qpel[v][h], name, depth, cf)
  227. #define SET_EPEL_FUNCS(v, h, depth, cf, name) SET_CHROMA_FUNCS(put_hevc_epel[v][h], name, depth, cf)
  228. if (bit_depth == 8) {
  229. if (EXTERNAL_MMXEXT(cpu_flags)) {
  230. c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
  231. c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_mmxext;
  232. }
  233. if (EXTERNAL_SSE2(cpu_flags)) {
  234. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
  235. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
  236. c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
  237. c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
  238. c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
  239. SET_QPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
  240. SET_EPEL_FUNCS(0, 0, 8, sse2, ff_hevc_get_pixels);
  241. SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 8, sse2);
  242. SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 8, sse2);
  243. SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 8, sse2);
  244. SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 8, sse2);
  245. }
  246. if (EXTERNAL_SSSE3(cpu_flags)) {
  247. SET_QPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_qpel_h);
  248. SET_QPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_qpel_v);
  249. SET_EPEL_FUNCS(0, 1, 8, ssse3, ff_hevc_epel_h);
  250. SET_EPEL_FUNCS(1, 0, 8, ssse3, ff_hevc_epel_v);
  251. }
  252. } else if (bit_depth == 10) {
  253. if (EXTERNAL_MMXEXT(cpu_flags)) {
  254. c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
  255. c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_mmxext;
  256. }
  257. if (EXTERNAL_SSE2(cpu_flags)) {
  258. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
  259. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
  260. c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
  261. c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
  262. c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
  263. SET_QPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
  264. SET_EPEL_FUNCS(0, 0, 10, sse2, ff_hevc_get_pixels);
  265. SET_LUMA_FUNCS(put_unweighted_pred, ff_hevc_put_unweighted_pred, 10, sse2);
  266. SET_LUMA_FUNCS(put_unweighted_pred_avg, ff_hevc_put_unweighted_pred_avg, 10, sse2);
  267. SET_CHROMA_FUNCS(put_unweighted_pred_chroma, ff_hevc_put_unweighted_pred, 10, sse2);
  268. SET_CHROMA_FUNCS(put_unweighted_pred_avg_chroma, ff_hevc_put_unweighted_pred_avg, 10, sse2);
  269. }
  270. }
  271. #if ARCH_X86_64
  272. if (bit_depth == 8) {
  273. if (EXTERNAL_SSSE3(cpu_flags)) {
  274. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
  275. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
  276. }
  277. if (EXTERNAL_SSE4(cpu_flags)) {
  278. SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 8, sse4);
  279. SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 8, sse4);
  280. SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 8, sse4);
  281. SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 8, sse4);
  282. }
  283. if (EXTERNAL_AVX(cpu_flags)) {
  284. #if HAVE_AVX_EXTERNAL
  285. SET_QPEL_FUNCS(1, 1, 8, avx, hevc_qpel_hv);
  286. SET_EPEL_FUNCS(1, 1, 8, avx, hevc_epel_hv);
  287. #endif /* HAVE_AVX_EXTERNAL */
  288. }
  289. if (EXTERNAL_AVX2(cpu_flags)) {
  290. c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
  291. c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
  292. }
  293. } else if (bit_depth == 10) {
  294. if (EXTERNAL_SSSE3(cpu_flags)) {
  295. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
  296. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
  297. }
  298. if (EXTERNAL_SSE4(cpu_flags)) {
  299. SET_LUMA_FUNCS(weighted_pred, ff_hevc_put_weighted_pred, 10, sse4);
  300. SET_CHROMA_FUNCS(weighted_pred_chroma, ff_hevc_put_weighted_pred, 10, sse4);
  301. SET_LUMA_FUNCS(weighted_pred_avg, ff_hevc_put_weighted_pred_avg, 10, sse4);
  302. SET_CHROMA_FUNCS(weighted_pred_avg_chroma, ff_hevc_put_weighted_pred_avg, 10, sse4);
  303. }
  304. if (EXTERNAL_AVX(cpu_flags)) {
  305. #if HAVE_AVX_EXTERNAL
  306. SET_QPEL_FUNCS(0, 1, 10, avx, ff_hevc_qpel_h);
  307. SET_QPEL_FUNCS(1, 0, 10, avx, ff_hevc_qpel_v);
  308. SET_QPEL_FUNCS(1, 1, 10, avx, hevc_qpel_hv);
  309. SET_EPEL_FUNCS(0, 1, 10, avx, ff_hevc_epel_h);
  310. SET_EPEL_FUNCS(1, 0, 10, avx, ff_hevc_epel_v);
  311. SET_EPEL_FUNCS(1, 1, 10, avx, hevc_epel_hv);
  312. #endif /* HAVE_AVX_EXTERNAL */
  313. }
  314. if (EXTERNAL_AVX2(cpu_flags)) {
  315. c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
  316. c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
  317. }
  318. }
  319. #endif /* ARCH_X86_64 */
  320. }