You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

620 lines
30KB

  1. /*
  2. * Copyright (c) 2013 Seppo Tomperi
  3. * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
  4. *
  5. *
  6. * This file is part of FFmpeg.
  7. *
  8. * FFmpeg is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * FFmpeg is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with FFmpeg; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. #include "config.h"
  23. #include "libavutil/cpu.h"
  24. #include "libavutil/x86/asm.h"
  25. #include "libavutil/x86/cpu.h"
  26. #include "libavcodec/get_bits.h" /* required for hevcdsp.h GetBitContext */
  27. #include "libavcodec/hevcdsp.h"
  28. #include "libavcodec/x86/hevcdsp.h"
  /*
   * Prototypes for the HEVC deblocking (loop) filter entry points.
   *
   * LFC_FUNC declares one chroma filter and LFL_FUNC one luma filter; the luma
   * variant takes an additional `beta` argument.  DIR is the filter direction
   * (h or v), DEPTH the bit depth and OPT the instruction-set suffix; all
   * three are pasted into the symbol name.  Only declarations are produced
   * here, the definitions live outside this file.
   *
   * LFC_FUNCS / LFL_FUNCS declare the h and v variants together.  Their first
   * parameter (`type`) is accepted but not used by the expansion.  Every
   * expansion already ends in ';', so the invocations below carry none.
   */
  #define LFC_FUNC(DIR, DEPTH, OPT) \
      void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT( \
          uint8_t *pix, ptrdiff_t stride, int *tc, \
          uint8_t *no_p, uint8_t *no_q);

  #define LFL_FUNC(DIR, DEPTH, OPT) \
      void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT( \
          uint8_t *pix, ptrdiff_t stride, int beta, int *tc, \
          uint8_t *no_p, uint8_t *no_q);

  #define LFC_FUNCS(type, depth, opt) \
      LFC_FUNC(h, depth, opt) \
      LFC_FUNC(v, depth, opt)

  #define LFL_FUNCS(type, depth, opt) \
      LFL_FUNC(h, depth, opt) \
      LFL_FUNC(v, depth, opt)

  /* Chroma filters: SSE2 and AVX. */
  LFC_FUNCS(uint8_t,  8, sse2)
  LFC_FUNCS(uint8_t, 10, sse2)
  LFC_FUNCS(uint8_t, 12, sse2)
  LFC_FUNCS(uint8_t,  8, avx)
  LFC_FUNCS(uint8_t, 10, avx)
  LFC_FUNCS(uint8_t, 12, avx)

  /* Luma filters: SSE2, SSSE3 and AVX. */
  LFL_FUNCS(uint8_t,  8, sse2)
  LFL_FUNCS(uint8_t, 10, sse2)
  LFL_FUNCS(uint8_t, 12, sse2)
  LFL_FUNCS(uint8_t,  8, ssse3)
  LFL_FUNCS(uint8_t, 10, ssse3)
  LFL_FUNCS(uint8_t, 12, ssse3)
  LFL_FUNCS(uint8_t,  8, avx)
  LFL_FUNCS(uint8_t, 10, avx)
  LFL_FUNCS(uint8_t, 12, avx)
  /*
   * Declare the DC-only inverse-transform routines for one block size W
   * (e.g. 4x4) and one instruction-set suffix, at bit depths 8, 10 and 12.
   * Each routine takes a pointer to the int16_t coefficient block.  The last
   * prototype has no ';' so that every invocation supplies its own.
   */
  #define IDCT_FUNCS(W, opt) \
      void ff_hevc_idct##W##_dc_8_##opt(int16_t *coeffs); \
      void ff_hevc_idct##W##_dc_10_##opt(int16_t *coeffs); \
      void ff_hevc_idct##W##_dc_12_##opt(int16_t *coeffs)

  IDCT_FUNCS(4x4,   mmxext);
  IDCT_FUNCS(8x8,   mmxext);
  IDCT_FUNCS(8x8,   sse2);
  IDCT_FUNCS(16x16, sse2);
  IDCT_FUNCS(32x32, sse2);
  IDCT_FUNCS(16x16, avx2);
  IDCT_FUNCS(32x32, avx2);
  /*
   * Define ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() by tiling a narrower
   * kernel.  The generated function calls
   * ff_hevc_put_hevc_<name><step>_<bitd>_<opt>() once for every step-wide
   * column in [0, W).  For each column the int16_t destination advances by
   * `step` elements and the byte source by step * ((bitd + 7) / 8) bytes;
   * all remaining arguments are forwarded unchanged.
   */
  #define mc_rep_func(name, bitd, step, W, opt) \
  void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, \
                                                   uint8_t *_src, ptrdiff_t _srcstride, \
                                                   int height, \
                                                   intptr_t mx, intptr_t my, int width) \
  { \
      const int sample_bytes = (bitd + 7) / 8; \
      int x = 0; \
      while (x < W) { \
          ff_hevc_put_hevc_##name##step##_##bitd##_##opt(_dst + x, \
                                                         _src + (x * sample_bytes), \
                                                         _srcstride, height, \
                                                         mx, my, width); \
          x += step; \
      } \
  }
  /*
   * Define ff_hevc_put_hevc_uni_<name><W>_<bitd>_<opt>() by tiling the
   * step-wide kernel ff_hevc_put_hevc_uni_<name><step>_<bitd>_<opt>().
   * Source and destination are both byte pointers, so each column advances
   * both of them by step * ((bitd + 7) / 8) bytes.  Strides, height, mx, my
   * and width are forwarded unchanged.
   */
  #define mc_rep_uni_func(name, bitd, step, W, opt) \
  void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
                                                       uint8_t *_src, ptrdiff_t _srcstride, \
                                                       int height, \
                                                       intptr_t mx, intptr_t my, int width) \
  { \
      const int sample_bytes = (bitd + 7) / 8; \
      int x = 0; \
      while (x < W) { \
          const int byte_off = x * sample_bytes; \
          ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(_dst + byte_off, dststride, \
                                                             _src + byte_off, _srcstride, \
                                                             height, mx, my, width); \
          x += step; \
      } \
  }
  /*
   * Define ff_hevc_put_hevc_bi_<name><W>_<bitd>_<opt>() by tiling the
   * step-wide kernel ff_hevc_put_hevc_bi_<name><step>_<bitd>_<opt>().
   * Per column the byte pointers (_dst, _src) advance by
   * step * ((bitd + 7) / 8) bytes while the int16_t second source (_src2)
   * advances by `step` elements.  The other arguments are forwarded as is.
   */
  #define mc_rep_bi_func(name, bitd, step, W, opt) \
  void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
                                                      uint8_t *_src, ptrdiff_t _srcstride, \
                                                      int16_t* _src2, \
                                                      int height, \
                                                      intptr_t mx, intptr_t my, int width) \
  { \
      const int sample_bytes = (bitd + 7) / 8; \
      int x = 0; \
      while (x < W) { \
          const int byte_off = x * sample_bytes; \
          ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(_dst + byte_off, dststride, \
                                                            _src + byte_off, _srcstride, \
                                                            _src2 + x, \
                                                            height, mx, my, width); \
          x += step; \
      } \
  }
  /*
   * Emit the three tiled wrappers (plain, uni and bi) for one combination of
   * kernel name, bit depth, kernel width (step) and target width (W).
   * The final expansion has no ';' so the invocation provides it.
   */
  #define mc_rep_funcs(name, bitd, step, W, opt) \
      mc_rep_func(name, bitd, step, W, opt); \
      mc_rep_uni_func(name, bitd, step, W, opt); \
      mc_rep_bi_func(name, bitd, step, W, opt)
  115. #if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
  116. mc_rep_funcs(pel_pixels, 8, 16, 64, sse4);
  117. mc_rep_funcs(pel_pixels, 8, 16, 48, sse4);
  118. mc_rep_funcs(pel_pixels, 8, 16, 32, sse4);
  119. mc_rep_funcs(pel_pixels, 8, 8, 24, sse4);
  120. mc_rep_funcs(pel_pixels,10, 8, 64, sse4);
  121. mc_rep_funcs(pel_pixels,10, 8, 48, sse4);
  122. mc_rep_funcs(pel_pixels,10, 8, 32, sse4);
  123. mc_rep_funcs(pel_pixels,10, 8, 24, sse4);
  124. mc_rep_funcs(pel_pixels,10, 8, 16, sse4);
  125. mc_rep_funcs(pel_pixels,10, 4, 12, sse4);
  126. mc_rep_funcs(pel_pixels,12, 8, 64, sse4);
  127. mc_rep_funcs(pel_pixels,12, 8, 48, sse4);
  128. mc_rep_funcs(pel_pixels,12, 8, 32, sse4);
  129. mc_rep_funcs(pel_pixels,12, 8, 24, sse4);
  130. mc_rep_funcs(pel_pixels,12, 8, 16, sse4);
  131. mc_rep_funcs(pel_pixels,12, 4, 12, sse4);
  132. mc_rep_funcs(epel_h, 8, 16, 64, sse4);
  133. mc_rep_funcs(epel_h, 8, 16, 48, sse4);
  134. mc_rep_funcs(epel_h, 8, 16, 32, sse4);
  135. mc_rep_funcs(epel_h, 8, 8, 24, sse4);
  136. mc_rep_funcs(epel_h,10, 8, 64, sse4);
  137. mc_rep_funcs(epel_h,10, 8, 48, sse4);
  138. mc_rep_funcs(epel_h,10, 8, 32, sse4);
  139. mc_rep_funcs(epel_h,10, 8, 24, sse4);
  140. mc_rep_funcs(epel_h,10, 8, 16, sse4);
  141. mc_rep_funcs(epel_h,10, 4, 12, sse4);
  142. mc_rep_funcs(epel_h,12, 8, 64, sse4);
  143. mc_rep_funcs(epel_h,12, 8, 48, sse4);
  144. mc_rep_funcs(epel_h,12, 8, 32, sse4);
  145. mc_rep_funcs(epel_h,12, 8, 24, sse4);
  146. mc_rep_funcs(epel_h,12, 8, 16, sse4);
  147. mc_rep_funcs(epel_h,12, 4, 12, sse4);
  148. mc_rep_funcs(epel_v, 8, 16, 64, sse4);
  149. mc_rep_funcs(epel_v, 8, 16, 48, sse4);
  150. mc_rep_funcs(epel_v, 8, 16, 32, sse4);
  151. mc_rep_funcs(epel_v, 8, 8, 24, sse4);
  152. mc_rep_funcs(epel_v,10, 8, 64, sse4);
  153. mc_rep_funcs(epel_v,10, 8, 48, sse4);
  154. mc_rep_funcs(epel_v,10, 8, 32, sse4);
  155. mc_rep_funcs(epel_v,10, 8, 24, sse4);
  156. mc_rep_funcs(epel_v,10, 8, 16, sse4);
  157. mc_rep_funcs(epel_v,10, 4, 12, sse4);
  158. mc_rep_funcs(epel_v,12, 8, 64, sse4);
  159. mc_rep_funcs(epel_v,12, 8, 48, sse4);
  160. mc_rep_funcs(epel_v,12, 8, 32, sse4);
  161. mc_rep_funcs(epel_v,12, 8, 24, sse4);
  162. mc_rep_funcs(epel_v,12, 8, 16, sse4);
  163. mc_rep_funcs(epel_v,12, 4, 12, sse4);
  164. mc_rep_funcs(epel_hv, 8, 8, 64, sse4);
  165. mc_rep_funcs(epel_hv, 8, 8, 48, sse4);
  166. mc_rep_funcs(epel_hv, 8, 8, 32, sse4);
  167. mc_rep_funcs(epel_hv, 8, 8, 24, sse4);
  168. mc_rep_funcs(epel_hv, 8, 8, 16, sse4);
  169. mc_rep_funcs(epel_hv, 8, 4, 12, sse4);
  170. mc_rep_funcs(epel_hv,10, 8, 64, sse4);
  171. mc_rep_funcs(epel_hv,10, 8, 48, sse4);
  172. mc_rep_funcs(epel_hv,10, 8, 32, sse4);
  173. mc_rep_funcs(epel_hv,10, 8, 24, sse4);
  174. mc_rep_funcs(epel_hv,10, 8, 16, sse4);
  175. mc_rep_funcs(epel_hv,10, 4, 12, sse4);
  176. mc_rep_funcs(epel_hv,12, 8, 64, sse4);
  177. mc_rep_funcs(epel_hv,12, 8, 48, sse4);
  178. mc_rep_funcs(epel_hv,12, 8, 32, sse4);
  179. mc_rep_funcs(epel_hv,12, 8, 24, sse4);
  180. mc_rep_funcs(epel_hv,12, 8, 16, sse4);
  181. mc_rep_funcs(epel_hv,12, 4, 12, sse4);
  182. mc_rep_funcs(qpel_h, 8, 16, 64, sse4);
  183. mc_rep_funcs(qpel_h, 8, 16, 48, sse4);
  184. mc_rep_funcs(qpel_h, 8, 16, 32, sse4);
  185. mc_rep_funcs(qpel_h, 8, 8, 24, sse4);
  186. mc_rep_funcs(qpel_h,10, 8, 64, sse4);
  187. mc_rep_funcs(qpel_h,10, 8, 48, sse4);
  188. mc_rep_funcs(qpel_h,10, 8, 32, sse4);
  189. mc_rep_funcs(qpel_h,10, 8, 24, sse4);
  190. mc_rep_funcs(qpel_h,10, 8, 16, sse4);
  191. mc_rep_funcs(qpel_h,10, 4, 12, sse4);
  192. mc_rep_funcs(qpel_h,12, 8, 64, sse4);
  193. mc_rep_funcs(qpel_h,12, 8, 48, sse4);
  194. mc_rep_funcs(qpel_h,12, 8, 32, sse4);
  195. mc_rep_funcs(qpel_h,12, 8, 24, sse4);
  196. mc_rep_funcs(qpel_h,12, 8, 16, sse4);
  197. mc_rep_funcs(qpel_h,12, 4, 12, sse4);
  198. mc_rep_funcs(qpel_v, 8, 16, 64, sse4);
  199. mc_rep_funcs(qpel_v, 8, 16, 48, sse4);
  200. mc_rep_funcs(qpel_v, 8, 16, 32, sse4);
  201. mc_rep_funcs(qpel_v, 8, 8, 24, sse4);
  202. mc_rep_funcs(qpel_v,10, 8, 64, sse4);
  203. mc_rep_funcs(qpel_v,10, 8, 48, sse4);
  204. mc_rep_funcs(qpel_v,10, 8, 32, sse4);
  205. mc_rep_funcs(qpel_v,10, 8, 24, sse4);
  206. mc_rep_funcs(qpel_v,10, 8, 16, sse4);
  207. mc_rep_funcs(qpel_v,10, 4, 12, sse4);
  208. mc_rep_funcs(qpel_v,12, 8, 64, sse4);
  209. mc_rep_funcs(qpel_v,12, 8, 48, sse4);
  210. mc_rep_funcs(qpel_v,12, 8, 32, sse4);
  211. mc_rep_funcs(qpel_v,12, 8, 24, sse4);
  212. mc_rep_funcs(qpel_v,12, 8, 16, sse4);
  213. mc_rep_funcs(qpel_v,12, 4, 12, sse4);
  214. mc_rep_funcs(qpel_hv, 8, 8, 64, sse4);
  215. mc_rep_funcs(qpel_hv, 8, 8, 48, sse4);
  216. mc_rep_funcs(qpel_hv, 8, 8, 32, sse4);
  217. mc_rep_funcs(qpel_hv, 8, 8, 24, sse4);
  218. mc_rep_funcs(qpel_hv, 8, 8, 16, sse4);
  219. mc_rep_funcs(qpel_hv, 8, 4, 12, sse4);
  220. mc_rep_funcs(qpel_hv,10, 8, 64, sse4);
  221. mc_rep_funcs(qpel_hv,10, 8, 48, sse4);
  222. mc_rep_funcs(qpel_hv,10, 8, 32, sse4);
  223. mc_rep_funcs(qpel_hv,10, 8, 24, sse4);
  224. mc_rep_funcs(qpel_hv,10, 8, 16, sse4);
  225. mc_rep_funcs(qpel_hv,10, 4, 12, sse4);
  226. mc_rep_funcs(qpel_hv,12, 8, 64, sse4);
  227. mc_rep_funcs(qpel_hv,12, 8, 48, sse4);
  228. mc_rep_funcs(qpel_hv,12, 8, 32, sse4);
  229. mc_rep_funcs(qpel_hv,12, 8, 24, sse4);
  230. mc_rep_funcs(qpel_hv,12, 8, 16, sse4);
  231. mc_rep_funcs(qpel_hv,12, 4, 12, sse4);
  /*
   * Define ff_hevc_put_hevc_uni_w<W>_<bitd>_<opt>() by tiling the step-wide
   * kernel ff_hevc_put_hevc_uni_w<step>_<bitd>_<opt>().  The int16_t source
   * advances by `step` elements per column and the byte destination by
   * step * ((bitd + 7) / 8) bytes; height, denom, _wx and _ox are forwarded
   * unchanged.
   */
  #define mc_rep_uni_w(bitd, step, W, opt) \
  void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
                                                  int16_t *_src, ptrdiff_t _srcstride, \
                                                  int height, int denom, int _wx, int _ox) \
  { \
      const int sample_bytes = (bitd + 7) / 8; \
      int x; \
      for (x = 0; x < W; x += step) \
          ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(_dst + (x * sample_bytes), dststride, \
                                                        _src + x, _srcstride, \
                                                        height, denom, _wx, _ox); \
  }

  /* Width 12 is built from the 6-wide kernel, all larger widths from the 8-wide one. */
  #define MC_REP_UNI_W_WIDTHS(bitd, opt) \
      mc_rep_uni_w(bitd, 6, 12, opt); \
      mc_rep_uni_w(bitd, 8, 16, opt); \
      mc_rep_uni_w(bitd, 8, 24, opt); \
      mc_rep_uni_w(bitd, 8, 32, opt); \
      mc_rep_uni_w(bitd, 8, 48, opt); \
      mc_rep_uni_w(bitd, 8, 64, opt)

  MC_REP_UNI_W_WIDTHS( 8, sse4);
  MC_REP_UNI_W_WIDTHS(10, sse4);
  MC_REP_UNI_W_WIDTHS(12, sse4);
  /*
   * Define ff_hevc_put_hevc_bi_w<W>_<bitd>_<opt>() by tiling the step-wide
   * kernel ff_hevc_put_hevc_bi_w<step>_<bitd>_<opt>().  Both int16_t sources
   * advance by `step` elements per column and the byte destination by
   * step * ((bitd + 7) / 8) bytes; height, denom and the weight/offset
   * arguments are forwarded unchanged.
   */
  #define mc_rep_bi_w(bitd, step, W, opt) \
  void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
                                                 int16_t *_src, ptrdiff_t _srcstride, \
                                                 int16_t *_src2, int height, \
                                                 int denom, int _wx0, int _wx1, \
                                                 int _ox0, int _ox1) \
  { \
      const int sample_bytes = (bitd + 7) / 8; \
      int x; \
      for (x = 0; x < W; x += step) \
          ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(_dst + (x * sample_bytes), dststride, \
                                                       _src + x, _srcstride, _src2 + x, \
                                                       height, denom, \
                                                       _wx0, _wx1, _ox0, _ox1); \
  }

  /* Width 12 is built from the 6-wide kernel, all larger widths from the 8-wide one. */
  #define MC_REP_BI_W_WIDTHS(bitd, opt) \
      mc_rep_bi_w(bitd, 6, 12, opt); \
      mc_rep_bi_w(bitd, 8, 16, opt); \
      mc_rep_bi_w(bitd, 8, 24, opt); \
      mc_rep_bi_w(bitd, 8, 32, opt); \
      mc_rep_bi_w(bitd, 8, 48, opt); \
      mc_rep_bi_w(bitd, 8, 64, opt)

  MC_REP_BI_W_WIDTHS( 8, sse4);
  MC_REP_BI_W_WIDTHS(10, sse4);
  MC_REP_BI_W_WIDTHS(12, sse4);
  /*
   * Define ff_hevc_put_hevc_uni_w_<name><W>_<bitd>_<opt>() as a two-stage
   * pipeline:
   *   1. ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() writes its int16_t output
   *      into a 16-byte aligned local buffer of 71 * MAX_PB_SIZE elements;
   *   2. ff_hevc_put_hevc_uni_w<W>_<bitd>_<opt>() reads that buffer, with
   *      MAX_PB_SIZE passed as its source stride, and writes to _dst using
   *      denom, _wx and _ox.
   */
  #define mc_uni_w_func(name, bitd, W, opt) \
  void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
                                                         uint8_t *_src, ptrdiff_t _srcstride, \
                                                         int height, int denom, \
                                                         int _wx, int _ox, \
                                                         intptr_t mx, intptr_t my, int width) \
  { \
      LOCAL_ALIGNED_16(int16_t, pred, [71 * MAX_PB_SIZE]); \
      ff_hevc_put_hevc_##name##W##_##bitd##_##opt(pred, _src, _srcstride, \
                                                  height, mx, my, width); \
      ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, pred, MAX_PB_SIZE, \
                                                 height, denom, _wx, _ox); \
  }
  310. #define mc_uni_w_funcs(name, bitd, opt) \
  311. mc_uni_w_func(name, bitd, 4, opt); \
  312. mc_uni_w_func(name, bitd, 8, opt); \
  313. mc_uni_w_func(name, bitd, 12, opt); \
  314. mc_uni_w_func(name, bitd, 16, opt); \
  315. mc_uni_w_func(name, bitd, 24, opt); \
  316. mc_uni_w_func(name, bitd, 32, opt); \
  317. mc_uni_w_func(name, bitd, 48, opt); \
  318. mc_uni_w_func(name, bitd, 64, opt)
  319. mc_uni_w_funcs(pel_pixels, 8, sse4);
  320. mc_uni_w_func(pel_pixels, 8, 6, sse4);
  321. mc_uni_w_funcs(epel_h, 8, sse4);
  322. mc_uni_w_func(epel_h, 8, 6, sse4);
  323. mc_uni_w_funcs(epel_v, 8, sse4);
  324. mc_uni_w_func(epel_v, 8, 6, sse4);
  325. mc_uni_w_funcs(epel_hv, 8, sse4);
  326. mc_uni_w_func(epel_hv, 8, 6, sse4);
  327. mc_uni_w_funcs(qpel_h, 8, sse4);
  328. mc_uni_w_funcs(qpel_v, 8, sse4);
  329. mc_uni_w_funcs(qpel_hv, 8, sse4);
  330. mc_uni_w_funcs(pel_pixels, 10, sse4);
  331. mc_uni_w_func(pel_pixels, 10, 6, sse4);
  332. mc_uni_w_funcs(epel_h, 10, sse4);
  333. mc_uni_w_func(epel_h, 10, 6, sse4);
  334. mc_uni_w_funcs(epel_v, 10, sse4);
  335. mc_uni_w_func(epel_v, 10, 6, sse4);
  336. mc_uni_w_funcs(epel_hv, 10, sse4);
  337. mc_uni_w_func(epel_hv, 10, 6, sse4);
  338. mc_uni_w_funcs(qpel_h, 10, sse4);
  339. mc_uni_w_funcs(qpel_v, 10, sse4);
  340. mc_uni_w_funcs(qpel_hv, 10, sse4);
  341. mc_uni_w_funcs(pel_pixels, 12, sse4);
  342. mc_uni_w_func(pel_pixels, 12, 6, sse4);
  343. mc_uni_w_funcs(epel_h, 12, sse4);
  344. mc_uni_w_func(epel_h, 12, 6, sse4);
  345. mc_uni_w_funcs(epel_v, 12, sse4);
  346. mc_uni_w_func(epel_v, 12, 6, sse4);
  347. mc_uni_w_funcs(epel_hv, 12, sse4);
  348. mc_uni_w_func(epel_hv, 12, 6, sse4);
  349. mc_uni_w_funcs(qpel_h, 12, sse4);
  350. mc_uni_w_funcs(qpel_v, 12, sse4);
  351. mc_uni_w_funcs(qpel_hv, 12, sse4);
  /*
   * Define ff_hevc_put_hevc_bi_w_<name><W>_<bitd>_<opt>() as a two-stage
   * pipeline:
   *   1. ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() writes its int16_t output
   *      into a 16-byte aligned local buffer of 71 * MAX_PB_SIZE elements;
   *   2. ff_hevc_put_hevc_bi_w<W>_<bitd>_<opt>() combines that buffer (source
   *      stride MAX_PB_SIZE) with _src2 and writes to _dst using denom and
   *      the two weight/offset pairs.
   */
  #define mc_bi_w_func(name, bitd, W, opt) \
  void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
                                                        uint8_t *_src, ptrdiff_t _srcstride, \
                                                        int16_t *_src2, \
                                                        int height, int denom, \
                                                        int _wx0, int _wx1, \
                                                        int _ox0, int _ox1, \
                                                        intptr_t mx, intptr_t my, int width) \
  { \
      LOCAL_ALIGNED_16(int16_t, pred, [71 * MAX_PB_SIZE]); \
      ff_hevc_put_hevc_##name##W##_##bitd##_##opt(pred, _src, _srcstride, \
                                                  height, mx, my, width); \
      ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, pred, MAX_PB_SIZE, _src2, \
                                                height, denom, \
                                                _wx0, _wx1, _ox0, _ox1); \
  }
  365. #define mc_bi_w_funcs(name, bitd, opt) \
  366. mc_bi_w_func(name, bitd, 4, opt); \
  367. mc_bi_w_func(name, bitd, 8, opt); \
  368. mc_bi_w_func(name, bitd, 12, opt); \
  369. mc_bi_w_func(name, bitd, 16, opt); \
  370. mc_bi_w_func(name, bitd, 24, opt); \
  371. mc_bi_w_func(name, bitd, 32, opt); \
  372. mc_bi_w_func(name, bitd, 48, opt); \
  373. mc_bi_w_func(name, bitd, 64, opt)
  374. mc_bi_w_funcs(pel_pixels, 8, sse4);
  375. mc_bi_w_func(pel_pixels, 8, 6, sse4);
  376. mc_bi_w_funcs(epel_h, 8, sse4);
  377. mc_bi_w_func(epel_h, 8, 6, sse4);
  378. mc_bi_w_funcs(epel_v, 8, sse4);
  379. mc_bi_w_func(epel_v, 8, 6, sse4);
  380. mc_bi_w_funcs(epel_hv, 8, sse4);
  381. mc_bi_w_func(epel_hv, 8, 6, sse4);
  382. mc_bi_w_funcs(qpel_h, 8, sse4);
  383. mc_bi_w_funcs(qpel_v, 8, sse4);
  384. mc_bi_w_funcs(qpel_hv, 8, sse4);
  385. mc_bi_w_funcs(pel_pixels, 10, sse4);
  386. mc_bi_w_func(pel_pixels, 10, 6, sse4);
  387. mc_bi_w_funcs(epel_h, 10, sse4);
  388. mc_bi_w_func(epel_h, 10, 6, sse4);
  389. mc_bi_w_funcs(epel_v, 10, sse4);
  390. mc_bi_w_func(epel_v, 10, 6, sse4);
  391. mc_bi_w_funcs(epel_hv, 10, sse4);
  392. mc_bi_w_func(epel_hv, 10, 6, sse4);
  393. mc_bi_w_funcs(qpel_h, 10, sse4);
  394. mc_bi_w_funcs(qpel_v, 10, sse4);
  395. mc_bi_w_funcs(qpel_hv, 10, sse4);
  396. mc_bi_w_funcs(pel_pixels, 12, sse4);
  397. mc_bi_w_func(pel_pixels, 12, 6, sse4);
  398. mc_bi_w_funcs(epel_h, 12, sse4);
  399. mc_bi_w_func(epel_h, 12, 6, sse4);
  400. mc_bi_w_funcs(epel_v, 12, sse4);
  401. mc_bi_w_func(epel_v, 12, 6, sse4);
  402. mc_bi_w_funcs(epel_hv, 12, sse4);
  403. mc_bi_w_func(epel_hv, 12, 6, sse4);
  404. mc_bi_w_funcs(qpel_h, 12, sse4);
  405. mc_bi_w_funcs(qpel_v, 12, sse4);
  406. mc_bi_w_funcs(qpel_hv, 12, sse4);
  407. #endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
  /*
   * Apply PEL_LINK once per supported epel block width.  The first index
   * passed to PEL_LINK selects the width: 1..9 correspond to widths
   * 4, 6, 8, 12, 16, 24, 32, 48 and 64; `my` and `mx` are passed through as
   * the second and third index.  PEL_LINK itself is not defined in this file.
   */
  #define EPEL_LINKS(pointer, my, mx, fname, bitd, opt) \
      PEL_LINK(pointer, 1, my, mx, fname##4,  bitd, opt); \
      PEL_LINK(pointer, 2, my, mx, fname##6,  bitd, opt); \
      PEL_LINK(pointer, 3, my, mx, fname##8,  bitd, opt); \
      PEL_LINK(pointer, 4, my, mx, fname##12, bitd, opt); \
      PEL_LINK(pointer, 5, my, mx, fname##16, bitd, opt); \
      PEL_LINK(pointer, 6, my, mx, fname##24, bitd, opt); \
      PEL_LINK(pointer, 7, my, mx, fname##32, bitd, opt); \
      PEL_LINK(pointer, 8, my, mx, fname##48, bitd, opt); \
      PEL_LINK(pointer, 9, my, mx, fname##64, bitd, opt)
  /*
   * Apply PEL_LINK once per supported qpel block width.  Same index-to-width
   * mapping as the epel variant (1 -> 4, 3 -> 8, 4 -> 12, 5 -> 16, 6 -> 24,
   * 7 -> 32, 8 -> 48, 9 -> 64), except that index 2 (width 6) is left
   * untouched.  PEL_LINK itself is not defined in this file.
   */
  #define QPEL_LINKS(pointer, my, mx, fname, bitd, opt) \
      PEL_LINK(pointer, 1, my, mx, fname##4,  bitd, opt); \
      PEL_LINK(pointer, 3, my, mx, fname##8,  bitd, opt); \
      PEL_LINK(pointer, 4, my, mx, fname##12, bitd, opt); \
      PEL_LINK(pointer, 5, my, mx, fname##16, bitd, opt); \
      PEL_LINK(pointer, 6, my, mx, fname##24, bitd, opt); \
      PEL_LINK(pointer, 7, my, mx, fname##32, bitd, opt); \
      PEL_LINK(pointer, 8, my, mx, fname##48, bitd, opt); \
      PEL_LINK(pointer, 9, my, mx, fname##64, bitd, opt)
  427. void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
  428. {
  429. int cpu_flags = av_get_cpu_flags();
  430. if (bit_depth == 8) {
  431. if (EXTERNAL_MMXEXT(cpu_flags)) {
  432. c->idct_dc[0] = ff_hevc_idct4x4_dc_8_mmxext;
  433. c->idct_dc[1] = ff_hevc_idct8x8_dc_8_mmxext;
  434. c->transform_add[0] = ff_hevc_transform_add4_8_mmxext;
  435. }
  436. if (EXTERNAL_SSE2(cpu_flags)) {
  437. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
  438. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
  439. if (ARCH_X86_64) {
  440. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
  441. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
  442. }
  443. c->idct_dc[1] = ff_hevc_idct8x8_dc_8_sse2;
  444. c->idct_dc[2] = ff_hevc_idct16x16_dc_8_sse2;
  445. c->idct_dc[3] = ff_hevc_idct32x32_dc_8_sse2;
  446. c->transform_add[1] = ff_hevc_transform_add8_8_sse2;
  447. c->transform_add[2] = ff_hevc_transform_add16_8_sse2;
  448. c->transform_add[3] = ff_hevc_transform_add32_8_sse2;
  449. }
  450. if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
  451. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
  452. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
  453. }
  454. if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
  455. EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
  456. EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4);
  457. EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4);
  458. EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4);
  459. QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
  460. QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4);
  461. QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
  462. QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
  463. }
  464. if (EXTERNAL_AVX(cpu_flags)) {
  465. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
  466. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
  467. if (ARCH_X86_64) {
  468. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
  469. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
  470. }
  471. c->transform_add[1] = ff_hevc_transform_add8_8_avx;
  472. c->transform_add[2] = ff_hevc_transform_add16_8_avx;
  473. c->transform_add[3] = ff_hevc_transform_add32_8_avx;
  474. }
  475. if (EXTERNAL_AVX2(cpu_flags)) {
  476. c->idct_dc[2] = ff_hevc_idct16x16_dc_8_avx2;
  477. c->idct_dc[3] = ff_hevc_idct32x32_dc_8_avx2;
  478. }
  479. } else if (bit_depth == 10) {
  480. if (EXTERNAL_MMXEXT(cpu_flags)) {
  481. c->transform_add[0] = ff_hevc_transform_add4_10_mmxext;
  482. c->idct_dc[0] = ff_hevc_idct4x4_dc_10_mmxext;
  483. c->idct_dc[1] = ff_hevc_idct8x8_dc_10_mmxext;
  484. }
  485. if (EXTERNAL_SSE2(cpu_flags)) {
  486. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
  487. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
  488. if (ARCH_X86_64) {
  489. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
  490. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
  491. }
  492. c->idct_dc[1] = ff_hevc_idct8x8_dc_10_sse2;
  493. c->idct_dc[2] = ff_hevc_idct16x16_dc_10_sse2;
  494. c->idct_dc[3] = ff_hevc_idct32x32_dc_10_sse2;
  495. c->transform_add[1] = ff_hevc_transform_add8_10_sse2;
  496. c->transform_add[2] = ff_hevc_transform_add16_10_sse2;
  497. c->transform_add[3] = ff_hevc_transform_add32_10_sse2;
  498. }
  499. if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
  500. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
  501. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
  502. }
  503. if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
  504. EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
  505. EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4);
  506. EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4);
  507. EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4);
  508. QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
  509. QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4);
  510. QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4);
  511. QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4);
  512. }
  513. if (EXTERNAL_AVX(cpu_flags)) {
  514. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
  515. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
  516. if (ARCH_X86_64) {
  517. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
  518. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
  519. }
  520. }
  521. if (EXTERNAL_AVX2(cpu_flags)) {
  522. c->idct_dc[2] = ff_hevc_idct16x16_dc_10_avx2;
  523. c->idct_dc[3] = ff_hevc_idct32x32_dc_10_avx2;
  524. c->transform_add[2] = ff_hevc_transform_add16_10_avx2;
  525. c->transform_add[3] = ff_hevc_transform_add32_10_avx2;
  526. }
  527. } else if (bit_depth == 12) {
  528. if (EXTERNAL_MMXEXT(cpu_flags)) {
  529. c->idct_dc[0] = ff_hevc_idct4x4_dc_12_mmxext;
  530. c->idct_dc[1] = ff_hevc_idct8x8_dc_12_mmxext;
  531. }
  532. if (EXTERNAL_SSE2(cpu_flags)) {
  533. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
  534. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
  535. if (ARCH_X86_64) {
  536. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
  537. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
  538. }
  539. c->idct_dc[1] = ff_hevc_idct8x8_dc_12_sse2;
  540. c->idct_dc[2] = ff_hevc_idct16x16_dc_12_sse2;
  541. c->idct_dc[3] = ff_hevc_idct32x32_dc_12_sse2;
  542. }
  543. if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
  544. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
  545. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
  546. }
  547. if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
  548. EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
  549. EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
  550. EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4);
  551. EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4);
  552. QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
  553. QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4);
  554. QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4);
  555. QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4);
  556. }
  557. if (EXTERNAL_AVX(cpu_flags)) {
  558. c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
  559. c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
  560. if (ARCH_X86_64) {
  561. c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
  562. c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
  563. }
  564. }
  565. if (EXTERNAL_AVX2(cpu_flags)) {
  566. c->idct_dc[2] = ff_hevc_idct16x16_dc_12_avx2;
  567. c->idct_dc[3] = ff_hevc_idct32x32_dc_12_avx2;
  568. }
  569. }
  570. }