You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

530 lines
15KB

  1. /*
  2. * This file is part of FFmpeg.
  3. *
  4. * FFmpeg is free software; you can redistribute it and/or
  5. * modify it under the terms of the GNU Lesser General Public
  6. * License as published by the Free Software Foundation; either
  7. * version 2.1 of the License, or (at your option) any later version.
  8. *
  9. * FFmpeg is distributed in the hope that it will be useful,
  10. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  12. * Lesser General Public License for more details.
  13. *
  14. * You should have received a copy of the GNU Lesser General Public
  15. * License along with FFmpeg; if not, write to the Free Software
  16. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  17. */
  18. #include "buffer.h"
  19. #include "common.h"
  20. #include "hwcontext.h"
  21. #include "hwcontext_internal.h"
  22. #include "hwcontext_cuda_internal.h"
  23. #if CONFIG_VULKAN
  24. #include "hwcontext_vulkan.h"
  25. #endif
  26. #include "cuda_check.h"
  27. #include "mem.h"
  28. #include "pixdesc.h"
  29. #include "pixfmt.h"
  30. #include "imgutils.h"
  31. #define CUDA_FRAME_ALIGNMENT 256
/* Per-frames-context private data: chroma subsampling log2 shifts of the
 * software pixel format, cached by cuda_frames_init() and used to compute
 * per-plane copy heights in the transfer functions. */
typedef struct CUDAFramesContext {
    int shift_width, shift_height;
} CUDAFramesContext;
/* Software pixel formats that can back an AV_PIX_FMT_CUDA frame.
 * cuda_frames_init() rejects anything not listed here. */
static const enum AVPixelFormat supported_formats[] = {
    AV_PIX_FMT_NV12,
    AV_PIX_FMT_YUV420P,
    AV_PIX_FMT_YUVA420P,
    AV_PIX_FMT_YUV444P,
    AV_PIX_FMT_P010,
    AV_PIX_FMT_P016,
    AV_PIX_FMT_YUV444P16,
    AV_PIX_FMT_0RGB32,
    AV_PIX_FMT_0BGR32,
#if CONFIG_VULKAN
    AV_PIX_FMT_VULKAN,
#endif
};
  49. #define CHECK_CU(x) FF_CUDA_CHECK_DL(device_ctx, cu, x)
  50. static int cuda_frames_get_constraints(AVHWDeviceContext *ctx,
  51. const void *hwconfig,
  52. AVHWFramesConstraints *constraints)
  53. {
  54. int i;
  55. constraints->valid_sw_formats = av_malloc_array(FF_ARRAY_ELEMS(supported_formats) + 1,
  56. sizeof(*constraints->valid_sw_formats));
  57. if (!constraints->valid_sw_formats)
  58. return AVERROR(ENOMEM);
  59. for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
  60. constraints->valid_sw_formats[i] = supported_formats[i];
  61. constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_formats)] = AV_PIX_FMT_NONE;
  62. constraints->valid_hw_formats = av_malloc_array(2, sizeof(*constraints->valid_hw_formats));
  63. if (!constraints->valid_hw_formats)
  64. return AVERROR(ENOMEM);
  65. constraints->valid_hw_formats[0] = AV_PIX_FMT_CUDA;
  66. constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE;
  67. return 0;
  68. }
  69. static void cuda_buffer_free(void *opaque, uint8_t *data)
  70. {
  71. AVHWFramesContext *ctx = opaque;
  72. AVHWDeviceContext *device_ctx = ctx->device_ctx;
  73. AVCUDADeviceContext *hwctx = device_ctx->hwctx;
  74. CudaFunctions *cu = hwctx->internal->cuda_dl;
  75. CUcontext dummy;
  76. CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
  77. CHECK_CU(cu->cuMemFree((CUdeviceptr)data));
  78. CHECK_CU(cu->cuCtxPopCurrent(&dummy));
  79. }
  80. static AVBufferRef *cuda_pool_alloc(void *opaque, int size)
  81. {
  82. AVHWFramesContext *ctx = opaque;
  83. AVHWDeviceContext *device_ctx = ctx->device_ctx;
  84. AVCUDADeviceContext *hwctx = device_ctx->hwctx;
  85. CudaFunctions *cu = hwctx->internal->cuda_dl;
  86. AVBufferRef *ret = NULL;
  87. CUcontext dummy = NULL;
  88. CUdeviceptr data;
  89. int err;
  90. err = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
  91. if (err < 0)
  92. return NULL;
  93. err = CHECK_CU(cu->cuMemAlloc(&data, size));
  94. if (err < 0)
  95. goto fail;
  96. ret = av_buffer_create((uint8_t*)data, size, cuda_buffer_free, ctx, 0);
  97. if (!ret) {
  98. CHECK_CU(cu->cuMemFree(data));
  99. goto fail;
  100. }
  101. fail:
  102. CHECK_CU(cu->cuCtxPopCurrent(&dummy));
  103. return ret;
  104. }
  105. static int cuda_frames_init(AVHWFramesContext *ctx)
  106. {
  107. CUDAFramesContext *priv = ctx->internal->priv;
  108. int i;
  109. for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) {
  110. if (ctx->sw_format == supported_formats[i])
  111. break;
  112. }
  113. if (i == FF_ARRAY_ELEMS(supported_formats)) {
  114. av_log(ctx, AV_LOG_ERROR, "Pixel format '%s' is not supported\n",
  115. av_get_pix_fmt_name(ctx->sw_format));
  116. return AVERROR(ENOSYS);
  117. }
  118. av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height);
  119. if (!ctx->pool) {
  120. int size = av_image_get_buffer_size(ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT);
  121. if (size < 0)
  122. return size;
  123. ctx->internal->pool_internal = av_buffer_pool_init2(size, ctx, cuda_pool_alloc, NULL);
  124. if (!ctx->internal->pool_internal)
  125. return AVERROR(ENOMEM);
  126. }
  127. return 0;
  128. }
/* Hand out a frame backed by one pooled device allocation: fill data[]
 * and linesize[] as for a system-memory image of sw_format, then apply
 * the NVENC-specific YUV420P fixup. Returns 0 or a negative AVERROR. */
static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
{
    int res;

    frame->buf[0] = av_buffer_pool_get(ctx->pool);
    if (!frame->buf[0])
        return AVERROR(ENOMEM);

    /* Lay the planes out inside the single device allocation exactly as
     * av_image_fill_arrays() would for host memory. */
    res = av_image_fill_arrays(frame->data, frame->linesize, frame->buf[0]->data,
                               ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT);
    if (res < 0)
        return res;

    // YUV420P is a special case.
    // Nvenc expects the U/V planes in swapped order from how ffmpeg expects them, also chroma is half-aligned
    if (ctx->sw_format == AV_PIX_FMT_YUV420P) {
        /* Halve the chroma pitch, then swap: V takes the start of the
         * region the fill assigned to U, and U follows one chroma plane
         * (linesize[2] * height/2 bytes) after it. The two assignments
         * below are order-dependent. */
        frame->linesize[1] = frame->linesize[2] = frame->linesize[0] / 2;
        frame->data[2] = frame->data[1];
        frame->data[1] = frame->data[2] + frame->linesize[2] * ctx->height / 2;
    }

    frame->format = AV_PIX_FMT_CUDA;
    frame->width  = ctx->width;
    frame->height = ctx->height;

    return 0;
}
  151. static int cuda_transfer_get_formats(AVHWFramesContext *ctx,
  152. enum AVHWFrameTransferDirection dir,
  153. enum AVPixelFormat **formats)
  154. {
  155. enum AVPixelFormat *fmts;
  156. fmts = av_malloc_array(2, sizeof(*fmts));
  157. if (!fmts)
  158. return AVERROR(ENOMEM);
  159. fmts[0] = ctx->sw_format;
  160. fmts[1] = AV_PIX_FMT_NONE;
  161. *formats = fmts;
  162. return 0;
  163. }
  164. static int cuda_transfer_data_from(AVHWFramesContext *ctx, AVFrame *dst,
  165. const AVFrame *src)
  166. {
  167. CUDAFramesContext *priv = ctx->internal->priv;
  168. AVHWDeviceContext *device_ctx = ctx->device_ctx;
  169. AVCUDADeviceContext *hwctx = device_ctx->hwctx;
  170. CudaFunctions *cu = hwctx->internal->cuda_dl;
  171. CUcontext dummy;
  172. int i, ret;
  173. /* We don't support transfers to HW devices. */
  174. if (dst->hw_frames_ctx)
  175. return AVERROR(ENOSYS);
  176. ret = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
  177. if (ret < 0)
  178. return ret;
  179. for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) {
  180. CUDA_MEMCPY2D cpy = {
  181. .srcMemoryType = CU_MEMORYTYPE_DEVICE,
  182. .dstMemoryType = CU_MEMORYTYPE_HOST,
  183. .srcDevice = (CUdeviceptr)src->data[i],
  184. .dstHost = dst->data[i],
  185. .srcPitch = src->linesize[i],
  186. .dstPitch = dst->linesize[i],
  187. .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]),
  188. .Height = src->height >> (i ? priv->shift_height : 0),
  189. };
  190. ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream));
  191. if (ret < 0)
  192. goto exit;
  193. }
  194. ret = CHECK_CU(cu->cuStreamSynchronize(hwctx->stream));
  195. if (ret < 0)
  196. goto exit;
  197. exit:
  198. CHECK_CU(cu->cuCtxPopCurrent(&dummy));
  199. return 0;
  200. }
  201. static int cuda_transfer_data_to(AVHWFramesContext *ctx, AVFrame *dst,
  202. const AVFrame *src)
  203. {
  204. CUDAFramesContext *priv = ctx->internal->priv;
  205. AVHWDeviceContext *device_ctx = ctx->device_ctx;
  206. AVCUDADeviceContext *hwctx = device_ctx->hwctx;
  207. CudaFunctions *cu = hwctx->internal->cuda_dl;
  208. CUcontext dummy;
  209. int i, ret;
  210. /* We don't support transfers from HW devices. */
  211. if (src->hw_frames_ctx)
  212. return AVERROR(ENOSYS);
  213. ret = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
  214. if (ret < 0)
  215. return ret;
  216. for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) {
  217. CUDA_MEMCPY2D cpy = {
  218. .srcMemoryType = CU_MEMORYTYPE_HOST,
  219. .dstMemoryType = CU_MEMORYTYPE_DEVICE,
  220. .srcHost = src->data[i],
  221. .dstDevice = (CUdeviceptr)dst->data[i],
  222. .srcPitch = src->linesize[i],
  223. .dstPitch = dst->linesize[i],
  224. .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]),
  225. .Height = src->height >> ((i == 0 || i == 3) ? 0 : priv->shift_height),
  226. };
  227. ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream));
  228. if (ret < 0)
  229. goto exit;
  230. }
  231. exit:
  232. CHECK_CU(cu->cuCtxPopCurrent(&dummy));
  233. return 0;
  234. }
  235. static void cuda_device_uninit(AVHWDeviceContext *device_ctx)
  236. {
  237. AVCUDADeviceContext *hwctx = device_ctx->hwctx;
  238. if (hwctx->internal) {
  239. CudaFunctions *cu = hwctx->internal->cuda_dl;
  240. if (hwctx->internal->is_allocated && hwctx->cuda_ctx) {
  241. if (hwctx->internal->flags & AV_CUDA_USE_PRIMARY_CONTEXT)
  242. CHECK_CU(cu->cuDevicePrimaryCtxRelease(hwctx->internal->cuda_device));
  243. else
  244. CHECK_CU(cu->cuCtxDestroy(hwctx->cuda_ctx));
  245. hwctx->cuda_ctx = NULL;
  246. }
  247. cuda_free_functions(&hwctx->internal->cuda_dl);
  248. }
  249. av_freep(&hwctx->internal);
  250. }
  251. static int cuda_device_init(AVHWDeviceContext *ctx)
  252. {
  253. AVCUDADeviceContext *hwctx = ctx->hwctx;
  254. int ret;
  255. if (!hwctx->internal) {
  256. hwctx->internal = av_mallocz(sizeof(*hwctx->internal));
  257. if (!hwctx->internal)
  258. return AVERROR(ENOMEM);
  259. }
  260. if (!hwctx->internal->cuda_dl) {
  261. ret = cuda_load_functions(&hwctx->internal->cuda_dl, ctx);
  262. if (ret < 0) {
  263. av_log(ctx, AV_LOG_ERROR, "Could not dynamically load CUDA\n");
  264. goto error;
  265. }
  266. }
  267. return 0;
  268. error:
  269. cuda_device_uninit(ctx);
  270. return ret;
  271. }
/* Create or adopt the CUDA context for hwctx->internal->cuda_device.
 *
 * With AV_CUDA_USE_PRIMARY_CONTEXT the device's primary context is
 * retained, after verifying/setting its scheduling flags; otherwise a
 * dedicated context is created and immediately popped so the calling
 * thread's context stack is left unchanged. Marks the context as owned
 * (is_allocated) so cuda_device_uninit() knows to release it.
 * Returns 0 or a negative AVERROR code. */
static int cuda_context_init(AVHWDeviceContext *device_ctx, int flags) {
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu;
    CUcontext dummy;
    int ret, dev_active = 0;
    unsigned int dev_flags = 0;

    const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC;

    cu = hwctx->internal->cuda_dl;

    /* Remember which path was taken for cuda_device_uninit(). */
    hwctx->internal->flags = flags;

    if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) {
        ret = CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device,
                       &dev_flags, &dev_active));
        if (ret < 0)
            return ret;

        /* An already-active primary context cannot have its flags changed;
         * if they differ from what we need, we must give up. */
        if (dev_active && dev_flags != desired_flags) {
            av_log(device_ctx, AV_LOG_ERROR, "Primary context already active with incompatible flags.\n");
            return AVERROR(ENOTSUP);
        } else if (dev_flags != desired_flags) {
            /* Inactive with wrong flags: set them before retaining. */
            ret = CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device,
                           desired_flags));
            if (ret < 0)
                return ret;
        }

        ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx,
                                                    hwctx->internal->cuda_device));
        if (ret < 0)
            return ret;
    } else {
        ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags,
                                       hwctx->internal->cuda_device));
        if (ret < 0)
            return ret;

        /* cuCtxCreate makes the new context current; pop it so the
         * caller's thread state is unchanged. */
        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
    }

    hwctx->internal->is_allocated = 1;

    // Setting stream to NULL will make functions automatically use the default CUstream
    hwctx->stream = NULL;

    return 0;
}
  311. static int cuda_device_create(AVHWDeviceContext *device_ctx,
  312. const char *device,
  313. AVDictionary *opts, int flags)
  314. {
  315. AVCUDADeviceContext *hwctx = device_ctx->hwctx;
  316. CudaFunctions *cu;
  317. int ret, device_idx = 0;
  318. if (device)
  319. device_idx = strtol(device, NULL, 0);
  320. if (cuda_device_init(device_ctx) < 0)
  321. goto error;
  322. cu = hwctx->internal->cuda_dl;
  323. ret = CHECK_CU(cu->cuInit(0));
  324. if (ret < 0)
  325. goto error;
  326. ret = CHECK_CU(cu->cuDeviceGet(&hwctx->internal->cuda_device, device_idx));
  327. if (ret < 0)
  328. goto error;
  329. ret = cuda_context_init(device_ctx, flags);
  330. if (ret < 0)
  331. goto error;
  332. return 0;
  333. error:
  334. cuda_device_uninit(device_ctx);
  335. return AVERROR_UNKNOWN;
  336. }
/* Derive a CUDA device context from another hardware device context by
 * matching device UUIDs. Currently only Vulkan sources are supported
 * (via VkPhysicalDeviceIDProperties.deviceUUID); other source types
 * return AVERROR(ENOSYS). On any later failure, partially-initialized
 * state is torn down and AVERROR_UNKNOWN is returned. */
static int cuda_device_derive(AVHWDeviceContext *device_ctx,
                              AVHWDeviceContext *src_ctx,
                              int flags) {
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu;
    const char *src_uuid = NULL;
    int ret, i, device_count;

#if CONFIG_VULKAN
    VkPhysicalDeviceIDProperties vk_idp = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES,
    };
#endif

    /* Extract the source device's UUID, per source type. */
    switch (src_ctx->type) {
#if CONFIG_VULKAN
    case AV_HWDEVICE_TYPE_VULKAN: {
        AVVulkanDeviceContext *vkctx = src_ctx->hwctx;
        VkPhysicalDeviceProperties2 vk_dev_props = {
            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
            .pNext = &vk_idp,
        };
        vkGetPhysicalDeviceProperties2(vkctx->phys_dev, &vk_dev_props);
        src_uuid = vk_idp.deviceUUID;
        break;
    }
#endif
    default:
        return AVERROR(ENOSYS);
    }

    if (!src_uuid) {
        av_log(device_ctx, AV_LOG_ERROR,
               "Failed to get UUID of source device.\n");
        goto error;
    }

    if (cuda_device_init(device_ctx) < 0)
        goto error;

    cu = hwctx->internal->cuda_dl;

    ret = CHECK_CU(cu->cuInit(0));
    if (ret < 0)
        goto error;

    ret = CHECK_CU(cu->cuDeviceGetCount(&device_count));
    if (ret < 0)
        goto error;

    /* Scan all CUDA devices for a matching UUID; -1 marks "not found". */
    hwctx->internal->cuda_device = -1;
    for (i = 0; i < device_count; i++) {
        CUdevice dev;
        CUuuid uuid;

        ret = CHECK_CU(cu->cuDeviceGet(&dev, i));
        if (ret < 0)
            goto error;

        ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev));
        if (ret < 0)
            goto error;

        if (memcmp(src_uuid, uuid.bytes, sizeof (uuid.bytes)) == 0) {
            hwctx->internal->cuda_device = dev;
            break;
        }
    }

    if (hwctx->internal->cuda_device == -1) {
        av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA device.\n");
        goto error;
    }

    ret = cuda_context_init(device_ctx, flags);
    if (ret < 0)
        goto error;

    return 0;

error:
    cuda_device_uninit(device_ctx);
    return AVERROR_UNKNOWN;
}
/* Dispatch table registering the CUDA hwcontext implementation with the
 * generic AVHWDeviceContext/AVHWFramesContext machinery. */
const HWContextType ff_hwcontext_type_cuda = {
    .type                 = AV_HWDEVICE_TYPE_CUDA,
    .name                 = "CUDA",

    .device_hwctx_size    = sizeof(AVCUDADeviceContext),
    .frames_priv_size     = sizeof(CUDAFramesContext),

    .device_create        = cuda_device_create,
    .device_derive        = cuda_device_derive,
    .device_init          = cuda_device_init,
    .device_uninit        = cuda_device_uninit,
    .frames_get_constraints = cuda_frames_get_constraints,
    .frames_init          = cuda_frames_init,
    .frames_get_buffer    = cuda_get_buffer,
    .transfer_get_formats = cuda_transfer_get_formats,
    .transfer_data_to     = cuda_transfer_data_to,
    .transfer_data_from   = cuda_transfer_data_from,

    .pix_fmts             = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE },
};