/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "buffer.h"
#include "common.h"
#include "hwcontext.h"
#include "hwcontext_internal.h"
#include "hwcontext_cuda_internal.h"
#if CONFIG_VULKAN
#include "hwcontext_vulkan.h"
#endif
#include "cuda_check.h"
#include "mem.h"
#include "pixdesc.h"
#include "pixfmt.h"
#include "imgutils.h"

#define CUDA_FRAME_ALIGNMENT 256

typedef struct CUDAFramesContext {
    int shift_width, shift_height;
} CUDAFramesContext;

static const enum AVPixelFormat supported_formats[] = {
    AV_PIX_FMT_NV12,
    AV_PIX_FMT_YUV420P,
    AV_PIX_FMT_YUV444P,
    AV_PIX_FMT_P010,
    AV_PIX_FMT_P016,
    AV_PIX_FMT_YUV444P16,
    AV_PIX_FMT_0RGB32,
    AV_PIX_FMT_0BGR32,
#if CONFIG_VULKAN
    AV_PIX_FMT_VULKAN,
#endif
};

#define CHECK_CU(x) FF_CUDA_CHECK_DL(device_ctx, cu, x)
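
/* Report the software formats above and AV_PIX_FMT_CUDA as the only valid
 * hardware format; both lists are AV_PIX_FMT_NONE-terminated. */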
static int cuda_frames_get_constraints(AVHWDeviceContext *ctx,
                                       const void *hwconfig,
                                       AVHWFramesConstraints *constraints)
{
    int i;

    constraints->valid_sw_formats = av_malloc_array(FF_ARRAY_ELEMS(supported_formats) + 1,
                                                    sizeof(*constraints->valid_sw_formats));
    if (!constraints->valid_sw_formats)
        return AVERROR(ENOMEM);

    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
        constraints->valid_sw_formats[i] = supported_formats[i];
    constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_formats)] = AV_PIX_FMT_NONE;

    constraints->valid_hw_formats = av_malloc_array(2, sizeof(*constraints->valid_hw_formats));
    if (!constraints->valid_hw_formats)
        return AVERROR(ENOMEM);

    constraints->valid_hw_formats[0] = AV_PIX_FMT_CUDA;
    constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE;

    return 0;
}
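
/* AVBuffer free callback: releases device memory allocated by
 * cuda_pool_alloc(), with the device's CUDA context made current. */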
static void cuda_buffer_free(void *opaque, uint8_t *data)
{
    AVHWFramesContext        *ctx = opaque;
    AVHWDeviceContext *device_ctx = ctx->device_ctx;
    AVCUDADeviceContext    *hwctx = device_ctx->hwctx;
    CudaFunctions             *cu = hwctx->internal->cuda_dl;

    CUcontext dummy;

    CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));

    CHECK_CU(cu->cuMemFree((CUdeviceptr)data));

    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
}
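
/* Buffer pool allocator: cuMemAlloc()s one frame-sized block of device
 * memory and wraps the device pointer in an AVBufferRef. */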
static AVBufferRef *cuda_pool_alloc(void *opaque, int size)
{
    AVHWFramesContext        *ctx = opaque;
    AVHWDeviceContext *device_ctx = ctx->device_ctx;
    AVCUDADeviceContext    *hwctx = device_ctx->hwctx;
    CudaFunctions             *cu = hwctx->internal->cuda_dl;

    AVBufferRef *ret = NULL;
    CUcontext dummy = NULL;
    CUdeviceptr data;
    int err;

    err = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
    if (err < 0)
        return NULL;

    err = CHECK_CU(cu->cuMemAlloc(&data, size));
    if (err < 0)
        goto fail;

    ret = av_buffer_create((uint8_t*)data, size, cuda_buffer_free, ctx, 0);
    if (!ret) {
        CHECK_CU(cu->cuMemFree(data));
        goto fail;
    }

fail:
    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
    return ret;
}
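
/* Validate the requested sw_format and, if the caller did not supply a
 * pool, create an internal one sized for CUDA_FRAME_ALIGNMENT-aligned
 * frames. */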
static int cuda_frames_init(AVHWFramesContext *ctx)
{
    CUDAFramesContext *priv = ctx->internal->priv;
    int i;

    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) {
        if (ctx->sw_format == supported_formats[i])
            break;
    }
    if (i == FF_ARRAY_ELEMS(supported_formats)) {
        av_log(ctx, AV_LOG_ERROR, "Pixel format '%s' is not supported\n",
               av_get_pix_fmt_name(ctx->sw_format));
        return AVERROR(ENOSYS);
    }

    av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height);

    if (!ctx->pool) {
        int size = av_image_get_buffer_size(ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT);
        if (size < 0)
            return size;

        ctx->internal->pool_internal = av_buffer_pool_init2(size, ctx, cuda_pool_alloc, NULL);
        if (!ctx->internal->pool_internal)
            return AVERROR(ENOMEM);
    }

    return 0;
}
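
/* Hand out a frame from the pool and lay out the plane pointers and
 * linesizes for sw_format inside the single device allocation. */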
static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
{
    int res;

    frame->buf[0] = av_buffer_pool_get(ctx->pool);
    if (!frame->buf[0])
        return AVERROR(ENOMEM);

    res = av_image_fill_arrays(frame->data, frame->linesize, frame->buf[0]->data,
                               ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT);
    if (res < 0)
        return res;

    // YUV420P is a special case.
    // NVENC expects the U/V planes swapped relative to FFmpeg's usual order,
    // and the chroma planes use half the luma pitch.
    if (ctx->sw_format == AV_PIX_FMT_YUV420P) {
        frame->linesize[1] = frame->linesize[2] = frame->linesize[0] / 2;
        frame->data[2] = frame->data[1];
        frame->data[1] = frame->data[2] + frame->linesize[2] * ctx->height / 2;
    }

    frame->format = AV_PIX_FMT_CUDA;
    frame->width  = ctx->width;
    frame->height = ctx->height;

    return 0;
}
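
/* Transfers only convert to/from the frames context's own sw_format. */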
static int cuda_transfer_get_formats(AVHWFramesContext *ctx,
                                     enum AVHWFrameTransferDirection dir,
                                     enum AVPixelFormat **formats)
{
    enum AVPixelFormat *fmts;

    fmts = av_malloc_array(2, sizeof(*fmts));
    if (!fmts)
        return AVERROR(ENOMEM);

    fmts[0] = ctx->sw_format;
    fmts[1] = AV_PIX_FMT_NONE;

    *formats = fmts;

    return 0;
}
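
/* Download a hardware frame to system memory: one async 2D copy per
 * plane on the context's stream, then synchronize before returning. */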
static int cuda_transfer_data_from(AVHWFramesContext *ctx, AVFrame *dst,
                                   const AVFrame *src)
{
    CUDAFramesContext       *priv = ctx->internal->priv;
    AVHWDeviceContext *device_ctx = ctx->device_ctx;
    AVCUDADeviceContext    *hwctx = device_ctx->hwctx;
    CudaFunctions             *cu = hwctx->internal->cuda_dl;

    CUcontext dummy;
    int i, ret;

    /* We don't support transfers to HW devices. */
    if (dst->hw_frames_ctx)
        return AVERROR(ENOSYS);

    ret = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
    if (ret < 0)
        return ret;

    for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) {
        CUDA_MEMCPY2D cpy = {
            .srcMemoryType = CU_MEMORYTYPE_DEVICE,
            .dstMemoryType = CU_MEMORYTYPE_HOST,
            .srcDevice     = (CUdeviceptr)src->data[i],
            .dstHost       = dst->data[i],
            .srcPitch      = src->linesize[i],
            .dstPitch      = dst->linesize[i],
            .WidthInBytes  = FFMIN(src->linesize[i], dst->linesize[i]),
            .Height        = src->height >> (i ? priv->shift_height : 0),
        };

        ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream));
        if (ret < 0)
            goto exit;
    }

    ret = CHECK_CU(cu->cuStreamSynchronize(hwctx->stream));
    if (ret < 0)
        goto exit;

exit:
    CHECK_CU(cu->cuCtxPopCurrent(&dummy));

    return ret;
}
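
/* Upload a system-memory frame to the device: one async 2D copy per
 * plane on the context's stream. */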
static int cuda_transfer_data_to(AVHWFramesContext *ctx, AVFrame *dst,
                                 const AVFrame *src)
{
    CUDAFramesContext       *priv = ctx->internal->priv;
    AVHWDeviceContext *device_ctx = ctx->device_ctx;
    AVCUDADeviceContext    *hwctx = device_ctx->hwctx;
    CudaFunctions             *cu = hwctx->internal->cuda_dl;

    CUcontext dummy;
    int i, ret;

    /* We don't support transfers from HW devices. */
    if (src->hw_frames_ctx)
        return AVERROR(ENOSYS);

    ret = CHECK_CU(cu->cuCtxPushCurrent(hwctx->cuda_ctx));
    if (ret < 0)
        return ret;

    for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) {
        CUDA_MEMCPY2D cpy = {
            .srcMemoryType = CU_MEMORYTYPE_HOST,
            .dstMemoryType = CU_MEMORYTYPE_DEVICE,
            .srcHost       = src->data[i],
            .dstDevice     = (CUdeviceptr)dst->data[i],
            .srcPitch      = src->linesize[i],
            .dstPitch      = dst->linesize[i],
            .WidthInBytes  = FFMIN(src->linesize[i], dst->linesize[i]),
            .Height        = src->height >> (i ? priv->shift_height : 0),
        };

        ret = CHECK_CU(cu->cuMemcpy2DAsync(&cpy, hwctx->stream));
        if (ret < 0)
            goto exit;
    }

exit:
    CHECK_CU(cu->cuCtxPopCurrent(&dummy));

    return ret;
}
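
/* Free device-context state: destroy (or release, for the primary
 * context) a CUcontext we created ourselves, then unload the CUDA
 * function table. */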
static void cuda_device_uninit(AVHWDeviceContext *device_ctx)
{
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;

    if (hwctx->internal) {
        CudaFunctions *cu = hwctx->internal->cuda_dl;

        if (hwctx->internal->is_allocated && hwctx->cuda_ctx) {
            if (hwctx->internal->flags & AV_CUDA_USE_PRIMARY_CONTEXT)
                CHECK_CU(cu->cuDevicePrimaryCtxRelease(hwctx->internal->cuda_device));
            else
                CHECK_CU(cu->cuCtxDestroy(hwctx->cuda_ctx));

            hwctx->cuda_ctx = NULL;
        }

        cuda_free_functions(&hwctx->internal->cuda_dl);
    }

    av_freep(&hwctx->internal);
}
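
/* Allocate the internal state and dynamically load the CUDA driver API
 * via cuda_load_functions(); safe to call again on an already-initialized
 * context. */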
static int cuda_device_init(AVHWDeviceContext *ctx)
{
    AVCUDADeviceContext *hwctx = ctx->hwctx;
    int ret;

    if (!hwctx->internal) {
        hwctx->internal = av_mallocz(sizeof(*hwctx->internal));
        if (!hwctx->internal)
            return AVERROR(ENOMEM);
    }

    if (!hwctx->internal->cuda_dl) {
        ret = cuda_load_functions(&hwctx->internal->cuda_dl, ctx);
        if (ret < 0) {
            av_log(ctx, AV_LOG_ERROR, "Could not dynamically load CUDA\n");
            goto error;
        }
    }

    return 0;

error:
    cuda_device_uninit(ctx);
    return ret;
}
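
/* Create the CUcontext for the selected device. With
 * AV_CUDA_USE_PRIMARY_CONTEXT the device's primary context is retained
 * (after checking its flags against CU_CTX_SCHED_BLOCKING_SYNC);
 * otherwise a dedicated context is created. */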
static int cuda_context_init(AVHWDeviceContext *device_ctx, int flags)
{
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu;
    CUcontext dummy;
    int ret, dev_active = 0;
    unsigned int dev_flags = 0;

    const unsigned int desired_flags = CU_CTX_SCHED_BLOCKING_SYNC;

    cu = hwctx->internal->cuda_dl;

    hwctx->internal->flags = flags;

    if (flags & AV_CUDA_USE_PRIMARY_CONTEXT) {
        ret = CHECK_CU(cu->cuDevicePrimaryCtxGetState(hwctx->internal->cuda_device,
                                                      &dev_flags, &dev_active));
        if (ret < 0)
            return ret;

        if (dev_active && dev_flags != desired_flags) {
            av_log(device_ctx, AV_LOG_ERROR, "Primary context already active with incompatible flags.\n");
            return AVERROR(ENOTSUP);
        } else if (dev_flags != desired_flags) {
            ret = CHECK_CU(cu->cuDevicePrimaryCtxSetFlags(hwctx->internal->cuda_device,
                                                          desired_flags));
            if (ret < 0)
                return ret;
        }

        ret = CHECK_CU(cu->cuDevicePrimaryCtxRetain(&hwctx->cuda_ctx,
                                                    hwctx->internal->cuda_device));
        if (ret < 0)
            return ret;
    } else {
        ret = CHECK_CU(cu->cuCtxCreate(&hwctx->cuda_ctx, desired_flags,
                                       hwctx->internal->cuda_device));
        if (ret < 0)
            return ret;

        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
    }

    hwctx->internal->is_allocated = 1;

    // Setting stream to NULL will make functions automatically use the default CUstream
    hwctx->stream = NULL;

    return 0;
}
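
/* Device creation entry point: the device string, if present, is parsed
 * as the CUDA device index. */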
static int cuda_device_create(AVHWDeviceContext *device_ctx,
                              const char *device,
                              AVDictionary *opts, int flags)
{
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu;
    int ret, device_idx = 0;

    if (device)
        device_idx = strtol(device, NULL, 0);

    if (cuda_device_init(device_ctx) < 0)
        goto error;

    cu = hwctx->internal->cuda_dl;

    ret = CHECK_CU(cu->cuInit(0));
    if (ret < 0)
        goto error;

    ret = CHECK_CU(cu->cuDeviceGet(&hwctx->internal->cuda_device, device_idx));
    if (ret < 0)
        goto error;

    ret = cuda_context_init(device_ctx, flags);
    if (ret < 0)
        goto error;

    return 0;

error:
    cuda_device_uninit(device_ctx);
    return AVERROR_UNKNOWN;
}
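
/* Derive a CUDA device from another hardware device context (currently
 * only Vulkan): match the source device's UUID against every CUDA
 * device and pick the one backed by the same GPU. */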
static int cuda_device_derive(AVHWDeviceContext *device_ctx,
                              AVHWDeviceContext *src_ctx,
                              int flags)
{
    AVCUDADeviceContext *hwctx = device_ctx->hwctx;
    CudaFunctions *cu;
    const char *src_uuid = NULL;
    int ret, i, device_count;

#if CONFIG_VULKAN
    VkPhysicalDeviceIDProperties vk_idp = {
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES,
    };
#endif

    switch (src_ctx->type) {
#if CONFIG_VULKAN
    case AV_HWDEVICE_TYPE_VULKAN: {
        AVVulkanDeviceContext *vkctx = src_ctx->hwctx;
        VkPhysicalDeviceProperties2 vk_dev_props = {
            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
            .pNext = &vk_idp,
        };
        vkGetPhysicalDeviceProperties2(vkctx->phys_dev, &vk_dev_props);
        src_uuid = vk_idp.deviceUUID;
        break;
    }
#endif
    default:
        return AVERROR(ENOSYS);
    }

    if (!src_uuid) {
        av_log(device_ctx, AV_LOG_ERROR,
               "Failed to get UUID of source device.\n");
        goto error;
    }

    if (cuda_device_init(device_ctx) < 0)
        goto error;

    cu = hwctx->internal->cuda_dl;

    ret = CHECK_CU(cu->cuInit(0));
    if (ret < 0)
        goto error;

    ret = CHECK_CU(cu->cuDeviceGetCount(&device_count));
    if (ret < 0)
        goto error;

    hwctx->internal->cuda_device = -1;
    for (i = 0; i < device_count; i++) {
        CUdevice dev;
        CUuuid uuid;

        ret = CHECK_CU(cu->cuDeviceGet(&dev, i));
        if (ret < 0)
            goto error;

        ret = CHECK_CU(cu->cuDeviceGetUuid(&uuid, dev));
        if (ret < 0)
            goto error;

        if (memcmp(src_uuid, uuid.bytes, sizeof(uuid.bytes)) == 0) {
            hwctx->internal->cuda_device = dev;
            break;
        }
    }

    if (hwctx->internal->cuda_device == -1) {
        av_log(device_ctx, AV_LOG_ERROR, "Could not derive CUDA device.\n");
        goto error;
    }

    ret = cuda_context_init(device_ctx, flags);
    if (ret < 0)
        goto error;

    return 0;

error:
    cuda_device_uninit(device_ctx);
    return AVERROR_UNKNOWN;
}
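
/* Callback table through which the generic AVHWDeviceContext /
 * AVHWFramesContext machinery dispatches into this backend. */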
const HWContextType ff_hwcontext_type_cuda = {
    .type                   = AV_HWDEVICE_TYPE_CUDA,
    .name                   = "CUDA",

    .device_hwctx_size      = sizeof(AVCUDADeviceContext),
    .frames_priv_size       = sizeof(CUDAFramesContext),

    .device_create          = cuda_device_create,
    .device_derive          = cuda_device_derive,
    .device_init            = cuda_device_init,
    .device_uninit          = cuda_device_uninit,
    .frames_get_constraints = cuda_frames_get_constraints,
    .frames_init            = cuda_frames_init,
    .frames_get_buffer      = cuda_get_buffer,
    .transfer_get_formats   = cuda_transfer_get_formats,
    .transfer_data_to       = cuda_transfer_data_to,
    .transfer_data_from     = cuda_transfer_data_from,

    .pix_fmts               = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE },
};
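
/*
 * A minimal usage sketch (not part of this file): how a caller would
 * typically drive this backend through the public hwcontext API.
 * Error handling is omitted for brevity.
 *
 *     AVBufferRef *device_ref = NULL, *frames_ref;
 *     AVFrame *hw = av_frame_alloc(), *sw = av_frame_alloc();
 *
 *     // "0" selects CUDA device index 0 (parsed by cuda_device_create()).
 *     av_hwdevice_ctx_create(&device_ref, AV_HWDEVICE_TYPE_CUDA, "0", NULL, 0);
 *
 *     frames_ref = av_hwframe_ctx_alloc(device_ref);
 *     AVHWFramesContext *fc = (AVHWFramesContext*)frames_ref->data;
 *     fc->format    = AV_PIX_FMT_CUDA;
 *     fc->sw_format = AV_PIX_FMT_NV12;  // must be in supported_formats[]
 *     fc->width     = 1920;
 *     fc->height    = 1080;
 *     av_hwframe_ctx_init(frames_ref);             // -> cuda_frames_init()
 *
 *     av_hwframe_get_buffer(frames_ref, hw, 0);    // -> cuda_get_buffer()
 *     av_hwframe_transfer_data(sw, hw, 0);         // -> cuda_transfer_data_from()
 */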