/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "buffer.h"
#include "common.h"
#include "hwcontext.h"
#include "hwcontext_internal.h"
#include "hwcontext_cuda_internal.h"
#include "mem.h"
#include "pixdesc.h"
#include "pixfmt.h"
#include "imgutils.h"

#define CUDA_FRAME_ALIGNMENT 256

typedef struct CUDAFramesContext {
    int shift_width, shift_height;
} CUDAFramesContext;
static const enum AVPixelFormat supported_formats[] = {
    AV_PIX_FMT_NV12,
    AV_PIX_FMT_YUV420P,
    AV_PIX_FMT_YUV444P,
    AV_PIX_FMT_P010,
    AV_PIX_FMT_P016,
    AV_PIX_FMT_YUV444P16,
    AV_PIX_FMT_0RGB32,
    AV_PIX_FMT_0BGR32,
};
static int cuda_frames_get_constraints(AVHWDeviceContext *ctx,
                                       const void *hwconfig,
                                       AVHWFramesConstraints *constraints)
{
    int i;

    constraints->valid_sw_formats = av_malloc_array(FF_ARRAY_ELEMS(supported_formats) + 1,
                                                    sizeof(*constraints->valid_sw_formats));
    if (!constraints->valid_sw_formats)
        return AVERROR(ENOMEM);

    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++)
        constraints->valid_sw_formats[i] = supported_formats[i];
    constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_formats)] = AV_PIX_FMT_NONE;

    constraints->valid_hw_formats = av_malloc_array(2, sizeof(*constraints->valid_hw_formats));
    if (!constraints->valid_hw_formats)
        return AVERROR(ENOMEM);

    constraints->valid_hw_formats[0] = AV_PIX_FMT_CUDA;
    constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE;

    return 0;
}
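
/* AVBuffer free callback: releases the device memory allocated in
 * cuda_pool_alloc(), making the frames context's CUDA context current
 * for the duration of the call. */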
static void cuda_buffer_free(void *opaque, uint8_t *data)
{
    AVHWFramesContext *ctx = opaque;
    AVCUDADeviceContext *hwctx = ctx->device_ctx->hwctx;
    CudaFunctions *cu = hwctx->internal->cuda_dl;
    CUcontext dummy;

    cu->cuCtxPushCurrent(hwctx->cuda_ctx);
    cu->cuMemFree((CUdeviceptr)data);
    cu->cuCtxPopCurrent(&dummy);
}
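
/* Buffer pool allocator: allocates size bytes of CUDA device memory and wraps
 * the device pointer in an AVBufferRef freed by cuda_buffer_free(). The
 * success path deliberately falls through to the fail label so the pushed
 * context is always popped before returning. */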
static AVBufferRef *cuda_pool_alloc(void *opaque, int size)
{
    AVHWFramesContext *ctx = opaque;
    AVCUDADeviceContext *hwctx = ctx->device_ctx->hwctx;
    CudaFunctions *cu = hwctx->internal->cuda_dl;

    AVBufferRef *ret = NULL;
    CUcontext dummy = NULL;
    CUdeviceptr data;
    CUresult err;

    err = cu->cuCtxPushCurrent(hwctx->cuda_ctx);
    if (err != CUDA_SUCCESS) {
        av_log(ctx, AV_LOG_ERROR, "Error setting current CUDA context\n");
        return NULL;
    }

    err = cu->cuMemAlloc(&data, size);
    if (err != CUDA_SUCCESS)
        goto fail;

    ret = av_buffer_create((uint8_t*)data, size, cuda_buffer_free, ctx, 0);
    if (!ret) {
        cu->cuMemFree(data);
        goto fail;
    }

fail:
    cu->cuCtxPopCurrent(&dummy);
    return ret;
}
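
/* Frames context init: verifies that the software pixel format is in
 * supported_formats, records its chroma subsampling shifts, and, if the
 * caller did not supply a pool, creates an internal one sized for a full
 * image at CUDA_FRAME_ALIGNMENT. */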
static int cuda_frames_init(AVHWFramesContext *ctx)
{
    CUDAFramesContext *priv = ctx->internal->priv;
    int i;

    for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) {
        if (ctx->sw_format == supported_formats[i])
            break;
    }
    if (i == FF_ARRAY_ELEMS(supported_formats)) {
        av_log(ctx, AV_LOG_ERROR, "Pixel format '%s' is not supported\n",
               av_get_pix_fmt_name(ctx->sw_format));
        return AVERROR(ENOSYS);
    }

    av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height);

    if (!ctx->pool) {
        int size = av_image_get_buffer_size(ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT);
        if (size < 0)
            return size;

        ctx->internal->pool_internal = av_buffer_pool_init2(size, ctx, cuda_pool_alloc, NULL);
        if (!ctx->internal->pool_internal)
            return AVERROR(ENOMEM);
    }

    return 0;
}
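
/* Fills an AVFrame with plane pointers and linesizes that all point into a
 * single device allocation taken from the buffer pool. */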
static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame)
{
    int res;

    frame->buf[0] = av_buffer_pool_get(ctx->pool);
    if (!frame->buf[0])
        return AVERROR(ENOMEM);

    res = av_image_fill_arrays(frame->data, frame->linesize, frame->buf[0]->data,
                               ctx->sw_format, ctx->width, ctx->height, CUDA_FRAME_ALIGNMENT);
    if (res < 0)
        return res;

    // YUV420P is a special case.
    // Nvenc expects the U/V planes in swapped order from how ffmpeg expects them.
    if (ctx->sw_format == AV_PIX_FMT_YUV420P) {
        FFSWAP(uint8_t*, frame->data[1], frame->data[2]);
        FFSWAP(int, frame->linesize[1], frame->linesize[2]);
    }

    frame->format = AV_PIX_FMT_CUDA;
    frame->width  = ctx->width;
    frame->height = ctx->height;

    return 0;
}
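
/* Transfers to and from system memory only support the frames context's own
 * software format, so report exactly that one format in both directions. */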
static int cuda_transfer_get_formats(AVHWFramesContext *ctx,
                                     enum AVHWFrameTransferDirection dir,
                                     enum AVPixelFormat **formats)
{
    enum AVPixelFormat *fmts;

    fmts = av_malloc_array(2, sizeof(*fmts));
    if (!fmts)
        return AVERROR(ENOMEM);

    fmts[0] = ctx->sw_format;
    fmts[1] = AV_PIX_FMT_NONE;

    *formats = fmts;

    return 0;
}
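
/* Download (device -> host): copies each plane with cuMemcpy2DAsync on the
 * device context's stream, then synchronizes the stream before returning. */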
static int cuda_transfer_data_from(AVHWFramesContext *ctx, AVFrame *dst,
                                   const AVFrame *src)
{
    CUDAFramesContext *priv = ctx->internal->priv;
    AVCUDADeviceContext *device_hwctx = ctx->device_ctx->hwctx;
    CudaFunctions *cu = device_hwctx->internal->cuda_dl;

    CUcontext dummy;
    CUresult err;
    int i;

    err = cu->cuCtxPushCurrent(device_hwctx->cuda_ctx);
    if (err != CUDA_SUCCESS)
        return AVERROR_UNKNOWN;

    for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) {
        CUDA_MEMCPY2D cpy = {
            .srcMemoryType = CU_MEMORYTYPE_DEVICE,
            .dstMemoryType = CU_MEMORYTYPE_HOST,
            .srcDevice     = (CUdeviceptr)src->data[i],
            .dstHost       = dst->data[i],
            .srcPitch      = src->linesize[i],
            .dstPitch      = dst->linesize[i],
            .WidthInBytes  = FFMIN(src->linesize[i], dst->linesize[i]),
            .Height        = src->height >> (i ? priv->shift_height : 0),
        };

        err = cu->cuMemcpy2DAsync(&cpy, device_hwctx->stream);
        if (err != CUDA_SUCCESS) {
            av_log(ctx, AV_LOG_ERROR, "Error transferring the data from the CUDA frame\n");
            return AVERROR_UNKNOWN;
        }
    }

    err = cu->cuStreamSynchronize(device_hwctx->stream);
    if (err != CUDA_SUCCESS) {
        av_log(ctx, AV_LOG_ERROR, "Error synchronizing CUDA stream\n");
        return AVERROR_UNKNOWN;
    }

    cu->cuCtxPopCurrent(&dummy);

    return 0;
}
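
/* Upload (host -> device): the mirror image of cuda_transfer_data_from(),
 * with the host buffer as the copy source and the CUDA frame as the
 * destination. */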
static int cuda_transfer_data_to(AVHWFramesContext *ctx, AVFrame *dst,
                                 const AVFrame *src)
{
    CUDAFramesContext *priv = ctx->internal->priv;
    AVCUDADeviceContext *device_hwctx = ctx->device_ctx->hwctx;
    CudaFunctions *cu = device_hwctx->internal->cuda_dl;

    CUcontext dummy;
    CUresult err;
    int i;

    err = cu->cuCtxPushCurrent(device_hwctx->cuda_ctx);
    if (err != CUDA_SUCCESS)
        return AVERROR_UNKNOWN;

    for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) {
        CUDA_MEMCPY2D cpy = {
            .srcMemoryType = CU_MEMORYTYPE_HOST,
            .dstMemoryType = CU_MEMORYTYPE_DEVICE,
            .srcHost       = src->data[i],
            .dstDevice     = (CUdeviceptr)dst->data[i],
            .srcPitch      = src->linesize[i],
            .dstPitch      = dst->linesize[i],
            .WidthInBytes  = FFMIN(src->linesize[i], dst->linesize[i]),
            .Height        = src->height >> (i ? priv->shift_height : 0),
        };

        err = cu->cuMemcpy2DAsync(&cpy, device_hwctx->stream);
        if (err != CUDA_SUCCESS) {
            av_log(ctx, AV_LOG_ERROR, "Error transferring the data to the CUDA frame\n");
            return AVERROR_UNKNOWN;
        }
    }

    err = cu->cuStreamSynchronize(device_hwctx->stream);
    if (err != CUDA_SUCCESS) {
        av_log(ctx, AV_LOG_ERROR, "Error synchronizing CUDA stream\n");
        return AVERROR_UNKNOWN;
    }

    cu->cuCtxPopCurrent(&dummy);

    return 0;
}
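
/* Destroys the CUDA context only if this code created it (is_allocated is
 * set), then unloads the dynamically loaded CUDA functions and frees the
 * internal state. */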
static void cuda_device_uninit(AVHWDeviceContext *ctx)
{
    AVCUDADeviceContext *hwctx = ctx->hwctx;

    if (hwctx->internal) {
        if (hwctx->internal->is_allocated && hwctx->cuda_ctx) {
            hwctx->internal->cuda_dl->cuCtxDestroy(hwctx->cuda_ctx);
            hwctx->cuda_ctx = NULL;
        }
        cuda_free_functions(&hwctx->internal->cuda_dl);
    }

    av_freep(&hwctx->internal);
}
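
/* Allocates the internal state and dynamically loads the CUDA driver API
 * entry points on first use. */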
static int cuda_device_init(AVHWDeviceContext *ctx)
{
    AVCUDADeviceContext *hwctx = ctx->hwctx;
    int ret;

    if (!hwctx->internal) {
        hwctx->internal = av_mallocz(sizeof(*hwctx->internal));
        if (!hwctx->internal)
            return AVERROR(ENOMEM);
    }

    if (!hwctx->internal->cuda_dl) {
        ret = cuda_load_functions(&hwctx->internal->cuda_dl, ctx);
        if (ret < 0) {
            av_log(ctx, AV_LOG_ERROR, "Could not dynamically load CUDA\n");
            goto error;
        }
    }

    return 0;

error:
    cuda_device_uninit(ctx);
    return ret;
}
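
/* Creates a device context from scratch: the "device" string selects the
 * CUDA device index (default 0). The newly created context is popped off the
 * calling thread, and is_allocated marks it for destruction in
 * cuda_device_uninit(). */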
static int cuda_device_create(AVHWDeviceContext *ctx, const char *device,
                              AVDictionary *opts, int flags)
{
    AVCUDADeviceContext *hwctx = ctx->hwctx;
    CudaFunctions *cu;
    CUdevice cu_device;
    CUcontext dummy;
    CUresult err;
    int device_idx = 0;

    if (device)
        device_idx = strtol(device, NULL, 0);

    if (cuda_device_init(ctx) < 0)
        goto error;

    cu = hwctx->internal->cuda_dl;

    err = cu->cuInit(0);
    if (err != CUDA_SUCCESS) {
        av_log(ctx, AV_LOG_ERROR, "Could not initialize the CUDA driver API\n");
        goto error;
    }

    err = cu->cuDeviceGet(&cu_device, device_idx);
    if (err != CUDA_SUCCESS) {
        av_log(ctx, AV_LOG_ERROR, "Could not get the device number %d\n", device_idx);
        goto error;
    }

    err = cu->cuCtxCreate(&hwctx->cuda_ctx, CU_CTX_SCHED_BLOCKING_SYNC, cu_device);
    if (err != CUDA_SUCCESS) {
        av_log(ctx, AV_LOG_ERROR, "Error creating a CUDA context\n");
        goto error;
    }

    // Setting stream to NULL will make functions automatically use the default CUstream
    hwctx->stream = NULL;

    cu->cuCtxPopCurrent(&dummy);

    hwctx->internal->is_allocated = 1;

    return 0;

error:
    cuda_device_uninit(ctx);
    return AVERROR_UNKNOWN;
}
const HWContextType ff_hwcontext_type_cuda = {
    .type                   = AV_HWDEVICE_TYPE_CUDA,
    .name                   = "CUDA",

    .device_hwctx_size      = sizeof(AVCUDADeviceContext),
    .frames_priv_size       = sizeof(CUDAFramesContext),

    .device_create          = cuda_device_create,
    .device_init            = cuda_device_init,
    .device_uninit          = cuda_device_uninit,
    .frames_get_constraints = cuda_frames_get_constraints,
    .frames_init            = cuda_frames_init,
    .frames_get_buffer      = cuda_get_buffer,
    .transfer_get_formats   = cuda_transfer_get_formats,
    .transfer_data_to       = cuda_transfer_data_to,
    .transfer_data_from     = cuda_transfer_data_from,

    .pix_fmts               = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE },
};