5 tháng trước cách đây · ad0a44028d
--- a/Changelog
+++ b/Changelog
@@ -19,6 +19,7 @@ version <next>:
 
				 - VVC decoder supports all content of SCC (Screen Content Coding):
			
 
				   IBC (Inter Block Copy), Palette Mode and ACT (Adaptive Color Transform
			
 
				 - G.728 decoder
			
 
				+- pad_cuda filter
			
 
				 
			
 
				 
			
 
				 version 7.1:
			
--- a/configure
+++ b/configure
@@ -3362,6 +3362,8 @@ thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
 
				 transpose_npp_filter_deps="ffnvcodec libnpp"
			
 
				 overlay_cuda_filter_deps="ffnvcodec"
			
 
				 overlay_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
			
 
				+pad_cuda_filter_deps="ffnvcodec"
			
 
				+pad_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
			
 
				 sharpen_npp_filter_deps="ffnvcodec libnpp"
			
 
				 
			
 
				 ddagrab_filter_deps="d3d11va IDXGIOutput1 DXGI_OUTDUPL_FRAME_INFO"
			
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -26686,6 +26686,30 @@ See @ref{framesync}.
 
				 
			
 
				 This filter also supports the @ref{framesync} options.
			
 
				 
			
 
				+
			
 
				+@anchor{pad_cuda}
			
 
				+@subsection pad_cuda
			
 
				+Add paddings to an input video stream using CUDA.
			
 
				+
			
 
				+This filter is the CUDA-accelerated version of the @ref{pad} filter. It accepts the same options and expressions and provides the same core functionality.
			
 
				+For a detailed description of available options, please see the documentation for the @ref{pad} filter.
			
 
				+
			
 
				+@subsubsection Examples
			
 
				+
			
 
				+@itemize
			
 
				+@item
			
 
				+Add a 200-pixel black border to all sides of a video frame:
			
 
				+@example
			
 
				+ffmpeg -hwaccel cuda -hwaccel_output_format cuda -i input.mp4 -vf "pad_cuda=w=iw+400:h=ih+400:x=200:y=200" -c:v h264_nvenc out.mp4
			
 
				+@end example
			
 
				+
			
 
				+@item
			
 
				+Pad the input video to a 16:9 aspect ratio, filling with the color "blue":
			
 
				+@example
			
 
				+ffmpeg -hwaccel cuda -hwaccel_output_format cuda -i input.mp4 -vf "pad_cuda=w=ih*16/9/sar:h=ih:x=(ow-iw)/2:y=(oh-ih)/2:color=blue" -c:v h264_nvenc out.mp4
			
 
				+@end example
			
 
				+@end itemize
			
 
				+
			
 
				 @anchor{scale_cuda}
			
 
				 @subsection scale_cuda
			
 
				 
			
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -422,6 +422,7 @@ OBJS-$(CONFIG_OVERLAY_VAAPI_FILTER)          += vf_overlay_vaapi.o framesync.o v
 
				 OBJS-$(CONFIG_OVERLAY_VULKAN_FILTER)         += vf_overlay_vulkan.o vulkan.o vulkan_filter.o
			
 
				 OBJS-$(CONFIG_OWDENOISE_FILTER)              += vf_owdenoise.o
			
 
				 OBJS-$(CONFIG_PAD_FILTER)                    += vf_pad.o
			
 
				+OBJS-$(CONFIG_PAD_CUDA_FILTER)               += vf_pad_cuda.o vf_pad_cuda.ptx.o cuda/load_helper.o
			
 
				 OBJS-$(CONFIG_PAD_OPENCL_FILTER)             += vf_pad_opencl.o opencl.o opencl/pad.o
			
 
				 OBJS-$(CONFIG_PALETTEGEN_FILTER)             += vf_palettegen.o palette.o
			
 
				 OBJS-$(CONFIG_PALETTEUSE_FILTER)             += vf_paletteuse.o framesync.o palette.o
			
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -395,6 +395,7 @@ extern const FFFilter ff_vf_overlay_vulkan;
 
				 extern const FFFilter ff_vf_overlay_cuda;
			
 
				 extern const FFFilter ff_vf_owdenoise;
			
 
				 extern const FFFilter ff_vf_pad;
			
 
				+extern const FFFilter ff_vf_pad_cuda;
			
 
				 extern const FFFilter ff_vf_pad_opencl;
			
 
				 extern const FFFilter ff_vf_palettegen;
			
 
				 extern const FFFilter ff_vf_paletteuse;
			
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -31,7 +31,7 @@
 
				 
			
 
				 #include "version_major.h"
			
 
				 
			
 
				-#define LIBAVFILTER_VERSION_MINOR   0
			
 
				+#define LIBAVFILTER_VERSION_MINOR   1
			
 
				 #define LIBAVFILTER_VERSION_MICRO 100
			
 
				 
			
 
				 
			
--- a/libavfilter/vf_pad_cuda.c
+++ b/libavfilter/vf_pad_cuda.c
@@ -0,0 +1,636 @@
 
				+/*
			
 
				+ * This file is part of FFmpeg.
			
 
				+ *
			
 
				+ * FFmpeg is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU Lesser General Public
			
 
				+ * License as published by the Free Software Foundation; either
			
 
				+ * version 2.1 of the License, or (at your option) any later version.
			
 
				+ *
			
 
				+ * FFmpeg is distributed in the hope that it will be useful,
			
 
				+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+ * Lesser General Public License for more details.
			
 
				+ *
			
 
				+ * You should have received a copy of the GNU Lesser General Public
			
 
				+ * License along with FFmpeg; if not, write to the Free Software
			
 
				+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
			
 
				+ */
			
 
				+
			
 
				+/**
			
 
				+ * @file
			
 
				+ * CUDA video padding filter
			
 
				+ */
			
 
				+
			
 
				+#include <float.h>
			
 
				+
			
 
				+#include "filters.h"
			
 
				+#include "libavutil/avstring.h"
			
 
				+#include "libavutil/common.h"
			
 
				+#include "libavutil/cuda_check.h"
			
 
				+#include "libavutil/eval.h"
			
 
				+#include "libavutil/hwcontext.h"
			
 
				+#include "libavutil/hwcontext_cuda_internal.h"
			
 
				+#include "libavutil/imgutils.h"
			
 
				+#include "libavutil/internal.h"
			
 
				+#include "libavutil/opt.h"
			
 
				+#include "libavutil/pixdesc.h"
			
 
				+#include "libavutil/colorspace.h"
			
 
				+
			
 
				+#include "cuda/load_helper.h"
			
 
				+
			
 
				+#define CHECK_CU(x) FF_CUDA_CHECK_DL(ctx, device_hwctx->internal->cuda_dl, x)
			
 
				+#define DIV_UP(a, b) ( ((a) + (b) - 1) / (b) )
			
 
				+#define BLOCK_X 32
			
 
				+#define BLOCK_Y 16
			
 
				+
			
 
				+static const enum AVPixelFormat supported_formats[] = {
			
 
				+    AV_PIX_FMT_YUV420P,
			
 
				+    AV_PIX_FMT_YUV444P,
			
 
				+    AV_PIX_FMT_YUVA420P,
			
 
				+    AV_PIX_FMT_YUVA444P,
			
 
				+    AV_PIX_FMT_NV12,
			
 
				+};
			
 
				+
			
 
				+typedef struct CUDAPadContext {
			
 
				+    const AVClass *class;
			
 
				+
			
 
				+    AVBufferRef *frames_ctx;
			
 
				+
			
 
				+    int w, h;       ///< output dimensions, a value of 0 will result in the input size
			
 
				+    int x, y;       ///< offsets of the input area with respect to the padded area
			
 
				+    int in_w, in_h; ///< width and height for the padded input video
			
 
				+
			
 
				+    char *w_expr;   ///< width expression
			
 
				+    char *h_expr;   ///< height expression
			
 
				+    char *x_expr;   ///< x offset expression
			
 
				+    char *y_expr;   ///< y offset expression
			
 
				+
			
 
				+    uint8_t rgba_color[4];    ///< color for the padding area
			
 
				+    uint8_t parsed_color[4];
			
 
				+    AVRational aspect;
			
 
				+
			
 
				+    int eval_mode;
			
 
				+
			
 
				+    int last_out_w, last_out_h; ///< used to evaluate the prior output width and height with the incoming frame
			
 
				+
			
 
				+    AVCUDADeviceContext *hwctx;
			
 
				+    CUmodule cu_module;
			
 
				+    CUfunction cu_func_uchar;
			
 
				+    CUfunction cu_func_uchar2;
			
 
				+} CUDAPadContext;
			
 
				+
			
 
				+static const char *const var_names[] = {
			
 
				+    "in_w",  "iw",
			
 
				+    "in_h",  "ih",
			
 
				+    "out_w", "ow",
			
 
				+    "out_h", "oh",
			
 
				+    "x",
			
 
				+    "y",
			
 
				+    "a",
			
 
				+    "sar",
			
 
				+    "dar",
			
 
				+    "hsub",
			
 
				+    "vsub",
			
 
				+    NULL
			
 
				+};
			
 
				+
			
 
				+enum {
			
 
				+    VAR_IN_W,
			
 
				+    VAR_IW,
			
 
				+    VAR_IN_H,
			
 
				+    VAR_IH,
			
 
				+    VAR_OUT_W,
			
 
				+    VAR_OW,
			
 
				+    VAR_OUT_H,
			
 
				+    VAR_OH,
			
 
				+    VAR_X,
			
 
				+    VAR_Y,
			
 
				+    VAR_A,
			
 
				+    VAR_SAR,
			
 
				+    VAR_DAR,
			
 
				+    VAR_HSUB,
			
 
				+    VAR_VSUB,
			
 
				+    VARS_NB
			
 
				+};
			
 
				+
			
 
				+enum EvalMode {
			
 
				+    EVAL_MODE_INIT,
			
 
				+    EVAL_MODE_FRAME,
			
 
				+    EVAL_MODE_NB
			
 
				+};
			
 
				+
			
 
				+static int eval_expr(AVFilterContext *ctx)
			
 
				+{
			
 
				+    CUDAPadContext *s = ctx->priv;
			
 
				+    AVFilterLink *inlink = ctx->inputs[0];
			
 
				+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
			
 
				+
			
 
				+    double var_values[VARS_NB], res;
			
 
				+    char *expr;
			
 
				+    int ret;
			
 
				+
			
 
				+    var_values[VAR_IN_W]   = var_values[VAR_IW]   = s->in_w;
			
 
				+    var_values[VAR_IN_H]   = var_values[VAR_IH]   = s->in_h;
			
 
				+    var_values[VAR_OUT_W]  = var_values[VAR_OW]  = NAN;
			
 
				+    var_values[VAR_OUT_H]  = var_values[VAR_OH]  = NAN;
			
 
				+    var_values[VAR_A]      = (double)s->in_w / s->in_h;
			
 
				+    var_values[VAR_SAR]    = inlink->sample_aspect_ratio.num ?
			
 
				+                             (double)inlink->sample_aspect_ratio.num /
			
 
				+                             inlink->sample_aspect_ratio.den : 1;
			
 
				+    var_values[VAR_DAR]    = var_values[VAR_A] * var_values[VAR_SAR];
			
 
				+    var_values[VAR_HSUB]   = 1 << desc->log2_chroma_w;
			
 
				+    var_values[VAR_VSUB]   = 1 << desc->log2_chroma_h;
			
 
				+
			
 
				+    expr = s->w_expr;
			
 
				+    ret = av_expr_parse_and_eval(&res, expr, var_names, var_values, NULL, NULL, NULL, NULL, NULL, 0, ctx);
			
 
				+    if (ret < 0)
			
 
				+        goto fail;
			
 
				+
			
 
				+    s->w = res;
			
 
				+    if (s->w < 0) {
			
 
				+        av_log(ctx, AV_LOG_ERROR, "Width expression is negative.\n");
			
 
				+        ret = AVERROR(EINVAL);
			
 
				+        goto fail;
			
 
				+    }
			
 
				+
			
 
				+    var_values[VAR_OUT_W] = var_values[VAR_OW] = s->w;
			
 
				+
			
 
				+    expr = s->h_expr;
			
 
				+    ret = av_expr_parse_and_eval(&res, expr, var_names, var_values, NULL, NULL, NULL, NULL, NULL, 0, ctx);
			
 
				+    if (ret < 0)
			
 
				+        goto fail;
			
 
				+
			
 
				+    s->h = res;
			
 
				+    if (s->h < 0) {
			
 
				+        av_log(ctx, AV_LOG_ERROR, "Height expression is negative.\n");
			
 
				+        ret = AVERROR(EINVAL);
			
 
				+        goto fail;
			
 
				+    }
			
 
				+    var_values[VAR_OUT_H] = var_values[VAR_OH] = s->h;
			
 
				+
			
 
				+    if (!s->h)
			
 
				+        s->h = s->in_h;
			
 
				+
			
 
				+    var_values[VAR_OUT_H] = var_values[VAR_OH] = s->h;
			
 
				+
			
 
				+
			
 
				+    expr = s->w_expr;
			
 
				+    ret = av_expr_parse_and_eval(&res, expr, var_names, var_values, NULL, NULL, NULL, NULL, NULL, 0, ctx);
			
 
				+    if (ret < 0)
			
 
				+        goto fail;
			
 
				+
			
 
				+    s->w = res;
			
 
				+    if (s->w < 0) {
			
 
				+        av_log(ctx, AV_LOG_ERROR, "Width expression is negative.\n");
			
 
				+        ret = AVERROR(EINVAL);
			
 
				+        goto fail;
			
 
				+    }
			
 
				+    if (!s->w)
			
 
				+        s->w = s->in_w;
			
 
				+
			
 
				+    var_values[VAR_OUT_W] = var_values[VAR_OW] = s->w;
			
 
				+
			
 
				+
			
 
				+    expr = s->x_expr;
			
 
				+    ret = av_expr_parse_and_eval(&res, expr, var_names, var_values, NULL, NULL, NULL, NULL, NULL, 0, ctx);
			
 
				+    if (ret < 0)
			
 
				+        goto fail;
			
 
				+
			
 
				+    s->x = res;
			
 
				+
			
 
				+
			
 
				+    expr = s->y_expr;
			
 
				+    ret = av_expr_parse_and_eval(&res, expr, var_names, var_values, NULL, NULL, NULL, NULL, NULL, 0, ctx);
			
 
				+    if (ret < 0)
			
 
				+        goto fail;
			
 
				+
			
 
				+    s->y = res;
			
 
				+
			
 
				+    if (s->x < 0 || s->x + s->in_w > s->w) {
			
 
				+        s->x = (s->w - s->in_w) / 2;
			
 
				+        av_log(ctx, AV_LOG_VERBOSE, "centering X offset.\n");
			
 
				+    }
			
 
				+
			
 
				+    if (s->y < 0 || s->y + s->in_h > s->h) {
			
 
				+        s->y = (s->h - s->in_h) / 2;
			
 
				+        av_log(ctx, AV_LOG_VERBOSE, "centering Y offset.\n");
			
 
				+    }
			
 
				+
			
 
				+    s->w = av_clip(s->w, 1, INT_MAX);
			
 
				+    s->h = av_clip(s->h, 1, INT_MAX);
			
 
				+
			
 
				+    if (s->w < s->in_w || s->h < s->in_h) {
			
 
				+        av_log(ctx, AV_LOG_ERROR, "Padded size < input size.\n");
			
 
				+        return AVERROR(EINVAL);
			
 
				+    }
			
 
				+
			
 
				+    av_log(ctx, AV_LOG_DEBUG,
			
 
				+           "w:%d h:%d -> w:%d h:%d x:%d y:%d color:0x%02X%02X%02X%02X\n",
			
 
				+           inlink->w, inlink->h, s->w, s->h, s->x, s->y, s->rgba_color[0],
			
 
				+           s->rgba_color[1], s->rgba_color[2], s->rgba_color[3]);
			
 
				+
			
 
				+    return 0;
			
 
				+
			
 
				+fail:
			
 
				+    av_log(ctx, AV_LOG_ERROR, "Error evaluating '%s'\n", expr);
			
 
				+    return ret;
			
 
				+}
			
 
				+
			
 
				+static int cuda_pad_alloc_out_frames_ctx(AVFilterContext *ctx, AVBufferRef **out_frames_ctx, const int width, const int height)
			
 
				+{
			
 
				+    AVFilterLink *inlink = ctx->inputs[0];
			
 
				+    FilterLink *inl = ff_filter_link(inlink);
			
 
				+    AVHWFramesContext *in_frames_ctx = (AVHWFramesContext *)inl->hw_frames_ctx->data;
			
 
				+    int ret;
			
 
				+
			
 
				+    *out_frames_ctx = av_hwframe_ctx_alloc(in_frames_ctx->device_ref);
			
 
				+    if (!*out_frames_ctx) {
			
 
				+        return AVERROR(ENOMEM);
			
 
				+    }
			
 
				+
			
 
				+    AVHWFramesContext *out_fc = (AVHWFramesContext *)(*out_frames_ctx)->data;
			
 
				+    out_fc->format    = AV_PIX_FMT_CUDA;
			
 
				+    out_fc->sw_format = in_frames_ctx->sw_format;
			
 
				+
			
 
				+    out_fc->width     = FFALIGN(width, 32);
			
 
				+    out_fc->height    = FFALIGN(height, 32);
			
 
				+
			
 
				+    ret = av_hwframe_ctx_init(*out_frames_ctx);
			
 
				+    if (ret < 0) {
			
 
				+        av_log(ctx, AV_LOG_ERROR, "Failed to init output ctx\n");
			
 
				+        av_buffer_unref(out_frames_ctx);
			
 
				+        return ret;
			
 
				+    }
			
 
				+
			
 
				+    return 0;
			
 
				+}
			
 
				+
			
 
				+static av_cold int cuda_pad_init(AVFilterContext *ctx)
			
 
				+{
			
 
				+    CUDAPadContext *s = ctx->priv;
			
 
				+
			
 
				+    s->last_out_w = -1;
			
 
				+    s->last_out_h = -1;
			
 
				+
			
 
				+    return 0;
			
 
				+}
			
 
				+
			
 
				+static av_cold void cuda_pad_uninit(AVFilterContext *ctx)
			
 
				+{
			
 
				+    CUDAPadContext *s = ctx->priv;
			
 
				+    CUcontext dummy;
			
 
				+
			
 
				+    av_buffer_unref(&s->frames_ctx);
			
 
				+
			
 
				+    if (s->hwctx && s->cu_module) {
			
 
				+        CudaFunctions *cu = s->hwctx->internal->cuda_dl;
			
 
				+        AVCUDADeviceContext *device_hwctx = s->hwctx;
			
 
				+        CHECK_CU(cu->cuCtxPushCurrent(s->hwctx->cuda_ctx));
			
 
				+        CHECK_CU(cu->cuModuleUnload(s->cu_module));
			
 
				+        CHECK_CU(cu->cuCtxPopCurrent(&dummy));
			
 
				+    }
			
 
				+
			
 
				+    s->cu_module = NULL;
			
 
				+    s->hwctx = NULL;
			
 
				+}
			
 
				+
			
 
				+static av_cold int cuda_pad_load_functions(AVFilterContext *ctx)
			
 
				+{
			
 
				+    CUDAPadContext *s = ctx->priv;
			
 
				+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
			
 
				+    CUcontext dummy_cu_ctx;
			
 
				+    int ret;
			
 
				+
			
 
				+    AVCUDADeviceContext *device_hwctx = s->hwctx;
			
 
				+
			
 
				+    extern const unsigned char ff_vf_pad_cuda_ptx_data[];
			
 
				+    extern const unsigned int ff_vf_pad_cuda_ptx_len;
			
 
				+
			
 
				+    ret = CHECK_CU(cu->cuCtxPushCurrent(device_hwctx->cuda_ctx));
			
 
				+    if (ret < 0)
			
 
				+        return ret;
			
 
				+
			
 
				+    ret = ff_cuda_load_module(ctx, device_hwctx, &s->cu_module,
			
 
				+                              ff_vf_pad_cuda_ptx_data, ff_vf_pad_cuda_ptx_len);
			
 
				+    if (ret < 0) {
			
 
				+        av_log(ctx, AV_LOG_ERROR, "Failed to load CUDA module\n");
			
 
				+        goto end;
			
 
				+    }
			
 
				+
			
 
				+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar, s->cu_module, "pad_uchar"));
			
 
				+    if (ret < 0) {
			
 
				+        av_log(ctx, AV_LOG_ERROR, "Failed to load pad_planar_cuda\n");
			
 
				+        goto end;
			
 
				+    }
			
 
				+
			
 
				+    ret = CHECK_CU(cu->cuModuleGetFunction(&s->cu_func_uchar2, s->cu_module, "pad_uchar2"));
			
 
				+    if (ret < 0)
			
 
				+        av_log(ctx, AV_LOG_ERROR, "Failed to load pad_uv_cuda\n");
			
 
				+
			
 
				+end:
			
 
				+    CHECK_CU(cu->cuCtxPopCurrent(&dummy_cu_ctx));
			
 
				+
			
 
				+    return ret;
			
 
				+}
			
 
				+
			
 
				+static int cuda_pad_config_props(AVFilterLink *outlink)
			
 
				+{
			
 
				+    AVFilterContext *ctx = outlink->src;
			
 
				+    CUDAPadContext *s = ctx->priv;
			
 
				+
			
 
				+    AVFilterLink *inlink = ctx->inputs[0];
			
 
				+    FilterLink *inl = ff_filter_link(inlink);
			
 
				+
			
 
				+    FilterLink *ol = ff_filter_link(outlink);
			
 
				+
			
 
				+    AVHWFramesContext *in_frames_ctx;
			
 
				+    int format_supported = 0;
			
 
				+    int ret;
			
 
				+
			
 
				+    s->in_w = inlink->w;
			
 
				+    s->in_h = inlink->h;
			
 
				+    ret = eval_expr(ctx);
			
 
				+    if (ret < 0)
			
 
				+        return ret;
			
 
				+
			
 
				+    if (!inl->hw_frames_ctx) {
			
 
				+        av_log(ctx, AV_LOG_ERROR, "No hw context provided on input\n");
			
 
				+        return AVERROR(EINVAL);
			
 
				+    }
			
 
				+
			
 
				+    in_frames_ctx = (AVHWFramesContext *)inl->hw_frames_ctx->data;
			
 
				+    s->hwctx = in_frames_ctx->device_ctx->hwctx;
			
 
				+
			
 
				+    for (int i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) {
			
 
				+        if (in_frames_ctx->sw_format == supported_formats[i]) {
			
 
				+            format_supported = 1;
			
 
				+            break;
			
 
				+        }
			
 
				+    }
			
 
				+    if (!format_supported) {
			
 
				+        av_log(ctx, AV_LOG_ERROR, "Unsupported input format.\n");
			
 
				+        return AVERROR(EINVAL);
			
 
				+    }
			
 
				+
			
 
				+    s->parsed_color[0] = RGB_TO_Y_BT709(s->rgba_color[0], s->rgba_color[1], s->rgba_color[2]);
			
 
				+    s->parsed_color[1] = RGB_TO_U_BT709(s->rgba_color[0], s->rgba_color[1], s->rgba_color[2], 0);
			
 
				+    s->parsed_color[2] = RGB_TO_V_BT709(s->rgba_color[0], s->rgba_color[1], s->rgba_color[2], 0);
			
 
				+    s->parsed_color[3] = s->rgba_color[3];
			
 
				+
			
 
				+    ret = cuda_pad_alloc_out_frames_ctx(ctx, &s->frames_ctx, s->w, s->h);
			
 
				+    if (ret < 0)
			
 
				+        return ret;
			
 
				+
			
 
				+    ol->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
			
 
				+    if (!ol->hw_frames_ctx)
			
 
				+        return AVERROR(ENOMEM);
			
 
				+
			
 
				+    outlink->w         = s->w;
			
 
				+    outlink->h         = s->h;
			
 
				+    outlink->time_base = inlink->time_base;
			
 
				+    outlink->format    = AV_PIX_FMT_CUDA;
			
 
				+
			
 
				+    s->last_out_w = s->w;
			
 
				+    s->last_out_h = s->h;
			
 
				+
			
 
				+    ret = cuda_pad_load_functions(ctx);
			
 
				+    if (ret < 0)
			
 
				+        return ret;
			
 
				+
			
 
				+    return 0;
			
 
				+}
			
 
				+
			
 
				+static int cuda_pad_pad(AVFilterContext *ctx, AVFrame *out, const AVFrame *in)
			
 
				+{
			
 
				+    CUDAPadContext *s = ctx->priv;
			
 
				+    FilterLink *inl = ff_filter_link(ctx->inputs[0]);
			
 
				+
			
 
				+    AVHWFramesContext *in_frames_ctx = (AVHWFramesContext *)inl->hw_frames_ctx->data;
			
 
				+    const AVPixFmtDescriptor *pixdesc = av_pix_fmt_desc_get(in_frames_ctx->sw_format);
			
 
				+
			
 
				+    CudaFunctions *cu = s->hwctx->internal->cuda_dl;
			
 
				+    AVCUDADeviceContext *device_hwctx = s->hwctx;
			
 
				+    int ret;
			
 
				+
			
 
				+
			
 
				+    const int nb_planes = av_pix_fmt_count_planes(in_frames_ctx->sw_format);
			
 
				+    for (int plane = 0; plane < nb_planes; plane++) {
			
 
				+        const AVComponentDescriptor *cur_comp = &pixdesc->comp[0];
			
 
				+        for (int comp = 1; comp < pixdesc->nb_components && cur_comp->plane != plane; comp++)
			
 
				+            cur_comp = &pixdesc->comp[comp];
			
 
				+
			
 
				+        int hsub = (plane == 1 || plane == 2) ? pixdesc->log2_chroma_w : 0;
			
 
				+        int vsub = (plane == 1 || plane == 2) ? pixdesc->log2_chroma_h : 0;
			
 
				+
			
 
				+        int src_w = AV_CEIL_RSHIFT(s->in_w, hsub);
			
 
				+        int src_h = AV_CEIL_RSHIFT(s->in_h, vsub);
			
 
				+
			
 
				+        int dst_w = AV_CEIL_RSHIFT(s->w, hsub);
			
 
				+        int dst_h = AV_CEIL_RSHIFT(s->h, vsub);
			
 
				+
			
 
				+        int y_plane_offset = AV_CEIL_RSHIFT(s->y, vsub);
			
 
				+        int x_plane_offset = AV_CEIL_RSHIFT(s->x, hsub);
			
 
				+
			
 
				+        if (x_plane_offset + src_w > dst_w || y_plane_offset + src_h > dst_h) {
			
 
				+            av_log(ctx, AV_LOG_ERROR,
			
 
				+                   "ROI out of bounds in plane %d: offset=(%d,%d) in=(%dx%d) "
			
 
				+                   "out=(%dx%d)\n",
			
 
				+                   plane, x_plane_offset, y_plane_offset, src_w, src_h, dst_w, dst_h);
			
 
				+            return AVERROR(EINVAL);
			
 
				+        }
			
 
				+
			
 
				+        int dst_linesize = out->linesize[plane] / cur_comp->step;
			
 
				+        int src_linesize = in->linesize[plane] / cur_comp->step;
			
 
				+
			
 
				+        CUdeviceptr d_dst = (CUdeviceptr)out->data[plane];
			
 
				+        CUdeviceptr d_src = (CUdeviceptr)in->data[plane];
			
 
				+
			
 
				+        CUfunction cuda_func;
			
 
				+
			
 
				+        if (cur_comp->step == 1 && cur_comp->depth == 8)
			
 
				+            cuda_func = s->cu_func_uchar;
			
 
				+        else if(cur_comp->step == 2 && cur_comp->depth == 8)
			
 
				+            cuda_func = s->cu_func_uchar2;
			
 
				+        else
			
 
				+            return AVERROR_BUG;
			
 
				+
			
 
				+        void *kernel_args[] = {
			
 
				+            &d_dst, &dst_linesize, &dst_w, &dst_h,
			
 
				+            &d_src, &src_linesize, &src_w, &src_h,
			
 
				+            &x_plane_offset, &y_plane_offset, &s->parsed_color[plane]
			
 
				+        };
			
 
				+
			
 
				+        unsigned int grid_x = DIV_UP(dst_w, BLOCK_X);
			
 
				+        unsigned int grid_y = DIV_UP(dst_h, BLOCK_Y);
			
 
				+
			
 
				+        ret = CHECK_CU(cu->cuLaunchKernel(cuda_func, grid_x, grid_y, 1,
			
 
				+                                          BLOCK_X, BLOCK_Y, 1,
			
 
				+                                          0, s->hwctx->stream, kernel_args, NULL));
			
 
				+
			
 
				+        if (ret < 0) {
			
 
				+            av_log(ctx, AV_LOG_ERROR, "Failed to launch kernel for plane %d\n", plane);
			
 
				+            return ret;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    return 0;
			
 
				+}
			
 
				+
			
 
				+static int cuda_pad_filter_frame(AVFilterLink *inlink, AVFrame *in)
			
 
				+{
			
 
				+    AVFilterContext *ctx = inlink->dst;
			
 
				+    CUDAPadContext *s = ctx->priv;
			
 
				+    AVFilterLink *outlink = ctx->outputs[0];
			
 
				+
			
 
				+    FilterLink *outl = ff_filter_link(outlink);
			
 
				+
			
 
				+    AVHWFramesContext *out_frames_ctx = (AVHWFramesContext *)outl->hw_frames_ctx->data;
			
 
				+    AVCUDADeviceContext *device_hwctx = out_frames_ctx->device_ctx->hwctx;
			
 
				+
			
 
				+    int ret;
			
 
				+
			
 
				+    if (s->eval_mode == EVAL_MODE_FRAME) {
			
 
				+        s->in_w   = in->width;
			
 
				+        s->in_h   = in->height;
			
 
				+        s->aspect = in->sample_aspect_ratio;
			
 
				+
			
 
				+        ret = eval_expr(ctx);
			
 
				+        if (ret < 0) {
			
 
				+            av_frame_free(&in);
			
 
				+            return ret;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+    if (s->x == 0 && s->y == 0 &&
			
 
				+        s->w == in->width && s->h == in->height) {
			
 
				+        av_log(ctx, AV_LOG_DEBUG, "No border. Passing the frame unmodified.\n");
			
 
				+        s->last_out_w = s->w;
			
 
				+        s->last_out_h = s->h;
			
 
				+        return ff_filter_frame(outlink, in);
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+    if (s->w != s->last_out_w || s->h != s->last_out_h) {
			
 
				+
			
 
				+        av_buffer_unref(&s->frames_ctx);
			
 
				+
			
 
				+        ret = cuda_pad_alloc_out_frames_ctx(ctx, &s->frames_ctx, s->w, s->h);
			
 
				+        if (ret < 0)
			
 
				+            return ret;
			
 
				+
			
 
				+        av_buffer_unref(&outl->hw_frames_ctx);
			
 
				+        outl->hw_frames_ctx = av_buffer_ref(s->frames_ctx);
			
 
				+        if (!outl->hw_frames_ctx) {
			
 
				+            av_frame_free(&in);
			
 
				+            av_log(ctx, AV_LOG_ERROR, "Failed to allocate output frame context.\n");
			
 
				+            return AVERROR(ENOMEM);
			
 
				+        }
			
 
				+        outlink->w = s->w;
			
 
				+        outlink->h = s->h;
			
 
				+
			
 
				+        s->last_out_w = s->w;
			
 
				+        s->last_out_h = s->h;
			
 
				+    }
			
 
				+
			
 
				+    AVFrame *out = av_frame_alloc();
			
 
				+    if (!out) {
			
 
				+        av_frame_free(&in);
			
 
				+        av_log(ctx, AV_LOG_ERROR, "Failed to allocate output AVFrame.\n");
			
 
				+        return AVERROR(ENOMEM);
			
 
				+    }
			
 
				+    ret = av_hwframe_get_buffer(outl->hw_frames_ctx, out, 0);
			
 
				+    if (ret < 0) {
			
 
				+        av_log(ctx, AV_LOG_ERROR, "Unable to get output buffer: %s\n",
			
 
				+               av_err2str(ret));
			
 
				+        av_frame_free(&out);
			
 
				+        av_frame_free(&in);
			
 
				+        return ret;
			
 
				+    }
			
 
				+
			
 
				+    CUcontext dummy;
			
 
				+    ret = CHECK_CU(device_hwctx->internal->cuda_dl->cuCtxPushCurrent(
			
 
				+        device_hwctx->cuda_ctx));
			
 
				+    if (ret < 0) {
			
 
				+        av_frame_free(&out);
			
 
				+        av_frame_free(&in);
			
 
				+        return ret;
			
 
				+    }
			
 
				+
			
 
				+    ret = cuda_pad_pad(ctx, out, in);
			
 
				+
			
 
				+    CHECK_CU(device_hwctx->internal->cuda_dl->cuCtxPopCurrent(&dummy));
			
 
				+
			
 
				+    if (ret < 0) {
			
 
				+        av_frame_free(&out);
			
 
				+        av_frame_free(&in);
			
 
				+        return ret;
			
 
				+    }
			
 
				+
			
 
				+    av_frame_copy_props(out, in);
			
 
				+    out->width  = s->w;
			
 
				+    out->height = s->h;
			
 
				+
			
 
				+
			
 
				+    av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
			
 
				+              (int64_t)in->sample_aspect_ratio.num * out->height * in->width,
			
 
				+              (int64_t)in->sample_aspect_ratio.den * out->width * in->height,
			
 
				+              INT_MAX);
			
 
				+
			
 
				+    av_frame_free(&in);
			
 
				+    return ff_filter_frame(outlink, out);
			
 
				+}
			
 
				+
			
 
				+#define OFFSET(x) offsetof(CUDAPadContext, x)
			
 
				+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
			
 
				+
			
 
				+static const AVOption cuda_pad_options[] = {
			
 
				+    { "width",  "set the pad area width expression",                             OFFSET(w_expr),     AV_OPT_TYPE_STRING,   {.str = "iw"},       0, 0,        FLAGS },
			
 
				+    { "w",      "set the pad area width expression",                             OFFSET(w_expr),     AV_OPT_TYPE_STRING,   {.str = "iw"},       0, 0,        FLAGS },
			
 
				+    { "height", "set the pad area height expression",                            OFFSET(h_expr),     AV_OPT_TYPE_STRING,   {.str = "ih"},       0, 0,        FLAGS },
			
 
				+    { "h",      "set the pad area height expression",                            OFFSET(h_expr),     AV_OPT_TYPE_STRING,   {.str = "ih"},       0, 0,        FLAGS },
			
 
				+    { "x",      "set the x offset expression for the input image position",      OFFSET(x_expr),     AV_OPT_TYPE_STRING,   {.str = "0"},        0, 0,        FLAGS },
			
 
				+    { "y",      "set the y offset expression for the input image position",      OFFSET(y_expr),     AV_OPT_TYPE_STRING,   {.str = "0"},        0, 0,        FLAGS },
			
 
				+    { "color",  "set the color of the padded area border",                       OFFSET(rgba_color), AV_OPT_TYPE_COLOR,    {.str = "black"},    .flags =      FLAGS },
			
 
				+    { "eval",   "specify when to evaluate expressions",                          OFFSET(eval_mode),  AV_OPT_TYPE_INT,      {.i64 = EVAL_MODE_INIT}, 0, EVAL_MODE_NB-1, FLAGS, .unit = "eval" },
			
 
				+         { "init",  "eval expressions once during initialization", 0, AV_OPT_TYPE_CONST, {.i64=EVAL_MODE_INIT},  .flags = FLAGS, .unit = "eval" },
			
 
				+         { "frame", "eval expressions during initialization and per-frame", 0, AV_OPT_TYPE_CONST, {.i64=EVAL_MODE_FRAME}, .flags = FLAGS, .unit = "eval" },
			
 
				+    { "aspect", "pad to fit an aspect instead of a resolution",                  OFFSET(aspect),     AV_OPT_TYPE_RATIONAL, {.dbl = 0},        0, DBL_MAX,    FLAGS },
			
 
				+    { NULL }
			
 
				+};
			
 
				+
			
 
				+static const AVClass cuda_pad_class = {
			
 
				+    .class_name = "pad_cuda",
			
 
				+    .item_name  = av_default_item_name,
			
 
				+    .option     = cuda_pad_options,
			
 
				+    .version    = LIBAVUTIL_VERSION_INT,
			
 
				+};
			
 
				+
			
 
				+static const AVFilterPad cuda_pad_inputs[] = {{
			
 
				+        .name         = "default",
			
 
				+        .type         = AVMEDIA_TYPE_VIDEO,
			
 
				+        .filter_frame = cuda_pad_filter_frame
			
 
				+}};
			
 
				+
			
 
				+static const AVFilterPad cuda_pad_outputs[] = {{
			
 
				+        .name         = "default",
			
 
				+        .type         = AVMEDIA_TYPE_VIDEO,
			
 
				+        .config_props = cuda_pad_config_props,
			
 
				+}};
			
 
				+
			
 
				+const FFFilter ff_vf_pad_cuda = {
			
 
				+    .p.name         = "pad_cuda",
			
 
				+    .p.description  = NULL_IF_CONFIG_SMALL("CUDA-based GPU padding filter"),
			
 
				+    .init           = cuda_pad_init,
			
 
				+    .uninit         = cuda_pad_uninit,
			
 
				+
			
 
				+    .p.priv_class   = &cuda_pad_class,
			
 
				+
			
 
				+    FILTER_INPUTS(cuda_pad_inputs),
			
 
				+    FILTER_OUTPUTS(cuda_pad_outputs),
			
 
				+
			
 
				+    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_CUDA),
			
 
				+
			
 
				+    .priv_size      = sizeof(CUDAPadContext),
			
 
				+    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE,
			
 
				+};
			
--- a/libavfilter/vf_pad_cuda.cu
+++ b/libavfilter/vf_pad_cuda.cu
@@ -0,0 +1,61 @@
 
				+/*
			
 
				+ * This file is part of FFmpeg.
			
 
				+ *
			
 
				+ * FFmpeg is free software; you can redistribute it and/or
			
 
				+ * modify it under the terms of the GNU Lesser General Public
			
 
				+ * License as published by the Free Software Foundation; either
			
 
				+ * version 2.1 of the License, or (at your option) any later version.
			
 
				+ *
			
 
				+ * FFmpeg is distributed in the hope that it will be useful,
			
 
				+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
			
 
				+ * Lesser General Public License for more details.
			
 
				+ *
			
 
				+ * You should have received a copy of the GNU Lesser General Public
			
 
				+ * License along with FFmpeg; if not, write to the Free Software
			
 
				+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
			
 
				+ */
			
 
				+
			
 
				+template <typename T>
			
 
				+__device__ void pad_impl(T* dst, int dst_pitch, int dst_w, int dst_h,
			
 
				+                         const T* src, int src_pitch, int src_w, int src_h,
			
 
				+                         int roi_x, int roi_y, T fill_val)
			
 
				+{
			
 
				+    const int x = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+    const int y = blockIdx.y * blockDim.y + threadIdx.y;
			
 
				+
			
 
				+    if (x >= dst_w || y >= dst_h) {
			
 
				+        return;
			
 
				+    }
			
 
				+
			
 
				+    if (x >= roi_x && x < (roi_x + src_w) && y >= roi_y && y < (roi_y + src_h)) {
			
 
				+        const int src_x = x - roi_x;
			
 
				+        const int src_y = y - roi_y;
			
 
				+        dst[y * dst_pitch + x] = src[src_y * src_pitch + src_x];
			
 
				+    } else {
			
 
				+        dst[y * dst_pitch + x] = fill_val;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+
			
 
				+extern "C" {
			
 
				+
			
 
				+__global__ void pad_uchar(unsigned char* dst, int dst_pitch, int dst_w, int dst_h,
			
 
				+                          const unsigned char* src, int src_pitch, int src_w, int src_h,
			
 
				+                          int roi_x, int roi_y, unsigned char fill_val)
			
 
				+{
			
 
				+    pad_impl<unsigned char>(dst, dst_pitch, dst_w, dst_h,
			
 
				+                            src, src_pitch, src_w, src_h,
			
 
				+                            roi_x, roi_y, fill_val);
			
 
				+}
			
 
				+
			
 
				+__global__ void pad_uchar2(uchar2* dst, int dst_pitch, int dst_w, int dst_h,
			
 
				+                           const uchar2* src, int src_pitch, int src_w, int src_h,
			
 
				+                           int roi_x, int roi_y, uchar2 fill_val)
			
 
				+{
			
 
				+    pad_impl<uchar2>(dst, dst_pitch, dst_w, dst_h,
			
 
				+                     src, src_pitch, src_w, src_h,
			
 
				+                     roi_x, roi_y, fill_val);
			
 
				+}
			
 
				+
			
 
				+}