Ver Fonte

libavfilter: vf_drawtext filter supports drawing text with detection bounding boxes in side_data

This feature can be used with dnn detection by setting vf_drawtext's option
text_source=side_data_detection_bboxes, for example:
./ffmpeg -i face.jpeg -vf dnn_detect=dnn_backend=openvino:model=face-detection-adas-0001.xml:\
input=data:output=detection_out:labels=face-detection-adas-0001.label,drawbox=box_source=
side_data_detection_bboxes,drawtext=text_source=side_data_detection_bboxes:fontcolor=green:\
fontsize=40, -y face_detect.jpeg
Please note, the default fontsize of vf_drawtext is 12, which may be too
small to be seen clearly.

Signed-off-by: Ting Fu <ting.fu@intel.com>
Ting Fu há 4 anos atrás
pai
commit
7a879cce37
2 ficheiros alterados com 79 adições e 6 exclusões
  1. 8 0
      doc/filters.texi
  2. 71 6
      libavfilter/vf_drawtext.c

+ 8 - 0
doc/filters.texi

@@ -10788,6 +10788,14 @@ parameter @var{text}.
 
 
 If both @var{text} and @var{textfile} are specified, an error is thrown.
 If both @var{text} and @var{textfile} are specified, an error is thrown.
 
 
+@item text_source
+Set this to @code{side_data_detection_bboxes} to draw the text data stored in
+the detection bboxes of the frame side data.
+
+If text source is set, @var{text} and @var{textfile} are ignored and the text
+data in the detection bboxes of the side data is used instead. So please do not
+use this parameter if you are not sure about the text source.
+
 @item reload
 @item reload
 If set to 1, the @var{textfile} will be reloaded before each frame.
 If set to 1, the @var{textfile} will be reloaded before each frame.
 Be sure to update it atomically, or it may be read partially, or even fail.
 Be sure to update it atomically, or it may be read partially, or even fail.

+ 71 - 6
libavfilter/vf_drawtext.c

@@ -55,6 +55,7 @@
 #include "libavutil/time_internal.h"
 #include "libavutil/time_internal.h"
 #include "libavutil/tree.h"
 #include "libavutil/tree.h"
 #include "libavutil/lfg.h"
 #include "libavutil/lfg.h"
+#include "libavutil/detection_bbox.h"
 #include "avfilter.h"
 #include "avfilter.h"
 #include "drawutils.h"
 #include "drawutils.h"
 #include "formats.h"
 #include "formats.h"
@@ -199,6 +200,8 @@ typedef struct DrawTextContext {
     int tc24hmax;                   ///< 1 if timecode is wrapped to 24 hours, 0 otherwise
     int tc24hmax;                   ///< 1 if timecode is wrapped to 24 hours, 0 otherwise
     int reload;                     ///< reload text file for each frame
     int reload;                     ///< reload text file for each frame
     int start_number;               ///< starting frame number for n/frame_num var
     int start_number;               ///< starting frame number for n/frame_num var
+    char *text_source_string;       ///< the string to specify text data source
+    enum AVFrameSideDataType text_source;
 #if CONFIG_LIBFRIBIDI
 #if CONFIG_LIBFRIBIDI
     int text_shaping;               ///< 1 to shape the text before drawing it
     int text_shaping;               ///< 1 to shape the text before drawing it
 #endif
 #endif
@@ -246,6 +249,7 @@ static const AVOption drawtext_options[]= {
     { "alpha",       "apply alpha while rendering", OFFSET(a_expr),      AV_OPT_TYPE_STRING, { .str = "1"     },          .flags = FLAGS },
     { "alpha",       "apply alpha while rendering", OFFSET(a_expr),      AV_OPT_TYPE_STRING, { .str = "1"     },          .flags = FLAGS },
     {"fix_bounds", "check and fix text coords to avoid clipping", OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS},
     {"fix_bounds", "check and fix text coords to avoid clipping", OFFSET(fix_bounds), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS},
     {"start_number", "start frame number for n/frame_num variable", OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS},
     {"start_number", "start frame number for n/frame_num variable", OFFSET(start_number), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS},
+    {"text_source", "the source of text", OFFSET(text_source_string), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 1, FLAGS },
 
 
 #if CONFIG_LIBFRIBIDI
 #if CONFIG_LIBFRIBIDI
     {"text_shaping", "attempt to shape text before drawing", OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS},
     {"text_shaping", "attempt to shape text before drawing", OFFSET(text_shaping), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, FLAGS},
@@ -690,6 +694,16 @@ out:
 }
 }
 #endif
 #endif
 
 
+/**
+ * Map the text_source option string to a frame side data type.
+ *
+ * The string must not be NULL (enforced with av_assert0). Only
+ * "side_data_detection_bboxes" is recognized; for any other value
+ * AVERROR(EINVAL) is returned through the enum return type.
+ */
+static enum AVFrameSideDataType text_source_string_parse(const char *text_source_string)
+{
+    av_assert0(text_source_string);
+
+    if (strcmp(text_source_string, "side_data_detection_bboxes") != 0)
+        return AVERROR(EINVAL);
+
+    return AV_FRAME_DATA_DETECTION_BBOXES;
+}
+
 static av_cold int init(AVFilterContext *ctx)
 static av_cold int init(AVFilterContext *ctx)
 {
 {
     int err;
     int err;
@@ -731,9 +745,28 @@ static av_cold int init(AVFilterContext *ctx)
             s->text = av_strdup("");
             s->text = av_strdup("");
     }
     }
 
 
+    if (s->text_source_string) {
+        s->text_source = text_source_string_parse(s->text_source_string);
+        if ((int)s->text_source < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Error text source: %s\n", s->text_source_string);
+            return AVERROR(EINVAL);
+        }
+    }
+
+    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
+        if (s->text) {
+            av_log(ctx, AV_LOG_WARNING, "Multiple texts provided, will use text_source only\n");
+            av_free(s->text);
+        }
+        s->text = av_mallocz(AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE *
+                             (AV_NUM_DETECTION_BBOX_CLASSIFY + 1));
+        if (!s->text)
+            return AVERROR(ENOMEM);
+    }
+
     if (!s->text) {
     if (!s->text) {
         av_log(ctx, AV_LOG_ERROR,
         av_log(ctx, AV_LOG_ERROR,
-               "Either text, a valid file or a timecode must be provided\n");
+               "Either text, a valid file, a timecode or text source must be provided\n");
         return AVERROR(EINVAL);
         return AVERROR(EINVAL);
     }
     }
 
 
@@ -1440,10 +1473,15 @@ continue_on_invalid2:
 
 
     s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] = s->max_glyph_h;
     s->var_values[VAR_LINE_H] = s->var_values[VAR_LH] = s->max_glyph_h;
 
 
-    s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
-    s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, &s->prng);
-    /* It is necessary if x is expressed from y  */
-    s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
+    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
+        s->var_values[VAR_X] = s->x;
+        s->var_values[VAR_Y] = s->y;
+    } else {
+        s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
+        s->y = s->var_values[VAR_Y] = av_expr_eval(s->y_pexpr, s->var_values, &s->prng);
+        /* It is necessary if x is expressed from y  */
+        s->x = s->var_values[VAR_X] = av_expr_eval(s->x_pexpr, s->var_values, &s->prng);
+    }
 
 
     update_alpha(s);
     update_alpha(s);
     update_color_with_alpha(s, &fontcolor  , s->fontcolor  );
     update_color_with_alpha(s, &fontcolor  , s->fontcolor  );
@@ -1511,6 +1549,21 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
     AVFilterLink *outlink = ctx->outputs[0];
     AVFilterLink *outlink = ctx->outputs[0];
     DrawTextContext *s = ctx->priv;
     DrawTextContext *s = ctx->priv;
     int ret;
     int ret;
+    const AVDetectionBBoxHeader *header = NULL;
+    const AVDetectionBBox *bbox;
+    AVFrameSideData *sd;
+    int loop = 1; /* number of draw_text() passes; stays 1 unless bboxes are used */
+
+    /* When the text comes from detection bboxes, look the side data up on
+     * this frame. sd is only assigned by this lookup, so it must not be
+     * tested before the call. */
+    if (s->text_source == AV_FRAME_DATA_DETECTION_BBOXES) {
+        sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
+        if (sd) {
+            header = (AVDetectionBBoxHeader *)sd->data;
+            loop = header->nb_bboxes;
+        } else {
+            /* no side data on this frame: forward it without drawing */
+            av_log(s, AV_LOG_WARNING, "No detection bboxes.\n");
+            return ff_filter_frame(outlink, frame);
+        }
+    }
 
 
     if (s->reload) {
     if (s->reload) {
         if ((ret = load_textfile(ctx)) < 0) {
         if ((ret = load_textfile(ctx)) < 0) {
@@ -1536,7 +1589,19 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
     s->var_values[VAR_PKT_SIZE] = frame->pkt_size;
     s->var_values[VAR_PKT_SIZE] = frame->pkt_size;
     s->metadata = frame->metadata;
     s->metadata = frame->metadata;
 
 
-    draw_text(ctx, frame, frame->width, frame->height);
+    for (int i = 0; i < loop; i++) {
+        if (header) {
+            bbox = av_get_detection_bbox(header, i);
+            strcpy(s->text, bbox->detect_label);
+            for (int j = 0; j < bbox->classify_count; j++) {
+                strcat(s->text, ", ");
+                strcat(s->text, bbox->classify_labels[j]);
+            }
+            s->x = bbox->x;
+            s->y = bbox->y - s->fontsize;
+        }
+        draw_text(ctx, frame, frame->width, frame->height);
+    }
 
 
     av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f text_w:%d text_h:%d x:%d y:%d\n",
     av_log(ctx, AV_LOG_DEBUG, "n:%d t:%f text_w:%d text_h:%d x:%d y:%d\n",
            (int)s->var_values[VAR_N], s->var_values[VAR_T],
            (int)s->var_values[VAR_N], s->var_values[VAR_T],