@@ -0,0 +1,463 @@
+/*
+ * Copyright (c) 2025 Vittorio Palmisano
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
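+
+/**
+ * @file
+ * Audio transcription filter based on whisper.cpp.
+ *
+ * Mono float samples at WHISPER_SAMPLE_RATE are queued and periodically fed
+ * to whisper_full(); the recognized text is attached to the passing frames
+ * as lavfi.whisper.text metadata and, optionally, written to a destination
+ * in text, SRT or JSON format. A Silero VAD model can be used to cut the
+ * queue at detected speech boundaries instead of at fixed intervals.
+ *
+ * Usage sketch (the model file name is a placeholder; ggml models have to
+ * be obtained separately, e.g. from the whisper.cpp project):
+ *
+ *   ffmpeg -i input.wav \
+ *       -af "whisper=model=ggml-base.en.bin:language=en:queue=3:destination=out.srt:format=srt" \
+ *       -f null -
+ */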
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <whisper.h>
+
+#include "libavutil/avstring.h"
+#include "libavutil/avutil.h"
+#include "libavutil/channel_layout.h"
+#include "libavutil/internal.h"
+#include "libavutil/mem.h"
+#include "libavutil/opt.h"
+#include "libavutil/samplefmt.h"
+#include "libavutil/thread.h"
+#include "libavformat/avio.h"
+
+#include "audio.h"
+#include "avfilter.h"
+#include "filters.h"
+#include "formats.h"
+
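+/*
+ * Filter state. Input samples (mono float at WHISPER_SAMPLE_RATE) accumulate
+ * in audio_buffer: audio_buffer_fill_size counts the queued samples,
+ * audio_buffer_vad_size how many of them the VAD has already scanned, and
+ * audio_buffer_start_ms the presentation time of the first queued sample in
+ * milliseconds.
+ */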
+typedef struct WhisperContext {
+    const AVClass *class;
+    char *model_path;
+    char *language;
+    int use_gpu;
+    int gpu_device;
+    char *vad_model_path;
+    float vad_threshold;
+    int64_t vad_min_speech_duration;
+    int64_t vad_min_silence_duration;
+
+    int64_t queue;
+    char *destination;
+    char *format;
+
+    struct whisper_context *ctx_wsp;
+    struct whisper_vad_context *ctx_vad;
+    struct whisper_vad_params vad_params;
+
+    float *audio_buffer;
+    int audio_buffer_queue_size;
+    int audio_buffer_fill_size;
+    int audio_buffer_vad_size;
+    int64_t audio_buffer_start_ms;
+
+    int eof;
+    int64_t next_pts;
+
+    AVIOContext *avio_context;
+    int index;
+} WhisperContext;
+
+static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
+{
+    AVFilterContext *ctx = user_data;
+    int av_log_level = AV_LOG_DEBUG;
+
+    switch (level) {
+    case GGML_LOG_LEVEL_ERROR:
+        av_log_level = AV_LOG_ERROR;
+        break;
+    case GGML_LOG_LEVEL_WARN:
+        av_log_level = AV_LOG_WARNING;
+        break;
+    }
+    av_log(ctx, av_log_level, "%s", text);
+}
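+
+/* Load the whisper model, size the sample queue from the 'queue' duration and
+ * optionally set up the VAD context and the destination AVIOContext. */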
+static av_cold int init(AVFilterContext *ctx)
+{
+    WhisperContext *wctx = ctx->priv;
+
+    static AVOnce init_static_once = AV_ONCE_INIT;
+    ff_thread_once(&init_static_once, ggml_backend_load_all);
+
+    whisper_log_set(cb_log, ctx);
+
+    // Init whisper context
+    if (!wctx->model_path) {
+        av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n");
+        return AVERROR(EINVAL);
+    }
+
+    struct whisper_context_params params = whisper_context_default_params();
+    params.use_gpu = wctx->use_gpu;
+    params.gpu_device = wctx->gpu_device;
+
+    wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params);
+    if (!wctx->ctx_wsp) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path);
+        return AVERROR(EIO);
+    }
+
+    // Init sample queue: 'queue' is in AV_TIME_BASE (microsecond) units
+    wctx->audio_buffer_queue_size = av_rescale(wctx->queue, WHISPER_SAMPLE_RATE, AV_TIME_BASE);
+    wctx->audio_buffer = av_malloc_array(wctx->audio_buffer_queue_size, sizeof(*wctx->audio_buffer));
+    if (!wctx->audio_buffer)
+        return AVERROR(ENOMEM);
+
+    // Init VAD model context
+    if (wctx->vad_model_path) {
+        struct whisper_vad_context_params ctx_params = whisper_vad_default_context_params();
+        ctx_params.n_threads = ff_filter_get_nb_threads(ctx);
+        // ctx_params.use_gpu = wctx->use_gpu; TODO (see: whisper_vad_init_context)
+        ctx_params.gpu_device = wctx->gpu_device;
+        wctx->ctx_vad = whisper_vad_init_from_file_with_params(wctx->vad_model_path, ctx_params);
+        if (!wctx->ctx_vad) {
+            av_log(ctx, AV_LOG_ERROR, "Failed to initialize VAD context from model: %s\n", wctx->vad_model_path);
+            return AVERROR(EIO);
+        }
+
+        wctx->vad_params = whisper_vad_default_params();
+        wctx->vad_params.threshold = wctx->vad_threshold;
+        wctx->vad_params.min_speech_duration_ms = av_rescale(wctx->vad_min_speech_duration, 1000, AV_TIME_BASE);
+        wctx->vad_params.min_silence_duration_ms = av_rescale(wctx->vad_min_silence_duration, 1000, AV_TIME_BASE);
+        wctx->vad_params.max_speech_duration_s = av_rescale(wctx->queue, 1, AV_TIME_BASE);
+        wctx->vad_params.speech_pad_ms = 0;
+        wctx->vad_params.samples_overlap = 0;
+    }
+
+    wctx->next_pts = AV_NOPTS_VALUE;
+
+    if (wctx->destination && strcmp("", wctx->destination)) {
+        const char *dst = wctx->destination;
+        if (!strcmp("-", dst))
+            dst = "pipe:1";
+        int ret = avio_open(&wctx->avio_context, dst, AVIO_FLAG_WRITE);
+        if (ret < 0) {
+            av_log(ctx, AV_LOG_ERROR, "Could not open %s: %s\n", wctx->destination, av_err2str(ret));
+            return ret;
+        }
+
+        wctx->avio_context->direct = AVIO_FLAG_DIRECT;
+    }
+
+    av_log(ctx, AV_LOG_INFO,
+           "Whisper filter initialized: model: %s lang: %s queue: %"PRId64" ms\n",
+           wctx->model_path, wctx->language, wctx->queue / 1000);
+
+    return 0;
+}
+
+static av_cold void uninit(AVFilterContext *ctx)
+{
+    WhisperContext *wctx = ctx->priv;
+
+    if (wctx->audio_buffer_fill_size > 0) {
+        av_log(ctx, AV_LOG_WARNING,
+               "Discarding %d samples (%d seconds) of audio still queued at close\n",
+               wctx->audio_buffer_fill_size, wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
+    }
+
+    if (wctx->ctx_vad) {
+        whisper_vad_free(wctx->ctx_vad);
+        wctx->ctx_vad = NULL;
+    }
+
+    if (wctx->ctx_wsp) {
+        whisper_free(wctx->ctx_wsp);
+        wctx->ctx_wsp = NULL;
+    }
+
+    av_freep(&wctx->audio_buffer);
+
+    if (wctx->avio_context)
+        avio_closep(&wctx->avio_context);
+}
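+
+/*
+ * Transcribe the first 'samples' queued samples with whisper_full(), attach
+ * the recognized text to the given frame as lavfi.whisper.* metadata, write
+ * it to the destination (if one is open) and drop the consumed samples from
+ * the queue. whisper.cpp reports segment times t0/t1 in 10 ms ticks, hence
+ * the conversions to milliseconds below.
+ */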
+static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
+{
+    WhisperContext *wctx = ctx->priv;
+    samples = FFMAX(0, FFMIN(samples, wctx->audio_buffer_fill_size));
+
+    if (!wctx->ctx_wsp || samples == 0)
+        return;
+
+    const int64_t timestamp_ms = wctx->audio_buffer_start_ms;
+    const float duration = (float) samples / WHISPER_SAMPLE_RATE;
+
+    av_log(ctx, AV_LOG_INFO,
+           "run transcription at %"PRId64" ms, %d/%d samples (%.2f seconds)...\n",
+           timestamp_ms, samples, wctx->audio_buffer_fill_size, duration);
+
+    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+    params.language = wctx->language;
+    params.n_threads = ff_filter_get_nb_threads(ctx);
+    params.print_special = 0;
+    params.print_progress = 0;
+    params.print_realtime = 0;
+    params.print_timestamps = 0;
+
+    if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, samples) != 0) {
+        av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n");
+        return;
+    }
+
+    const int n_segments = whisper_full_n_segments(wctx->ctx_wsp);
+    char *segments_text = NULL;
+
+    for (int i = 0; i < n_segments; ++i) {
+        const char *text = whisper_full_get_segment_text(wctx->ctx_wsp, i);
+        // Segments usually start with a space; strip it along with the
+        // [BLANK_AUDIO] marker emitted for silence
+        char *text_cleaned = av_strireplace(text[0] == ' ' ? text + 1 : text, "[BLANK_AUDIO]", "");
+
+        if (!text_cleaned || av_strnlen(text_cleaned, 1) == 0) {
+            av_freep(&text_cleaned);
+            continue;
+        }
+
+        const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i);
+        // whisper.cpp timestamps are in 10 ms ticks
+        const int64_t t0_ms = whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10;
+        const int64_t t1_ms = whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10;
+
+        av_log(ctx, AV_LOG_DEBUG, " [%"PRId64"-%"PRId64"%s]: \"%s\"\n",
+               timestamp_ms + t0_ms, timestamp_ms + t1_ms, turn ? " (turn)" : "", text_cleaned);
+
+        if (segments_text) {
+            char *new_text = av_asprintf("%s%s", segments_text, text_cleaned);
+            av_freep(&segments_text);
+            segments_text = new_text;
+        } else
+            segments_text = av_strdup(text_cleaned);
+
+        if (wctx->avio_context) {
+            const int64_t start_t = timestamp_ms + t0_ms;
+            const int64_t end_t = timestamp_ms + t1_ms;
+            char *buf = NULL;
+
+            if (!av_strcasecmp(wctx->format, "srt")) {
+                // SRT cue counters start at 1 and increase per cue; the
+                // millisecond separator is a comma
+                buf = av_asprintf("%d\n%02"PRId64":%02"PRId64":%02"PRId64",%03"PRId64" --> "
+                                  "%02"PRId64":%02"PRId64":%02"PRId64",%03"PRId64"\n%s\n\n",
+                                  ++wctx->index,
+                                  start_t / 3600000, (start_t / 60000) % 60,
+                                  (start_t / 1000) % 60, start_t % 1000,
+                                  end_t / 3600000, (end_t / 60000) % 60,
+                                  (end_t / 1000) % 60, end_t % 1000, text_cleaned);
+            } else if (!av_strcasecmp(wctx->format, "json")) {
+                buf = av_asprintf("{\"start\":%"PRId64",\"end\":%"PRId64",\"text\":\"%s\"}\n",
+                                  start_t, end_t, text_cleaned);
+            } else
+                buf = av_strdup(text_cleaned);
+
+            if (buf) {
+                avio_write(wctx->avio_context, buf, strlen(buf));
+                av_freep(&buf);
+            }
+        }
+
+        av_freep(&text_cleaned);
+    }
+
+    if (segments_text) {
+        av_dict_set(&frame->metadata, "lavfi.whisper.text", segments_text, 0);
+        char *duration_text = av_asprintf("%f", duration);
+        av_dict_set(&frame->metadata, "lavfi.whisper.duration", duration_text, AV_DICT_DONT_STRDUP_VAL);
+    }
+    av_freep(&segments_text);
+
+    if (wctx->audio_buffer_fill_size > samples) {
+        // The source and destination regions overlap, so memmove, not memcpy
+        memmove(wctx->audio_buffer, wctx->audio_buffer + samples,
+                (wctx->audio_buffer_fill_size - samples) * sizeof(*wctx->audio_buffer));
+        wctx->audio_buffer_start_ms += duration * 1000;
+    }
+    wctx->audio_buffer_fill_size -= samples;
+    wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;
+}
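+
+/*
+ * Queue the frame's samples, flushing the queue first if they would not fit.
+ * With a VAD model loaded, a transcription run is triggered as soon as the
+ * last detected speech segment ends early enough in the queue; without one,
+ * the queue is transcribed whenever it fills up.
+ */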
+static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
+{
+    AVFilterContext *ctx = inlink->dst;
+    WhisperContext *wctx = ctx->priv;
+    AVFilterLink *outlink = ctx->outputs[0];
+
+    const int samples = frame->nb_samples;
+    const float *input_data = (const float *) frame->data[0];
+
+    // Flush the queue if the new samples would not fit
+    if (wctx->audio_buffer_fill_size + samples > wctx->audio_buffer_queue_size)
+        run_transcription(ctx, frame, wctx->audio_buffer_fill_size);
+
+    if (!wctx->audio_buffer_fill_size)
+        wctx->audio_buffer_start_ms = av_rescale_q(frame->pts, inlink->time_base,
+                                                   (AVRational) { 1, 1000 });
+
+    memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data,
+           samples * sizeof(*wctx->audio_buffer));
+    wctx->audio_buffer_fill_size += samples;
+
+    if (wctx->ctx_vad
+        && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >=
+        av_rescale(wctx->vad_min_speech_duration + wctx->vad_min_silence_duration,
+                   WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
+        struct whisper_vad_segments *segments =
+            whisper_vad_segments_from_samples(wctx->ctx_vad, wctx->vad_params,
+                                              wctx->audio_buffer,
+                                              wctx->audio_buffer_fill_size);
+        wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;
+
+        if (!segments) {
+            av_log(ctx, AV_LOG_ERROR, "VAD detection failed\n");
+        } else {
+            int n_segments = whisper_vad_segments_n_segments(segments);
+
+            if (n_segments > 0) {
+                // VAD timestamps are in 10 ms ticks
+                const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0f;
+                const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0f;
+                int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);
+
+                // Transcribe only if the last segment ends at least
+                // vad_min_silence_duration before the end of the queue
+                if (end_pos <= wctx->audio_buffer_fill_size -
+                    av_rescale(wctx->vad_min_silence_duration, WHISPER_SAMPLE_RATE, AV_TIME_BASE)) {
+                    av_log(ctx, AV_LOG_INFO,
+                           "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",
+                           n_segments, start_ms, end_ms,
+                           1000 * wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
+                    run_transcription(ctx, frame, end_pos);
+                }
+            }
+
+            whisper_vad_free_segments(segments);
+        }
+    } else if (wctx->audio_buffer_fill_size >= wctx->audio_buffer_queue_size)
+        run_transcription(ctx, frame, wctx->audio_buffer_fill_size);
+
+    wctx->next_pts = frame->pts +
+        av_rescale_q(samples, (AVRational) { 1, inlink->sample_rate }, inlink->time_base);
+
+    return ff_filter_frame(outlink, frame);
+}
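+
+/*
+ * At EOF, transcribe whatever is still queued and attach the result to a
+ * one-sample silent frame, since metadata needs a frame to travel on.
+ */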
+static int push_last_frame(AVFilterLink *outlink)
+{
+    AVFilterContext *ctx = outlink->src;
+    WhisperContext *wctx = ctx->priv;
+    AVFrame *frame;
+    int n_out = 1;
+
+    if (ctx->is_disabled || wctx->audio_buffer_fill_size == 0)
+        return 0;
+
+    frame = ff_get_audio_buffer(outlink, n_out);
+    if (!frame)
+        return AVERROR(ENOMEM);
+
+    av_samples_set_silence(frame->extended_data, 0, n_out, frame->ch_layout.nb_channels, frame->format);
+
+    frame->pts = wctx->next_pts;
+    if (wctx->next_pts != AV_NOPTS_VALUE)
+        wctx->next_pts += av_rescale_q(n_out, (AVRational) { 1, outlink->sample_rate }, outlink->time_base);
+
+    run_transcription(ctx, frame, wctx->audio_buffer_fill_size);
+
+    return ff_filter_frame(outlink, frame);
+}
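+
+/*
+ * Standard activate() pattern: consume queued input frames, acknowledge EOF
+ * on the input link and flush the remaining audio before forwarding EOF.
+ */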
+static int activate(AVFilterContext *ctx)
+{
+    AVFilterLink *inlink = ctx->inputs[0];
+    AVFilterLink *outlink = ctx->outputs[0];
+    WhisperContext *wctx = ctx->priv;
+    int64_t pts;
+    int status;
+
+    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
+
+    if (!wctx->eof && ff_inlink_queued_frames(inlink)) {
+        AVFrame *frame = NULL;
+        int ret;
+
+        ret = ff_inlink_consume_frame(inlink, &frame);
+        if (ret < 0)
+            return ret;
+        if (ret > 0)
+            return filter_frame(inlink, frame);
+    }
+
+    if (!wctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts))
+        wctx->eof = status == AVERROR_EOF;
+
+    if (wctx->eof) {
+        push_last_frame(outlink);
+
+        ff_outlink_set_status(outlink, AVERROR_EOF, wctx->next_pts);
+        return 0;
+    }
+
+    FF_FILTER_FORWARD_WANTED(outlink, inlink);
+
+    return FFERROR_NOT_READY;
+}
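+
+/*
+ * whisper.cpp consumes mono float samples at WHISPER_SAMPLE_RATE (16 kHz);
+ * constraining the link formats lets lavfi insert the needed conversions.
+ */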
+static int query_formats(const AVFilterContext *ctx,
+                         AVFilterFormatsConfig **cfg_in,
+                         AVFilterFormatsConfig **cfg_out)
+{
+    static const enum AVSampleFormat sample_fmts[] = { AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_NONE };
+    AVChannelLayout chlayouts[] = { FF_COUNT2LAYOUT(1), { 0 } };
+    int sample_rates[] = { WHISPER_SAMPLE_RATE, -1 };
+    int ret;
+
+    ret = ff_set_common_formats_from_list2(ctx, cfg_in, cfg_out, sample_fmts);
+    if (ret < 0)
+        return ret;
+
+    ret = ff_set_common_channel_layouts_from_list2(ctx, cfg_in, cfg_out, chlayouts);
+    if (ret < 0)
+        return ret;
+
+    return ff_set_common_samplerates_from_list2(ctx, cfg_in, cfg_out, sample_rates);
+}
+
+#define OFFSET(x) offsetof(WhisperContext, x)
+#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
+// One hour in AV_TIME_BASE (microsecond) units, used as a duration bound
+#define HOURS 3600000000
+
+static const AVOption whisper_options[] = {
+    { "model", "Path to the whisper.cpp model file", OFFSET(model_path), AV_OPT_TYPE_STRING, .flags = FLAGS },
+    { "language", "Language for transcription ('auto' for auto-detect)", OFFSET(language), AV_OPT_TYPE_STRING, {.str = "auto"}, .flags = FLAGS },
+    { "queue", "Audio duration to queue before transcribing", OFFSET(queue), AV_OPT_TYPE_DURATION, {.i64 = 3000000}, 20000, HOURS, .flags = FLAGS },
+    { "use_gpu", "Use GPU for processing", OFFSET(use_gpu), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, .flags = FLAGS },
+    { "gpu_device", "GPU device index to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
+    { "destination", "Output destination ('-' for stdout)", OFFSET(destination), AV_OPT_TYPE_STRING, {.str = ""}, .flags = FLAGS },
+    { "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, {.str = "text"}, .flags = FLAGS },
+    { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING, .flags = FLAGS },
+    { "vad_threshold", "VAD speech probability threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0, .flags = FLAGS },
+    { "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, {.i64 = 100000}, 20000, HOURS, .flags = FLAGS },
+    { "vad_min_silence_duration", "Minimum silence duration for VAD", OFFSET(vad_min_silence_duration), AV_OPT_TYPE_DURATION, {.i64 = 500000}, 0, HOURS, .flags = FLAGS },
+    { NULL }
+};
+
+static const AVClass whisper_class = {
+    .class_name = "whisper",
+    .item_name = av_default_item_name,
+    .option = whisper_options,
+    .version = LIBAVUTIL_VERSION_INT,
+};
+
+const FFFilter ff_af_whisper = {
+    .p.name = "whisper",
+    .p.description = NULL_IF_CONFIG_SMALL("Transcribe audio using whisper.cpp."),
+    .p.priv_class = &whisper_class,
+    .p.flags = AVFILTER_FLAG_METADATA_ONLY,
+    .init = init,
+    .uninit = uninit,
+    .activate = activate,
+    .priv_size = sizeof(WhisperContext),
+    FILTER_INPUTS(ff_audio_default_filterpad),
+    FILTER_OUTPUTS(ff_audio_default_filterpad),
+    FILTER_QUERY_FUNC2(query_formats),
+};