add silenceremove filter

Signed-off-by: Paul B Mahol <onemda@gmail.com>
11 years ago · 422619646e
--- a/Changelog
+++ b/Changelog
@@ -13,6 +13,7 @@ version <next>:
 - added codecview filter to visualize information exported by some codecs
 - Matroska 3D support thorugh side data
 - HTML generation using texi2html is deprecated in favor of makeinfo/texi2any
 - silenceremove filter


 version 2.3:
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -343,6 +343,7 @@ Filters:
  af_compand.c                          Paul B Mahol
  af_ladspa.c                           Paul B Mahol
  af_pan.c                              Nicolas George
  af_silenceremove.c                    Paul B Mahol
  avf_avectorscope.c                    Paul B Mahol
  avf_showcqt.c                         Muhammad Faiz
  vf_blend.c                            Paul B Mahol
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -42,6 +42,7 @@
    • ported lenscorrection filter from frei0r filter
    • large optimizations in dctdnoiz to make it usable
    • added codecview filter to visualize information exported by some codecs
    • added silenceremove filter

   ┌────────────────────────────┐
   │ libavutil                  │
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -1875,6 +1875,75 @@ ffmpeg -i silence.mp3 -af silencedetect=noise=0.0001 -f null -
@end example
@end itemize

@section silenceremove

 Remove silence from the beginning, middle or end of the audio.

 The filter accepts the following options:

@table @option
@item start_periods
 This value is used to indicate if audio should be trimmed at beginning of
 the audio. A value of zero indicates no silence should be trimmed from the
 beginning. When specifying a non-zero value, it trims audio up until it
 finds non-silence. Normally, when trimming silence from beginning of audio
 the @var{start_periods} will be @code{1} but it can be increased to higher
 values to trim all audio up to specific count of non-silence periods.
 Default value is @code{0}.

@item start_duration
 Specify the amount of time that non-silence must be detected before it stops
 trimming audio. By increasing the duration, bursts of noises can be treated
 as silence and trimmed off. Default value is @code{0}.

@item start_threshold
 This indicates what sample value should be treated as silence. For digital
 audio, a value of @code{0} may be fine but for audio recorded from analog,
 you may wish to increase the value to account for background noise.
 Can be specified in dB (in case "dB" is appended to the specified value)
 or amplitude ratio. Default value is @code{0}.

@item stop_periods
 Set the count for trimming silence from the end of audio.
 To remove silence from the middle of a file, specify a @var{stop_periods}
 that is negative. This value is then threated as a positive value and is
 used to indicate the effect should restart processing as specified by
@var{start_periods}, making it suitable for removing periods of silence
 in the middle of the audio.
 Default value is @code{0}.

@item stop_duration
 Specify a duration of silence that must exist before audio is not copied any
 more. By specifying a higher duration, silence that is wanted can be left in
 the audio.
 Default value is @code{0}.

@item stop_threshold
 This is the same as @option{start_threshold} but for trimming silence from
 the end of audio.
 Can be specified in dB (in case "dB" is appended to the specified value)
 or amplitude ratio. Default value is @code{0}.

@item leave_silence
 This indicate that @var{stop_duration} length of audio should be left intact
 at the beginning of each period of silence.
 For example, if you want to remove long pauses between words but do not want
 to remove the pauses completely. Default value is @code{0}.

@end table

@subsection Examples

@itemize
@item
 The following example shows how this filter can be used to start a recording
 that does not contain the delay at the start which usually occurs between
 pressing the record button and the start of the performance:
@example
 silenceremove=1:5:0.02
@end example
@end itemize

@section treble

 Boost or cut treble (upper) frequencies of the audio using a two-pole
--- a/libavfilter/Makefile
+++ b/libavfilter/Makefile
@@ -78,6 +78,7 @@ OBJS-$(CONFIG_PAN_FILTER)                    += af_pan.o
 OBJS-$(CONFIG_REPLAYGAIN_FILTER)             += af_replaygain.o
 OBJS-$(CONFIG_RESAMPLE_FILTER)               += af_resample.o
 OBJS-$(CONFIG_SILENCEDETECT_FILTER)          += af_silencedetect.o
 OBJS-$(CONFIG_SILENCEREMOVE_FILTER)          += af_silenceremove.o
 OBJS-$(CONFIG_TREBLE_FILTER)                 += af_biquads.o
 OBJS-$(CONFIG_VOLUME_FILTER)                 += af_volume.o
 OBJS-$(CONFIG_VOLUMEDETECT_FILTER)           += af_volumedetect.o
--- a/libavfilter/af_silenceremove.c
+++ b/libavfilter/af_silenceremove.c
@@ -0,0 +1,482 @@
 /*
 * Copyright (c) 2001 Heikki Leinonen
 * Copyright (c) 2001 Chris Bagwell
 * Copyright (c) 2003 Donnie Smith
 * Copyright (c) 2014 Paul B Mahol
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <float.h> /* DBL_MAX */

 #include "libavutil/opt.h"
 #include "libavutil/timestamp.h"
 #include "audio.h"
 #include "formats.h"
 #include "avfilter.h"
 #include "internal.h"

 enum SilenceMode {
    SILENCE_TRIM,
    SILENCE_TRIM_FLUSH,
    SILENCE_COPY,
    SILENCE_COPY_FLUSH,
    SILENCE_STOP
 };

 typedef struct SilenceRemoveContext {
    const AVClass *class;

    enum SilenceMode mode;

    int start_periods;
    int64_t start_duration;
    double start_threshold;

    int stop_periods;
    int64_t stop_duration;
    double stop_threshold;

    double *start_holdoff;
    size_t start_holdoff_offset;
    size_t start_holdoff_end;
    int    start_found_periods;

    double *stop_holdoff;
    size_t stop_holdoff_offset;
    size_t stop_holdoff_end;
    int    stop_found_periods;

    double *window;
    double *window_current;
    double *window_end;
    int window_size;
    double rms_sum;

    int leave_silence;
    int restart;
    int64_t next_pts;
 } SilenceRemoveContext;

 #define OFFSET(x) offsetof(SilenceRemoveContext, x)
 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_AUDIO_PARAM
 static const AVOption silenceremove_options[] = {
    { "start_periods",   NULL, OFFSET(start_periods),   AV_OPT_TYPE_INT,      {.i64=0},     0,    9000, FLAGS },
    { "start_duration",  NULL, OFFSET(start_duration),  AV_OPT_TYPE_DURATION, {.i64=0},     0,    9000, FLAGS },
    { "start_threshold", NULL, OFFSET(start_threshold), AV_OPT_TYPE_DOUBLE,   {.dbl=0},     0, DBL_MAX, FLAGS },
    { "stop_periods",    NULL, OFFSET(stop_periods),    AV_OPT_TYPE_INT,      {.i64=0}, -9000,    9000, FLAGS },
    { "stop_duration",   NULL, OFFSET(stop_duration),   AV_OPT_TYPE_DURATION, {.i64=0},     0,    9000, FLAGS },
    { "stop_threshold",  NULL, OFFSET(stop_threshold),  AV_OPT_TYPE_DOUBLE,   {.dbl=0},     0, DBL_MAX, FLAGS },
    { "leave_silence",   NULL, OFFSET(leave_silence),   AV_OPT_TYPE_INT,      {.i64=0},     0,       1, FLAGS },
    { NULL }
 };

 AVFILTER_DEFINE_CLASS(silenceremove);

 static av_cold int init(AVFilterContext *ctx)
 {
    SilenceRemoveContext *s = ctx->priv;

    if (s->stop_periods < 0) {
        s->stop_periods = -s->stop_periods;
        s->restart = 1;
    }

    return 0;
 }

 static void clear_rms(SilenceRemoveContext *s)
 {
    memset(s->window, 0, s->window_size * sizeof(*s->window));

    s->window_current = s->window;
    s->window_end = s->window + s->window_size;
    s->rms_sum = 0;
 }

 static int config_input(AVFilterLink *inlink)
 {
    AVFilterContext *ctx = inlink->dst;
    SilenceRemoveContext *s = ctx->priv;

    s->window_size = (inlink->sample_rate / 50) * inlink->channels;
    s->window = av_malloc_array(s->window_size, sizeof(*s->window));
    if (!s->window)
        return AVERROR(ENOMEM);

    clear_rms(s);

    s->start_duration = av_rescale(s->start_duration, inlink->sample_rate,
                                   AV_TIME_BASE);
    s->stop_duration  = av_rescale(s->stop_duration, inlink->sample_rate,
                                   AV_TIME_BASE);

    s->start_holdoff = av_malloc_array(FFMAX(s->start_duration, 1),
                                       sizeof(*s->start_holdoff) *
                                       inlink->channels);
    if (!s->start_holdoff)
        return AVERROR(ENOMEM);

    s->start_holdoff_offset = 0;
    s->start_holdoff_end    = 0;
    s->start_found_periods  = 0;

    s->stop_holdoff = av_malloc_array(FFMAX(s->stop_duration, 1),
                                      sizeof(*s->stop_holdoff) *
                                      inlink->channels);
    if (!s->stop_holdoff)
        return AVERROR(ENOMEM);

    s->stop_holdoff_offset = 0;
    s->stop_holdoff_end    = 0;
    s->stop_found_periods  = 0;

    if (s->start_periods)
        s->mode = SILENCE_TRIM;
    else
        s->mode = SILENCE_COPY;

    return 0;
 }

 static int config_output(AVFilterLink *outlink)
 {
    outlink->flags |= FF_LINK_FLAG_REQUEST_LOOP;

    return 0;
 }

 static double compute_rms(SilenceRemoveContext *s, double sample)
 {
    double new_sum;

    new_sum  = s->rms_sum;
    new_sum -= *s->window_current;
    new_sum += sample * sample;

    return sqrt(new_sum / s->window_size);
 }

 static void update_rms(SilenceRemoveContext *s, double sample)
 {
    s->rms_sum -= *s->window_current;
    *s->window_current = sample * sample;
    s->rms_sum += *s->window_current;

    s->window_current++;
    if (s->window_current >= s->window_end)
        s->window_current = s->window;
 }

 static void flush(AVFrame *out, AVFilterLink *outlink,
                  int *nb_samples_written, int *ret)
 {
    if (*nb_samples_written) {
        out->nb_samples = *nb_samples_written / outlink->channels;
        *ret = ff_filter_frame(outlink, out);
        *nb_samples_written = 0;
    } else {
        av_frame_free(&out);
    }
 }

 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
 {
    AVFilterContext *ctx = inlink->dst;
    AVFilterLink *outlink = ctx->outputs[0];
    SilenceRemoveContext *s = ctx->priv;
    int i, j, threshold, ret = 0;
    int nbs, nb_samples_read, nb_samples_written;
    double *obuf, *ibuf = (double *)in->data[0];
    AVFrame *out;

    nb_samples_read = nb_samples_written = 0;

    switch (s->mode) {
    case SILENCE_TRIM:
 silence_trim:
        nbs = in->nb_samples - nb_samples_read / inlink->channels;
        if (!nbs)
            break;

        for (i = 0; i < nbs; i++) {
            threshold = 0;
            for (j = 0; j < inlink->channels; j++) {
                threshold |= compute_rms(s, ibuf[j]) > s->start_threshold;
            }

            if (threshold) {
                for (j = 0; j < inlink->channels; j++) {
                    update_rms(s, *ibuf);
                    s->start_holdoff[s->start_holdoff_end++] = *ibuf++;
                    nb_samples_read++;
                }

                if (s->start_holdoff_end >= s->start_duration * inlink->channels) {
                    if (++s->start_found_periods >= s->start_periods) {
                        s->mode = SILENCE_TRIM_FLUSH;
                        goto silence_trim_flush;
                    }

                    s->start_holdoff_offset = 0;
                    s->start_holdoff_end = 0;
                }
            } else {
                s->start_holdoff_end = 0;

                for (j = 0; j < inlink->channels; j++)
                    update_rms(s, ibuf[j]);

                ibuf += inlink->channels;
                nb_samples_read += inlink->channels;
            }
        }
        break;

    case SILENCE_TRIM_FLUSH:
 silence_trim_flush:
        nbs  = s->start_holdoff_end - s->start_holdoff_offset;
        nbs -= nbs % inlink->channels;
        if (!nbs)
            break;

        out = ff_get_audio_buffer(inlink, nbs / inlink->channels);
        if (!out) {
            av_frame_free(&in);
            return AVERROR(ENOMEM);
        }

        memcpy(out->data[0], &s->start_holdoff[s->start_holdoff_offset],
               nbs * sizeof(double));
        s->start_holdoff_offset += nbs;

        ret = ff_filter_frame(outlink, out);

        if (s->start_holdoff_offset == s->start_holdoff_end) {
            s->start_holdoff_offset = 0;
            s->start_holdoff_end = 0;
            s->mode = SILENCE_COPY;
            goto silence_copy;
        }
        break;

    case SILENCE_COPY:
 silence_copy:
        nbs = in->nb_samples - nb_samples_read / inlink->channels;
        if (!nbs)
            break;

        out = ff_get_audio_buffer(inlink, nbs);
        if (!out) {
            av_frame_free(&in);
            return AVERROR(ENOMEM);
        }
        obuf = (double *)out->data[0];

        if (s->stop_periods) {
            for (i = 0; i < nbs; i++) {
                threshold = 1;
                for (j = 0; j < inlink->channels; j++)
                    threshold &= compute_rms(s, ibuf[j]) > s->stop_threshold;

                if (threshold && s->stop_holdoff_end && !s->leave_silence) {
                    s->mode = SILENCE_COPY_FLUSH;
                    flush(out, outlink, &nb_samples_written, &ret);
                    goto silence_copy_flush;
                } else if (threshold) {
                    for (j = 0; j < inlink->channels; j++) {
                        update_rms(s, *ibuf);
                        *obuf++ = *ibuf++;
                        nb_samples_read++;
                        nb_samples_written++;
                    }
                } else if (!threshold) {
                    for (j = 0; j < inlink->channels; j++) {
                        update_rms(s, *ibuf);
                        if (s->leave_silence) {
                            *obuf++ = *ibuf;
                            nb_samples_written++;
                        }

                        s->stop_holdoff[s->stop_holdoff_end++] = *ibuf++;
                        nb_samples_read++;
                    }

                    if (s->stop_holdoff_end >= s->stop_duration * inlink->channels) {
                        if (++s->stop_found_periods >= s->stop_periods) {
                            s->stop_holdoff_offset = 0;
                            s->stop_holdoff_end = 0;

                            if (!s->restart) {
                                s->mode = SILENCE_STOP;
                                flush(out, outlink, &nb_samples_written, &ret);
                                goto silence_stop;
                            } else {
                                s->stop_found_periods = 0;
                                s->start_found_periods = 0;
                                s->start_holdoff_offset = 0;
                                s->start_holdoff_end = 0;
                                clear_rms(s);
                                s->mode = SILENCE_TRIM;
                                flush(out, outlink, &nb_samples_written, &ret);
                                goto silence_trim;
                            }
                        } else {
                            s->mode = SILENCE_COPY_FLUSH;
                            flush(out, outlink, &nb_samples_written, &ret);
                            goto silence_copy_flush;
                        }
                        flush(out, outlink, &nb_samples_written, &ret);
                        break;
                    }
                }
            }
            flush(out, outlink, &nb_samples_written, &ret);
        } else {
            memcpy(obuf, ibuf, sizeof(double) * nbs * inlink->channels);
            ret = ff_filter_frame(outlink, out);
        }
        break;

    case SILENCE_COPY_FLUSH:
 silence_copy_flush:
        nbs  = s->stop_holdoff_end - s->stop_holdoff_offset;
        nbs -= nbs % inlink->channels;
        if (!nbs)
            break;

        out = ff_get_audio_buffer(inlink, nbs / inlink->channels);
        if (!out) {
            av_frame_free(&in);
            return AVERROR(ENOMEM);
        }

        memcpy(out->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
               nbs * sizeof(double));
        s->stop_holdoff_offset += nbs;

        ret = ff_filter_frame(outlink, out);

        if (s->stop_holdoff_offset == s->stop_holdoff_end) {
            s->stop_holdoff_offset = 0;
            s->stop_holdoff_end = 0;
            s->mode = SILENCE_COPY;
            goto silence_copy;
        }
        break;
    case SILENCE_STOP:
 silence_stop:
        break;
    }

    av_frame_free(&in);

    return ret;
 }

 static int request_frame(AVFilterLink *outlink)
 {
    AVFilterContext *ctx = outlink->src;
    SilenceRemoveContext *s = ctx->priv;
    int ret;

    ret = ff_request_frame(ctx->inputs[0]);
    if (ret == AVERROR_EOF && (s->mode == SILENCE_COPY_FLUSH ||
                               s->mode == SILENCE_COPY)) {
        int nbs = s->stop_holdoff_end - s->stop_holdoff_offset;
        if (nbs) {
            AVFrame *frame;

            frame = ff_get_audio_buffer(outlink, nbs / outlink->channels);
            if (!frame)
                return AVERROR(ENOMEM);

            memcpy(frame->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
                   nbs * sizeof(double));
            ret = ff_filter_frame(ctx->inputs[0], frame);
        }
        s->mode = SILENCE_STOP;
    }
    return ret;
 }

 static int query_formats(AVFilterContext *ctx)
 {
    AVFilterFormats *formats = NULL;
    AVFilterChannelLayouts *layouts = NULL;
    static const enum AVSampleFormat sample_fmts[] = {
        AV_SAMPLE_FMT_DBL, AV_SAMPLE_FMT_NONE
    };

    layouts = ff_all_channel_layouts();
    if (!layouts)
        return AVERROR(ENOMEM);
    ff_set_common_channel_layouts(ctx, layouts);

    formats = ff_make_format_list(sample_fmts);
    if (!formats)
        return AVERROR(ENOMEM);
    ff_set_common_formats(ctx, formats);

    formats = ff_all_samplerates();
    if (!formats)
        return AVERROR(ENOMEM);
    ff_set_common_samplerates(ctx, formats);

    return 0;
 }

 static av_cold void uninit(AVFilterContext *ctx)
 {
    SilenceRemoveContext *s = ctx->priv;

    av_freep(&s->start_holdoff);
    av_freep(&s->stop_holdoff);
    av_freep(&s->window);
 }

 static const AVFilterPad silenceremove_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_AUDIO,
        .config_props = config_input,
        .filter_frame = filter_frame,
    },
    { NULL }
 };

 static const AVFilterPad silenceremove_outputs[] = {
    {
        .name          = "default",
        .type          = AVMEDIA_TYPE_AUDIO,
        .config_props  = config_output,
        .request_frame = request_frame,
    },
    { NULL }
 };

 AVFilter ff_af_silenceremove = {
    .name          = "silenceremove",
    .description   = NULL_IF_CONFIG_SMALL("Remove silence."),
    .priv_size     = sizeof(SilenceRemoveContext),
    .priv_class    = &silenceremove_class,
    .init          = init,
    .uninit        = uninit,
    .query_formats = query_formats,
    .inputs        = silenceremove_inputs,
    .outputs       = silenceremove_outputs,
 };
--- a/libavfilter/allfilters.c
+++ b/libavfilter/allfilters.c
@@ -96,6 +96,7 @@ void avfilter_register_all(void)
    REGISTER_FILTER(REPLAYGAIN,     replaygain,     af);
    REGISTER_FILTER(RESAMPLE,       resample,       af);
    REGISTER_FILTER(SILENCEDETECT,  silencedetect,  af);
    REGISTER_FILTER(SILENCEREMOVE,  silenceremove,  af);
    REGISTER_FILTER(TREBLE,         treble,         af);
    REGISTER_FILTER(VOLUME,         volume,         af);
    REGISTER_FILTER(VOLUMEDETECT,   volumedetect,   af);
--- a/libavfilter/version.h
+++ b/libavfilter/version.h
@@ -30,8 +30,8 @@
 #include "libavutil/version.h"

 #define LIBAVFILTER_VERSION_MAJOR  5
 #define LIBAVFILTER_VERSION_MINOR  0
 #define LIBAVFILTER_VERSION_MICRO 103
 #define LIBAVFILTER_VERSION_MINOR  1
 #define LIBAVFILTER_VERSION_MICRO 100

 #define LIBAVFILTER_VERSION_INT AV_VERSION_INT(LIBAVFILTER_VERSION_MAJOR, \
                                               LIBAVFILTER_VERSION_MINOR, \