You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

475 lines
14KB

  1. /*
  2. * Copyright (c) 2001 Heikki Leinonen
  3. * Copyright (c) 2001 Chris Bagwell
  4. * Copyright (c) 2003 Donnie Smith
  5. * Copyright (c) 2014 Paul B Mahol
  6. *
  7. * This file is part of FFmpeg.
  8. *
  9. * FFmpeg is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * FFmpeg is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with FFmpeg; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #include <float.h> /* DBL_MAX */
  24. #include "libavutil/opt.h"
  25. #include "libavutil/timestamp.h"
  26. #include "audio.h"
  27. #include "formats.h"
  28. #include "avfilter.h"
  29. #include "internal.h"
  /*
   * States of the per-frame state machine driven by filter_frame().
   * The numeric values are the implicit ones (0..4), spelled out here.
   */
  enum SilenceMode {
      SILENCE_TRIM       = 0, /* scanning input, holding back audio until enough non-silence is seen */
      SILENCE_TRIM_FLUSH = 1, /* emitting the contents of the start hold-off buffer */
      SILENCE_COPY       = 2, /* copying input to output while watching for silence */
      SILENCE_COPY_FLUSH = 3, /* emitting the contents of the stop hold-off buffer */
      SILENCE_STOP       = 4  /* finished: input frames are dropped */
  };
  37. typedef struct SilenceRemoveContext {
  38. const AVClass *class;
  39. enum SilenceMode mode;
  40. int start_periods;
  41. int64_t start_duration;
  42. double start_threshold;
  43. int stop_periods;
  44. int64_t stop_duration;
  45. double stop_threshold;
  46. double *start_holdoff;
  47. size_t start_holdoff_offset;
  48. size_t start_holdoff_end;
  49. int start_found_periods;
  50. double *stop_holdoff;
  51. size_t stop_holdoff_offset;
  52. size_t stop_holdoff_end;
  53. int stop_found_periods;
  54. double *window;
  55. double *window_current;
  56. double *window_end;
  57. int window_size;
  58. double rms_sum;
  59. int leave_silence;
  60. int restart;
  61. int64_t next_pts;
  62. } SilenceRemoveContext;
  63. #define OFFSET(x) offsetof(SilenceRemoveContext, x)
  64. #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_AUDIO_PARAM
  65. static const AVOption silenceremove_options[] = {
  66. { "start_periods", NULL, OFFSET(start_periods), AV_OPT_TYPE_INT, {.i64=0}, 0, 9000, FLAGS },
  67. { "start_duration", NULL, OFFSET(start_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS },
  68. { "start_threshold", NULL, OFFSET(start_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS },
  69. { "stop_periods", NULL, OFFSET(stop_periods), AV_OPT_TYPE_INT, {.i64=0}, -9000, 9000, FLAGS },
  70. { "stop_duration", NULL, OFFSET(stop_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS },
  71. { "stop_threshold", NULL, OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS },
  72. { "leave_silence", NULL, OFFSET(leave_silence), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
  73. { NULL }
  74. };
  75. AVFILTER_DEFINE_CLASS(silenceremove);
  76. static av_cold int init(AVFilterContext *ctx)
  77. {
  78. SilenceRemoveContext *s = ctx->priv;
  79. if (s->stop_periods < 0) {
  80. s->stop_periods = -s->stop_periods;
  81. s->restart = 1;
  82. }
  83. return 0;
  84. }
  85. static void clear_rms(SilenceRemoveContext *s)
  86. {
  87. memset(s->window, 0, s->window_size * sizeof(*s->window));
  88. s->window_current = s->window;
  89. s->window_end = s->window + s->window_size;
  90. s->rms_sum = 0;
  91. }
  92. static int config_input(AVFilterLink *inlink)
  93. {
  94. AVFilterContext *ctx = inlink->dst;
  95. SilenceRemoveContext *s = ctx->priv;
  96. s->window_size = (inlink->sample_rate / 50) * inlink->channels;
  97. s->window = av_malloc_array(s->window_size, sizeof(*s->window));
  98. if (!s->window)
  99. return AVERROR(ENOMEM);
  100. clear_rms(s);
  101. s->start_duration = av_rescale(s->start_duration, inlink->sample_rate,
  102. AV_TIME_BASE);
  103. s->stop_duration = av_rescale(s->stop_duration, inlink->sample_rate,
  104. AV_TIME_BASE);
  105. s->start_holdoff = av_malloc_array(FFMAX(s->start_duration, 1),
  106. sizeof(*s->start_holdoff) *
  107. inlink->channels);
  108. if (!s->start_holdoff)
  109. return AVERROR(ENOMEM);
  110. s->start_holdoff_offset = 0;
  111. s->start_holdoff_end = 0;
  112. s->start_found_periods = 0;
  113. s->stop_holdoff = av_malloc_array(FFMAX(s->stop_duration, 1),
  114. sizeof(*s->stop_holdoff) *
  115. inlink->channels);
  116. if (!s->stop_holdoff)
  117. return AVERROR(ENOMEM);
  118. s->stop_holdoff_offset = 0;
  119. s->stop_holdoff_end = 0;
  120. s->stop_found_periods = 0;
  121. if (s->start_periods)
  122. s->mode = SILENCE_TRIM;
  123. else
  124. s->mode = SILENCE_COPY;
  125. return 0;
  126. }
  127. static double compute_rms(SilenceRemoveContext *s, double sample)
  128. {
  129. double new_sum;
  130. new_sum = s->rms_sum;
  131. new_sum -= *s->window_current;
  132. new_sum += sample * sample;
  133. return sqrt(new_sum / s->window_size);
  134. }
  135. static void update_rms(SilenceRemoveContext *s, double sample)
  136. {
  137. s->rms_sum -= *s->window_current;
  138. *s->window_current = sample * sample;
  139. s->rms_sum += *s->window_current;
  140. s->window_current++;
  141. if (s->window_current >= s->window_end)
  142. s->window_current = s->window;
  143. }
  144. static void flush(AVFrame *out, AVFilterLink *outlink,
  145. int *nb_samples_written, int *ret)
  146. {
  147. if (*nb_samples_written) {
  148. out->nb_samples = *nb_samples_written / outlink->channels;
  149. *ret = ff_filter_frame(outlink, out);
  150. *nb_samples_written = 0;
  151. } else {
  152. av_frame_free(&out);
  153. }
  154. }
  155. static int filter_frame(AVFilterLink *inlink, AVFrame *in)
  156. {
  157. AVFilterContext *ctx = inlink->dst;
  158. AVFilterLink *outlink = ctx->outputs[0];
  159. SilenceRemoveContext *s = ctx->priv;
  160. int i, j, threshold, ret = 0;
  161. int nbs, nb_samples_read, nb_samples_written;
  162. double *obuf, *ibuf = (double *)in->data[0];
  163. AVFrame *out;
  164. nb_samples_read = nb_samples_written = 0;
  165. switch (s->mode) {
  166. case SILENCE_TRIM:
  167. silence_trim:
  168. nbs = in->nb_samples - nb_samples_read / inlink->channels;
  169. if (!nbs)
  170. break;
  171. for (i = 0; i < nbs; i++) {
  172. threshold = 0;
  173. for (j = 0; j < inlink->channels; j++) {
  174. threshold |= compute_rms(s, ibuf[j]) > s->start_threshold;
  175. }
  176. if (threshold) {
  177. for (j = 0; j < inlink->channels; j++) {
  178. update_rms(s, *ibuf);
  179. s->start_holdoff[s->start_holdoff_end++] = *ibuf++;
  180. nb_samples_read++;
  181. }
  182. if (s->start_holdoff_end >= s->start_duration * inlink->channels) {
  183. if (++s->start_found_periods >= s->start_periods) {
  184. s->mode = SILENCE_TRIM_FLUSH;
  185. goto silence_trim_flush;
  186. }
  187. s->start_holdoff_offset = 0;
  188. s->start_holdoff_end = 0;
  189. }
  190. } else {
  191. s->start_holdoff_end = 0;
  192. for (j = 0; j < inlink->channels; j++)
  193. update_rms(s, ibuf[j]);
  194. ibuf += inlink->channels;
  195. nb_samples_read += inlink->channels;
  196. }
  197. }
  198. break;
  199. case SILENCE_TRIM_FLUSH:
  200. silence_trim_flush:
  201. nbs = s->start_holdoff_end - s->start_holdoff_offset;
  202. nbs -= nbs % inlink->channels;
  203. if (!nbs)
  204. break;
  205. out = ff_get_audio_buffer(inlink, nbs / inlink->channels);
  206. if (!out) {
  207. av_frame_free(&in);
  208. return AVERROR(ENOMEM);
  209. }
  210. memcpy(out->data[0], &s->start_holdoff[s->start_holdoff_offset],
  211. nbs * sizeof(double));
  212. s->start_holdoff_offset += nbs;
  213. ret = ff_filter_frame(outlink, out);
  214. if (s->start_holdoff_offset == s->start_holdoff_end) {
  215. s->start_holdoff_offset = 0;
  216. s->start_holdoff_end = 0;
  217. s->mode = SILENCE_COPY;
  218. goto silence_copy;
  219. }
  220. break;
  221. case SILENCE_COPY:
  222. silence_copy:
  223. nbs = in->nb_samples - nb_samples_read / inlink->channels;
  224. if (!nbs)
  225. break;
  226. out = ff_get_audio_buffer(inlink, nbs);
  227. if (!out) {
  228. av_frame_free(&in);
  229. return AVERROR(ENOMEM);
  230. }
  231. obuf = (double *)out->data[0];
  232. if (s->stop_periods) {
  233. for (i = 0; i < nbs; i++) {
  234. threshold = 1;
  235. for (j = 0; j < inlink->channels; j++)
  236. threshold &= compute_rms(s, ibuf[j]) > s->stop_threshold;
  237. if (threshold && s->stop_holdoff_end && !s->leave_silence) {
  238. s->mode = SILENCE_COPY_FLUSH;
  239. flush(out, outlink, &nb_samples_written, &ret);
  240. goto silence_copy_flush;
  241. } else if (threshold) {
  242. for (j = 0; j < inlink->channels; j++) {
  243. update_rms(s, *ibuf);
  244. *obuf++ = *ibuf++;
  245. nb_samples_read++;
  246. nb_samples_written++;
  247. }
  248. } else if (!threshold) {
  249. for (j = 0; j < inlink->channels; j++) {
  250. update_rms(s, *ibuf);
  251. if (s->leave_silence) {
  252. *obuf++ = *ibuf;
  253. nb_samples_written++;
  254. }
  255. s->stop_holdoff[s->stop_holdoff_end++] = *ibuf++;
  256. nb_samples_read++;
  257. }
  258. if (s->stop_holdoff_end >= s->stop_duration * inlink->channels) {
  259. if (++s->stop_found_periods >= s->stop_periods) {
  260. s->stop_holdoff_offset = 0;
  261. s->stop_holdoff_end = 0;
  262. if (!s->restart) {
  263. s->mode = SILENCE_STOP;
  264. flush(out, outlink, &nb_samples_written, &ret);
  265. goto silence_stop;
  266. } else {
  267. s->stop_found_periods = 0;
  268. s->start_found_periods = 0;
  269. s->start_holdoff_offset = 0;
  270. s->start_holdoff_end = 0;
  271. clear_rms(s);
  272. s->mode = SILENCE_TRIM;
  273. flush(out, outlink, &nb_samples_written, &ret);
  274. goto silence_trim;
  275. }
  276. }
  277. s->mode = SILENCE_COPY_FLUSH;
  278. flush(out, outlink, &nb_samples_written, &ret);
  279. goto silence_copy_flush;
  280. }
  281. }
  282. }
  283. flush(out, outlink, &nb_samples_written, &ret);
  284. } else {
  285. memcpy(obuf, ibuf, sizeof(double) * nbs * inlink->channels);
  286. ret = ff_filter_frame(outlink, out);
  287. }
  288. break;
  289. case SILENCE_COPY_FLUSH:
  290. silence_copy_flush:
  291. nbs = s->stop_holdoff_end - s->stop_holdoff_offset;
  292. nbs -= nbs % inlink->channels;
  293. if (!nbs)
  294. break;
  295. out = ff_get_audio_buffer(inlink, nbs / inlink->channels);
  296. if (!out) {
  297. av_frame_free(&in);
  298. return AVERROR(ENOMEM);
  299. }
  300. memcpy(out->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
  301. nbs * sizeof(double));
  302. s->stop_holdoff_offset += nbs;
  303. ret = ff_filter_frame(outlink, out);
  304. if (s->stop_holdoff_offset == s->stop_holdoff_end) {
  305. s->stop_holdoff_offset = 0;
  306. s->stop_holdoff_end = 0;
  307. s->mode = SILENCE_COPY;
  308. goto silence_copy;
  309. }
  310. break;
  311. case SILENCE_STOP:
  312. silence_stop:
  313. break;
  314. }
  315. av_frame_free(&in);
  316. return ret;
  317. }
  318. static int request_frame(AVFilterLink *outlink)
  319. {
  320. AVFilterContext *ctx = outlink->src;
  321. SilenceRemoveContext *s = ctx->priv;
  322. int ret;
  323. ret = ff_request_frame(ctx->inputs[0]);
  324. if (ret == AVERROR_EOF && (s->mode == SILENCE_COPY_FLUSH ||
  325. s->mode == SILENCE_COPY)) {
  326. int nbs = s->stop_holdoff_end - s->stop_holdoff_offset;
  327. if (nbs) {
  328. AVFrame *frame;
  329. frame = ff_get_audio_buffer(outlink, nbs / outlink->channels);
  330. if (!frame)
  331. return AVERROR(ENOMEM);
  332. memcpy(frame->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
  333. nbs * sizeof(double));
  334. ret = ff_filter_frame(ctx->inputs[0], frame);
  335. }
  336. s->mode = SILENCE_STOP;
  337. }
  338. return ret;
  339. }
  340. static int query_formats(AVFilterContext *ctx)
  341. {
  342. AVFilterFormats *formats = NULL;
  343. AVFilterChannelLayouts *layouts = NULL;
  344. static const enum AVSampleFormat sample_fmts[] = {
  345. AV_SAMPLE_FMT_DBL, AV_SAMPLE_FMT_NONE
  346. };
  347. int ret;
  348. layouts = ff_all_channel_counts();
  349. if (!layouts)
  350. return AVERROR(ENOMEM);
  351. ret = ff_set_common_channel_layouts(ctx, layouts);
  352. if (ret < 0)
  353. return ret;
  354. formats = ff_make_format_list(sample_fmts);
  355. if (!formats)
  356. return AVERROR(ENOMEM);
  357. ret = ff_set_common_formats(ctx, formats);
  358. if (ret < 0)
  359. return ret;
  360. formats = ff_all_samplerates();
  361. if (!formats)
  362. return AVERROR(ENOMEM);
  363. return ff_set_common_samplerates(ctx, formats);
  364. }
  365. static av_cold void uninit(AVFilterContext *ctx)
  366. {
  367. SilenceRemoveContext *s = ctx->priv;
  368. av_freep(&s->start_holdoff);
  369. av_freep(&s->stop_holdoff);
  370. av_freep(&s->window);
  371. }
  372. static const AVFilterPad silenceremove_inputs[] = {
  373. {
  374. .name = "default",
  375. .type = AVMEDIA_TYPE_AUDIO,
  376. .config_props = config_input,
  377. .filter_frame = filter_frame,
  378. },
  379. { NULL }
  380. };
  381. static const AVFilterPad silenceremove_outputs[] = {
  382. {
  383. .name = "default",
  384. .type = AVMEDIA_TYPE_AUDIO,
  385. .request_frame = request_frame,
  386. },
  387. { NULL }
  388. };
  389. AVFilter ff_af_silenceremove = {
  390. .name = "silenceremove",
  391. .description = NULL_IF_CONFIG_SMALL("Remove silence."),
  392. .priv_size = sizeof(SilenceRemoveContext),
  393. .priv_class = &silenceremove_class,
  394. .init = init,
  395. .uninit = uninit,
  396. .query_formats = query_formats,
  397. .inputs = silenceremove_inputs,
  398. .outputs = silenceremove_outputs,
  399. };