You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

515 lines
16KB

  1. /*
  2. * Copyright (c) 2001 Heikki Leinonen
  3. * Copyright (c) 2001 Chris Bagwell
  4. * Copyright (c) 2003 Donnie Smith
  5. * Copyright (c) 2014 Paul B Mahol
  6. *
  7. * This file is part of FFmpeg.
  8. *
  9. * FFmpeg is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * FFmpeg is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with FFmpeg; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #include <float.h> /* DBL_MAX */
  24. #include "libavutil/opt.h"
  25. #include "libavutil/timestamp.h"
  26. #include "audio.h"
  27. #include "formats.h"
  28. #include "avfilter.h"
  29. #include "internal.h"
  /**
   * Processing state of the filter. filter_frame() dispatches on this value
   * for every input frame and may move through several states while handling
   * a single frame.
   */
  enum SilenceMode {
      SILENCE_TRIM       = 0, /* discard input until enough non-silence was seen */
      SILENCE_TRIM_FLUSH = 1, /* output the contents of the start hold-off buffer */
      SILENCE_COPY       = 2, /* pass audio through (watching for silence if stop_periods is set) */
      SILENCE_COPY_FLUSH = 3, /* output the contents of the stop hold-off buffer */
      SILENCE_STOP       = 4  /* drop all further input */
  };
  37. typedef struct SilenceRemoveContext {
  38. const AVClass *class;
  39. enum SilenceMode mode;
  40. int start_periods;
  41. int64_t start_duration;
  42. double start_threshold;
  43. int stop_periods;
  44. int64_t stop_duration;
  45. double stop_threshold;
  46. double *start_holdoff;
  47. size_t start_holdoff_offset;
  48. size_t start_holdoff_end;
  49. int start_found_periods;
  50. double *stop_holdoff;
  51. size_t stop_holdoff_offset;
  52. size_t stop_holdoff_end;
  53. int stop_found_periods;
  54. double *window;
  55. double *window_current;
  56. double *window_end;
  57. int window_size;
  58. double sum;
  59. int leave_silence;
  60. int restart;
  61. int64_t next_pts;
  62. int detection;
  63. void (*update)(struct SilenceRemoveContext *s, double sample);
  64. double(*compute)(struct SilenceRemoveContext *s, double sample);
  65. } SilenceRemoveContext;
  66. #define OFFSET(x) offsetof(SilenceRemoveContext, x)
  67. #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_AUDIO_PARAM
  68. static const AVOption silenceremove_options[] = {
  69. { "start_periods", NULL, OFFSET(start_periods), AV_OPT_TYPE_INT, {.i64=0}, 0, 9000, FLAGS },
  70. { "start_duration", NULL, OFFSET(start_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS },
  71. { "start_threshold", NULL, OFFSET(start_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS },
  72. { "stop_periods", NULL, OFFSET(stop_periods), AV_OPT_TYPE_INT, {.i64=0}, -9000, 9000, FLAGS },
  73. { "stop_duration", NULL, OFFSET(stop_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS },
  74. { "stop_threshold", NULL, OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS },
  75. { "leave_silence", NULL, OFFSET(leave_silence), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
  76. { "detection", NULL, OFFSET(detection), AV_OPT_TYPE_INT, {.i64=1}, 0, 1, FLAGS, "detection" },
  77. { "peak", 0, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "detection" },
  78. { "rms", 0, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "detection" },
  79. { NULL }
  80. };
  81. AVFILTER_DEFINE_CLASS(silenceremove);
  82. static double compute_peak(SilenceRemoveContext *s, double sample)
  83. {
  84. double new_sum;
  85. new_sum = s->sum;
  86. new_sum -= *s->window_current;
  87. new_sum += fabs(sample);
  88. return new_sum / s->window_size;
  89. }
  90. static void update_peak(SilenceRemoveContext *s, double sample)
  91. {
  92. s->sum -= *s->window_current;
  93. *s->window_current = fabs(sample);
  94. s->sum += *s->window_current;
  95. s->window_current++;
  96. if (s->window_current >= s->window_end)
  97. s->window_current = s->window;
  98. }
  99. static double compute_rms(SilenceRemoveContext *s, double sample)
  100. {
  101. double new_sum;
  102. new_sum = s->sum;
  103. new_sum -= *s->window_current;
  104. new_sum += sample * sample;
  105. return sqrt(new_sum / s->window_size);
  106. }
  107. static void update_rms(SilenceRemoveContext *s, double sample)
  108. {
  109. s->sum -= *s->window_current;
  110. *s->window_current = sample * sample;
  111. s->sum += *s->window_current;
  112. s->window_current++;
  113. if (s->window_current >= s->window_end)
  114. s->window_current = s->window;
  115. }
  116. static av_cold int init(AVFilterContext *ctx)
  117. {
  118. SilenceRemoveContext *s = ctx->priv;
  119. if (s->stop_periods < 0) {
  120. s->stop_periods = -s->stop_periods;
  121. s->restart = 1;
  122. }
  123. switch (s->detection) {
  124. case 0:
  125. s->update = update_peak;
  126. s->compute = compute_peak;
  127. break;
  128. case 1:
  129. s->update = update_rms;
  130. s->compute = compute_rms;
  131. break;
  132. };
  133. return 0;
  134. }
  135. static void clear_window(SilenceRemoveContext *s)
  136. {
  137. memset(s->window, 0, s->window_size * sizeof(*s->window));
  138. s->window_current = s->window;
  139. s->window_end = s->window + s->window_size;
  140. s->sum = 0;
  141. }
  142. static int config_input(AVFilterLink *inlink)
  143. {
  144. AVFilterContext *ctx = inlink->dst;
  145. SilenceRemoveContext *s = ctx->priv;
  146. s->window_size = (inlink->sample_rate / 50) * inlink->channels;
  147. s->window = av_malloc_array(s->window_size, sizeof(*s->window));
  148. if (!s->window)
  149. return AVERROR(ENOMEM);
  150. clear_window(s);
  151. s->start_duration = av_rescale(s->start_duration, inlink->sample_rate,
  152. AV_TIME_BASE);
  153. s->stop_duration = av_rescale(s->stop_duration, inlink->sample_rate,
  154. AV_TIME_BASE);
  155. s->start_holdoff = av_malloc_array(FFMAX(s->start_duration, 1),
  156. sizeof(*s->start_holdoff) *
  157. inlink->channels);
  158. if (!s->start_holdoff)
  159. return AVERROR(ENOMEM);
  160. s->start_holdoff_offset = 0;
  161. s->start_holdoff_end = 0;
  162. s->start_found_periods = 0;
  163. s->stop_holdoff = av_malloc_array(FFMAX(s->stop_duration, 1),
  164. sizeof(*s->stop_holdoff) *
  165. inlink->channels);
  166. if (!s->stop_holdoff)
  167. return AVERROR(ENOMEM);
  168. s->stop_holdoff_offset = 0;
  169. s->stop_holdoff_end = 0;
  170. s->stop_found_periods = 0;
  171. if (s->start_periods)
  172. s->mode = SILENCE_TRIM;
  173. else
  174. s->mode = SILENCE_COPY;
  175. return 0;
  176. }
  177. static void flush(AVFrame *out, AVFilterLink *outlink,
  178. int *nb_samples_written, int *ret)
  179. {
  180. if (*nb_samples_written) {
  181. out->nb_samples = *nb_samples_written / outlink->channels;
  182. *ret = ff_filter_frame(outlink, out);
  183. *nb_samples_written = 0;
  184. } else {
  185. av_frame_free(&out);
  186. }
  187. }
  188. static int filter_frame(AVFilterLink *inlink, AVFrame *in)
  189. {
  190. AVFilterContext *ctx = inlink->dst;
  191. AVFilterLink *outlink = ctx->outputs[0];
  192. SilenceRemoveContext *s = ctx->priv;
  193. int i, j, threshold, ret = 0;
  194. int nbs, nb_samples_read, nb_samples_written;
  195. double *obuf, *ibuf = (double *)in->data[0];
  196. AVFrame *out;
  197. nb_samples_read = nb_samples_written = 0;
  198. switch (s->mode) {
  199. case SILENCE_TRIM:
  200. silence_trim:
  201. nbs = in->nb_samples - nb_samples_read / inlink->channels;
  202. if (!nbs)
  203. break;
  204. for (i = 0; i < nbs; i++) {
  205. threshold = 0;
  206. for (j = 0; j < inlink->channels; j++) {
  207. threshold |= s->compute(s, ibuf[j]) > s->start_threshold;
  208. }
  209. if (threshold) {
  210. for (j = 0; j < inlink->channels; j++) {
  211. s->update(s, *ibuf);
  212. s->start_holdoff[s->start_holdoff_end++] = *ibuf++;
  213. }
  214. nb_samples_read += inlink->channels;
  215. if (s->start_holdoff_end >= s->start_duration * inlink->channels) {
  216. if (++s->start_found_periods >= s->start_periods) {
  217. s->mode = SILENCE_TRIM_FLUSH;
  218. goto silence_trim_flush;
  219. }
  220. s->start_holdoff_offset = 0;
  221. s->start_holdoff_end = 0;
  222. }
  223. } else {
  224. s->start_holdoff_end = 0;
  225. for (j = 0; j < inlink->channels; j++)
  226. s->update(s, ibuf[j]);
  227. ibuf += inlink->channels;
  228. nb_samples_read += inlink->channels;
  229. }
  230. }
  231. break;
  232. case SILENCE_TRIM_FLUSH:
  233. silence_trim_flush:
  234. nbs = s->start_holdoff_end - s->start_holdoff_offset;
  235. nbs -= nbs % inlink->channels;
  236. if (!nbs)
  237. break;
  238. out = ff_get_audio_buffer(inlink, nbs / inlink->channels);
  239. if (!out) {
  240. av_frame_free(&in);
  241. return AVERROR(ENOMEM);
  242. }
  243. memcpy(out->data[0], &s->start_holdoff[s->start_holdoff_offset],
  244. nbs * sizeof(double));
  245. s->start_holdoff_offset += nbs;
  246. ret = ff_filter_frame(outlink, out);
  247. if (s->start_holdoff_offset == s->start_holdoff_end) {
  248. s->start_holdoff_offset = 0;
  249. s->start_holdoff_end = 0;
  250. s->mode = SILENCE_COPY;
  251. goto silence_copy;
  252. }
  253. break;
  254. case SILENCE_COPY:
  255. silence_copy:
  256. nbs = in->nb_samples - nb_samples_read / inlink->channels;
  257. if (!nbs)
  258. break;
  259. out = ff_get_audio_buffer(inlink, nbs);
  260. if (!out) {
  261. av_frame_free(&in);
  262. return AVERROR(ENOMEM);
  263. }
  264. obuf = (double *)out->data[0];
  265. if (s->stop_periods) {
  266. for (i = 0; i < nbs; i++) {
  267. threshold = 1;
  268. for (j = 0; j < inlink->channels; j++)
  269. threshold &= s->compute(s, ibuf[j]) > s->stop_threshold;
  270. if (threshold && s->stop_holdoff_end && !s->leave_silence) {
  271. s->mode = SILENCE_COPY_FLUSH;
  272. flush(out, outlink, &nb_samples_written, &ret);
  273. goto silence_copy_flush;
  274. } else if (threshold) {
  275. for (j = 0; j < inlink->channels; j++) {
  276. s->update(s, *ibuf);
  277. *obuf++ = *ibuf++;
  278. }
  279. nb_samples_read += inlink->channels;
  280. nb_samples_written += inlink->channels;
  281. } else if (!threshold) {
  282. for (j = 0; j < inlink->channels; j++) {
  283. s->update(s, *ibuf);
  284. if (s->leave_silence) {
  285. *obuf++ = *ibuf;
  286. nb_samples_written++;
  287. }
  288. s->stop_holdoff[s->stop_holdoff_end++] = *ibuf++;
  289. }
  290. nb_samples_read += inlink->channels;
  291. if (s->stop_holdoff_end >= s->stop_duration * inlink->channels) {
  292. if (++s->stop_found_periods >= s->stop_periods) {
  293. s->stop_holdoff_offset = 0;
  294. s->stop_holdoff_end = 0;
  295. if (!s->restart) {
  296. s->mode = SILENCE_STOP;
  297. flush(out, outlink, &nb_samples_written, &ret);
  298. goto silence_stop;
  299. } else {
  300. s->stop_found_periods = 0;
  301. s->start_found_periods = 0;
  302. s->start_holdoff_offset = 0;
  303. s->start_holdoff_end = 0;
  304. clear_window(s);
  305. s->mode = SILENCE_TRIM;
  306. flush(out, outlink, &nb_samples_written, &ret);
  307. goto silence_trim;
  308. }
  309. }
  310. s->mode = SILENCE_COPY_FLUSH;
  311. flush(out, outlink, &nb_samples_written, &ret);
  312. goto silence_copy_flush;
  313. }
  314. }
  315. }
  316. flush(out, outlink, &nb_samples_written, &ret);
  317. } else {
  318. memcpy(obuf, ibuf, sizeof(double) * nbs * inlink->channels);
  319. ret = ff_filter_frame(outlink, out);
  320. }
  321. break;
  322. case SILENCE_COPY_FLUSH:
  323. silence_copy_flush:
  324. nbs = s->stop_holdoff_end - s->stop_holdoff_offset;
  325. nbs -= nbs % inlink->channels;
  326. if (!nbs)
  327. break;
  328. out = ff_get_audio_buffer(inlink, nbs / inlink->channels);
  329. if (!out) {
  330. av_frame_free(&in);
  331. return AVERROR(ENOMEM);
  332. }
  333. memcpy(out->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
  334. nbs * sizeof(double));
  335. s->stop_holdoff_offset += nbs;
  336. ret = ff_filter_frame(outlink, out);
  337. if (s->stop_holdoff_offset == s->stop_holdoff_end) {
  338. s->stop_holdoff_offset = 0;
  339. s->stop_holdoff_end = 0;
  340. s->mode = SILENCE_COPY;
  341. goto silence_copy;
  342. }
  343. break;
  344. case SILENCE_STOP:
  345. silence_stop:
  346. break;
  347. }
  348. av_frame_free(&in);
  349. return ret;
  350. }
  351. static int request_frame(AVFilterLink *outlink)
  352. {
  353. AVFilterContext *ctx = outlink->src;
  354. SilenceRemoveContext *s = ctx->priv;
  355. int ret;
  356. ret = ff_request_frame(ctx->inputs[0]);
  357. if (ret == AVERROR_EOF && (s->mode == SILENCE_COPY_FLUSH ||
  358. s->mode == SILENCE_COPY)) {
  359. int nbs = s->stop_holdoff_end - s->stop_holdoff_offset;
  360. if (nbs) {
  361. AVFrame *frame;
  362. frame = ff_get_audio_buffer(outlink, nbs / outlink->channels);
  363. if (!frame)
  364. return AVERROR(ENOMEM);
  365. memcpy(frame->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
  366. nbs * sizeof(double));
  367. ret = ff_filter_frame(ctx->inputs[0], frame);
  368. }
  369. s->mode = SILENCE_STOP;
  370. }
  371. return ret;
  372. }
  373. static int query_formats(AVFilterContext *ctx)
  374. {
  375. AVFilterFormats *formats = NULL;
  376. AVFilterChannelLayouts *layouts = NULL;
  377. static const enum AVSampleFormat sample_fmts[] = {
  378. AV_SAMPLE_FMT_DBL, AV_SAMPLE_FMT_NONE
  379. };
  380. int ret;
  381. layouts = ff_all_channel_counts();
  382. if (!layouts)
  383. return AVERROR(ENOMEM);
  384. ret = ff_set_common_channel_layouts(ctx, layouts);
  385. if (ret < 0)
  386. return ret;
  387. formats = ff_make_format_list(sample_fmts);
  388. if (!formats)
  389. return AVERROR(ENOMEM);
  390. ret = ff_set_common_formats(ctx, formats);
  391. if (ret < 0)
  392. return ret;
  393. formats = ff_all_samplerates();
  394. if (!formats)
  395. return AVERROR(ENOMEM);
  396. return ff_set_common_samplerates(ctx, formats);
  397. }
  398. static av_cold void uninit(AVFilterContext *ctx)
  399. {
  400. SilenceRemoveContext *s = ctx->priv;
  401. av_freep(&s->start_holdoff);
  402. av_freep(&s->stop_holdoff);
  403. av_freep(&s->window);
  404. }
  405. static const AVFilterPad silenceremove_inputs[] = {
  406. {
  407. .name = "default",
  408. .type = AVMEDIA_TYPE_AUDIO,
  409. .config_props = config_input,
  410. .filter_frame = filter_frame,
  411. },
  412. { NULL }
  413. };
  414. static const AVFilterPad silenceremove_outputs[] = {
  415. {
  416. .name = "default",
  417. .type = AVMEDIA_TYPE_AUDIO,
  418. .request_frame = request_frame,
  419. },
  420. { NULL }
  421. };
  422. AVFilter ff_af_silenceremove = {
  423. .name = "silenceremove",
  424. .description = NULL_IF_CONFIG_SMALL("Remove silence."),
  425. .priv_size = sizeof(SilenceRemoveContext),
  426. .priv_class = &silenceremove_class,
  427. .init = init,
  428. .uninit = uninit,
  429. .query_formats = query_formats,
  430. .inputs = silenceremove_inputs,
  431. .outputs = silenceremove_outputs,
  432. };