You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

556 lines
17KB

  1. /*
  2. * Copyright (c) 2001 Heikki Leinonen
  3. * Copyright (c) 2001 Chris Bagwell
  4. * Copyright (c) 2003 Donnie Smith
  5. * Copyright (c) 2014 Paul B Mahol
  6. *
  7. * This file is part of FFmpeg.
  8. *
  9. * FFmpeg is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * FFmpeg is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with FFmpeg; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #include <float.h> /* DBL_MAX */
  24. #include "libavutil/opt.h"
  25. #include "libavutil/timestamp.h"
  26. #include "audio.h"
  27. #include "formats.h"
  28. #include "avfilter.h"
  29. #include "internal.h"
  30. enum SilenceMode {
  31. SILENCE_TRIM,
  32. SILENCE_TRIM_FLUSH,
  33. SILENCE_COPY,
  34. SILENCE_COPY_FLUSH,
  35. SILENCE_STOP
  36. };
  37. typedef struct SilenceRemoveContext {
  38. const AVClass *class;
  39. enum SilenceMode mode;
  40. int start_periods;
  41. int64_t start_duration;
  42. double start_threshold;
  43. int stop_periods;
  44. int64_t stop_duration;
  45. double stop_threshold;
  46. double *start_holdoff;
  47. size_t start_holdoff_offset;
  48. size_t start_holdoff_end;
  49. int start_found_periods;
  50. double *stop_holdoff;
  51. size_t stop_holdoff_offset;
  52. size_t stop_holdoff_end;
  53. int stop_found_periods;
  54. double window_ratio;
  55. double *window;
  56. double *window_current;
  57. double *window_end;
  58. int window_size;
  59. double sum;
  60. int leave_silence;
  61. int restart;
  62. int64_t next_pts;
  63. int detection;
  64. void (*update)(struct SilenceRemoveContext *s, double sample);
  65. double(*compute)(struct SilenceRemoveContext *s, double sample);
  66. } SilenceRemoveContext;
  67. #define OFFSET(x) offsetof(SilenceRemoveContext, x)
  68. #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_AUDIO_PARAM
  69. static const AVOption silenceremove_options[] = {
  70. { "start_periods", NULL, OFFSET(start_periods), AV_OPT_TYPE_INT, {.i64=0}, 0, 9000, FLAGS },
  71. { "start_duration", NULL, OFFSET(start_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS },
  72. { "start_threshold", NULL, OFFSET(start_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS },
  73. { "stop_periods", NULL, OFFSET(stop_periods), AV_OPT_TYPE_INT, {.i64=0}, -9000, 9000, FLAGS },
  74. { "stop_duration", NULL, OFFSET(stop_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS },
  75. { "stop_threshold", NULL, OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS },
  76. { "leave_silence", NULL, OFFSET(leave_silence), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
  77. { "detection", NULL, OFFSET(detection), AV_OPT_TYPE_INT, {.i64=1}, 0, 1, FLAGS, "detection" },
  78. { "peak", 0, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "detection" },
  79. { "rms", 0, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "detection" },
  80. { "window", NULL, OFFSET(window_ratio), AV_OPT_TYPE_DOUBLE, {.dbl=0.02}, 0, 10, FLAGS },
  81. { NULL }
  82. };
  83. AVFILTER_DEFINE_CLASS(silenceremove);
  84. static double compute_peak(SilenceRemoveContext *s, double sample)
  85. {
  86. double new_sum;
  87. new_sum = s->sum;
  88. new_sum -= *s->window_current;
  89. new_sum += fabs(sample);
  90. return new_sum / s->window_size;
  91. }
  92. static void update_peak(SilenceRemoveContext *s, double sample)
  93. {
  94. s->sum -= *s->window_current;
  95. *s->window_current = fabs(sample);
  96. s->sum += *s->window_current;
  97. s->window_current++;
  98. if (s->window_current >= s->window_end)
  99. s->window_current = s->window;
  100. }
  101. static double compute_rms(SilenceRemoveContext *s, double sample)
  102. {
  103. double new_sum;
  104. new_sum = s->sum;
  105. new_sum -= *s->window_current;
  106. new_sum += sample * sample;
  107. return sqrt(new_sum / s->window_size);
  108. }
  109. static void update_rms(SilenceRemoveContext *s, double sample)
  110. {
  111. s->sum -= *s->window_current;
  112. *s->window_current = sample * sample;
  113. s->sum += *s->window_current;
  114. s->window_current++;
  115. if (s->window_current >= s->window_end)
  116. s->window_current = s->window;
  117. }
  118. static av_cold int init(AVFilterContext *ctx)
  119. {
  120. SilenceRemoveContext *s = ctx->priv;
  121. if (s->stop_periods < 0) {
  122. s->stop_periods = -s->stop_periods;
  123. s->restart = 1;
  124. }
  125. switch (s->detection) {
  126. case 0:
  127. s->update = update_peak;
  128. s->compute = compute_peak;
  129. break;
  130. case 1:
  131. s->update = update_rms;
  132. s->compute = compute_rms;
  133. break;
  134. };
  135. return 0;
  136. }
  137. static void clear_window(SilenceRemoveContext *s)
  138. {
  139. memset(s->window, 0, s->window_size * sizeof(*s->window));
  140. s->window_current = s->window;
  141. s->window_end = s->window + s->window_size;
  142. s->sum = 0;
  143. }
  144. static int config_input(AVFilterLink *inlink)
  145. {
  146. AVFilterContext *ctx = inlink->dst;
  147. SilenceRemoveContext *s = ctx->priv;
  148. s->window_size = FFMAX((inlink->sample_rate * s->window_ratio), 1) * inlink->channels;
  149. s->window = av_malloc_array(s->window_size, sizeof(*s->window));
  150. if (!s->window)
  151. return AVERROR(ENOMEM);
  152. clear_window(s);
  153. s->start_duration = av_rescale(s->start_duration, inlink->sample_rate,
  154. AV_TIME_BASE);
  155. if (s->start_duration < 0) {
  156. av_log(ctx, AV_LOG_WARNING, "start duration must be non-negative\n");
  157. s->start_duration = -s->start_duration;
  158. }
  159. s->stop_duration = av_rescale(s->stop_duration, inlink->sample_rate,
  160. AV_TIME_BASE);
  161. if (s->stop_duration < 0) {
  162. av_log(ctx, AV_LOG_WARNING, "stop duration must be non-negative\n");
  163. s->stop_duration = -s->stop_duration;
  164. }
  165. s->start_holdoff = av_malloc_array(FFMAX(s->start_duration, 1),
  166. sizeof(*s->start_holdoff) *
  167. inlink->channels);
  168. if (!s->start_holdoff)
  169. return AVERROR(ENOMEM);
  170. s->start_holdoff_offset = 0;
  171. s->start_holdoff_end = 0;
  172. s->start_found_periods = 0;
  173. s->stop_holdoff = av_malloc_array(FFMAX(s->stop_duration, 1),
  174. sizeof(*s->stop_holdoff) *
  175. inlink->channels);
  176. if (!s->stop_holdoff)
  177. return AVERROR(ENOMEM);
  178. s->stop_holdoff_offset = 0;
  179. s->stop_holdoff_end = 0;
  180. s->stop_found_periods = 0;
  181. if (s->start_periods)
  182. s->mode = SILENCE_TRIM;
  183. else
  184. s->mode = SILENCE_COPY;
  185. return 0;
  186. }
  187. static void flush(SilenceRemoveContext *s,
  188. AVFrame *out, AVFilterLink *outlink,
  189. int *nb_samples_written, int *ret)
  190. {
  191. if (*nb_samples_written) {
  192. out->nb_samples = *nb_samples_written / outlink->channels;
  193. out->pts = s->next_pts;
  194. s->next_pts += av_rescale_q(out->nb_samples,
  195. (AVRational){1, outlink->sample_rate},
  196. outlink->time_base);
  197. *ret = ff_filter_frame(outlink, out);
  198. *nb_samples_written = 0;
  199. } else {
  200. av_frame_free(&out);
  201. }
  202. }
/**
 * Input-pad filter_frame callback: push one input frame through the
 * silence-removal state machine.
 *
 * The function resumes in s->mode and may move through several states while
 * consuming a single frame; the labels inside the switch let a state jump
 * straight into the next one. Samples are read from in->data[0] as
 * interleaved doubles (query_formats() only offers AV_SAMPLE_FMT_DBL).
 *
 * nb_samples_read and nb_samples_written count individual samples over all
 * channels, not per-channel sample frames.
 *
 * @param inlink input link
 * @param in     input frame; always freed before returning
 * @return result of the last ff_filter_frame() call performed (0 if none),
 *         or AVERROR(ENOMEM) if an output frame could not be allocated
 *
 * NOTE(review): the value stored in ret by flush() is not checked before the
 * following goto, so a later ff_filter_frame() call can overwrite it.
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    AVFilterLink *outlink = ctx->outputs[0];
    SilenceRemoveContext *s = ctx->priv;
    int i, j, threshold, ret = 0;
    int nbs, nb_samples_read, nb_samples_written;
    double *obuf, *ibuf = (double *)in->data[0];
    AVFrame *out;

    nb_samples_read = nb_samples_written = 0;

    switch (s->mode) {
    case SILENCE_TRIM:
silence_trim:
        /* Per-channel sample frames of the input not consumed yet. */
        nbs = in->nb_samples - nb_samples_read / inlink->channels;
        if (!nbs)
            break;

        for (i = 0; i < nbs; i++) {
            /* Set if any channel's detector value exceeds start_threshold. */
            threshold = 0;
            for (j = 0; j < inlink->channels; j++) {
                threshold |= s->compute(s, ibuf[j]) > s->start_threshold;
            }

            if (threshold) {
                /* Above threshold: feed the detector and buffer the samples. */
                for (j = 0; j < inlink->channels; j++) {
                    s->update(s, *ibuf);
                    s->start_holdoff[s->start_holdoff_end++] = *ibuf++;
                }
                nb_samples_read += inlink->channels;

                /* Hold-off full: count a period; emit it once enough were found. */
                if (s->start_holdoff_end >= s->start_duration * inlink->channels) {
                    if (++s->start_found_periods >= s->start_periods) {
                        s->mode = SILENCE_TRIM_FLUSH;
                        goto silence_trim_flush;
                    }

                    s->start_holdoff_offset = 0;
                    s->start_holdoff_end = 0;
                }
            } else {
                /* Not above threshold: discard what was buffered so far and
                 * skip these samples, still feeding the detector. */
                s->start_holdoff_end = 0;

                for (j = 0; j < inlink->channels; j++)
                    s->update(s, ibuf[j]);

                ibuf += inlink->channels;
                nb_samples_read += inlink->channels;
            }
        }
        break;
    case SILENCE_TRIM_FLUSH:
silence_trim_flush:
        /* Emit the buffered start hold-off, rounded down to whole frames. */
        nbs = s->start_holdoff_end - s->start_holdoff_offset;
        nbs -= nbs % inlink->channels;
        if (!nbs)
            break;

        out = ff_get_audio_buffer(inlink, nbs / inlink->channels);
        if (!out) {
            av_frame_free(&in);
            return AVERROR(ENOMEM);
        }

        memcpy(out->data[0], &s->start_holdoff[s->start_holdoff_offset],
               nbs * sizeof(double));
        out->pts = s->next_pts;
        s->next_pts += av_rescale_q(out->nb_samples,
                                    (AVRational){1, outlink->sample_rate},
                                    outlink->time_base);
        s->start_holdoff_offset += nbs;

        ret = ff_filter_frame(outlink, out);

        /* Everything emitted: reset the buffer and continue copying the
         * remainder of the input frame. */
        if (s->start_holdoff_offset == s->start_holdoff_end) {
            s->start_holdoff_offset = 0;
            s->start_holdoff_end = 0;
            s->mode = SILENCE_COPY;
            goto silence_copy;
        }
        break;
    case SILENCE_COPY:
silence_copy:
        /* Per-channel sample frames of the input not consumed yet. */
        nbs = in->nb_samples - nb_samples_read / inlink->channels;
        if (!nbs)
            break;

        out = ff_get_audio_buffer(inlink, nbs);
        if (!out) {
            av_frame_free(&in);
            return AVERROR(ENOMEM);
        }
        obuf = (double *)out->data[0];

        if (s->stop_periods) {
            for (i = 0; i < nbs; i++) {
                /* Stays set only if every channel exceeds stop_threshold. */
                threshold = 1;
                for (j = 0; j < inlink->channels; j++)
                    threshold &= s->compute(s, ibuf[j]) > s->stop_threshold;

                if (threshold && s->stop_holdoff_end && !s->leave_silence) {
                    /* Above threshold again while samples are held back:
                     * send what was copied so far, then emit the hold-off.
                     * The current sample is not consumed here. */
                    s->mode = SILENCE_COPY_FLUSH;
                    flush(s, out, outlink, &nb_samples_written, &ret);
                    goto silence_copy_flush;
                } else if (threshold) {
                    /* Above threshold: copy straight to the output. */
                    for (j = 0; j < inlink->channels; j++) {
                        s->update(s, *ibuf);
                        *obuf++ = *ibuf++;
                    }
                    nb_samples_read += inlink->channels;
                    nb_samples_written += inlink->channels;
                } else if (!threshold) {
                    /* Not above threshold: hold the samples back; with
                     * leave_silence they are also written to the output. */
                    for (j = 0; j < inlink->channels; j++) {
                        s->update(s, *ibuf);
                        if (s->leave_silence) {
                            *obuf++ = *ibuf;
                            nb_samples_written++;
                        }

                        s->stop_holdoff[s->stop_holdoff_end++] = *ibuf++;
                    }
                    nb_samples_read += inlink->channels;

                    /* Hold-off full: count a period. */
                    if (s->stop_holdoff_end >= s->stop_duration * inlink->channels) {
                        if (++s->stop_found_periods >= s->stop_periods) {
                            /* Enough periods: drop the held samples. */
                            s->stop_holdoff_offset = 0;
                            s->stop_holdoff_end = 0;

                            if (!s->restart) {
                                s->mode = SILENCE_STOP;
                                flush(s, out, outlink, &nb_samples_written, &ret);
                                goto silence_stop;
                            } else {
                                /* Restart: reset all detection state and go
                                 * back to trimming. */
                                s->stop_found_periods = 0;
                                s->start_found_periods = 0;
                                s->start_holdoff_offset = 0;
                                s->start_holdoff_end = 0;
                                clear_window(s);
                                s->mode = SILENCE_TRIM;
                                flush(s, out, outlink, &nb_samples_written, &ret);
                                goto silence_trim;
                            }
                        }
                        /* Not enough periods yet: emit the held samples. */
                        s->mode = SILENCE_COPY_FLUSH;
                        flush(s, out, outlink, &nb_samples_written, &ret);
                        goto silence_copy_flush;
                    }
                }
            }
            flush(s, out, outlink, &nb_samples_written, &ret);
        } else {
            /* No stop detection requested: pass the remainder through. */
            memcpy(obuf, ibuf, sizeof(double) * nbs * inlink->channels);
            out->pts = s->next_pts;
            s->next_pts += av_rescale_q(out->nb_samples,
                                        (AVRational){1, outlink->sample_rate},
                                        outlink->time_base);

            ret = ff_filter_frame(outlink, out);
        }
        break;
    case SILENCE_COPY_FLUSH:
silence_copy_flush:
        /* Emit the buffered stop hold-off, rounded down to whole frames. */
        nbs = s->stop_holdoff_end - s->stop_holdoff_offset;
        nbs -= nbs % inlink->channels;
        if (!nbs)
            break;

        out = ff_get_audio_buffer(inlink, nbs / inlink->channels);
        if (!out) {
            av_frame_free(&in);
            return AVERROR(ENOMEM);
        }

        memcpy(out->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
               nbs * sizeof(double));
        s->stop_holdoff_offset += nbs;

        out->pts = s->next_pts;
        s->next_pts += av_rescale_q(out->nb_samples,
                                    (AVRational){1, outlink->sample_rate},
                                    outlink->time_base);

        ret = ff_filter_frame(outlink, out);

        /* Everything emitted: reset the buffer and resume copying. */
        if (s->stop_holdoff_offset == s->stop_holdoff_end) {
            s->stop_holdoff_offset = 0;
            s->stop_holdoff_end = 0;
            s->mode = SILENCE_COPY;
            goto silence_copy;
        }
        break;
    case SILENCE_STOP:
silence_stop:
        /* Nothing is output in this state. */
        break;
    }

    av_frame_free(&in);

    return ret;
}
  378. static int request_frame(AVFilterLink *outlink)
  379. {
  380. AVFilterContext *ctx = outlink->src;
  381. SilenceRemoveContext *s = ctx->priv;
  382. int ret;
  383. ret = ff_request_frame(ctx->inputs[0]);
  384. if (ret == AVERROR_EOF && (s->mode == SILENCE_COPY_FLUSH ||
  385. s->mode == SILENCE_COPY)) {
  386. int nbs = s->stop_holdoff_end - s->stop_holdoff_offset;
  387. if (nbs) {
  388. AVFrame *frame;
  389. frame = ff_get_audio_buffer(outlink, nbs / outlink->channels);
  390. if (!frame)
  391. return AVERROR(ENOMEM);
  392. memcpy(frame->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
  393. nbs * sizeof(double));
  394. frame->pts = s->next_pts;
  395. s->next_pts += av_rescale_q(frame->nb_samples,
  396. (AVRational){1, outlink->sample_rate},
  397. outlink->time_base);
  398. ret = ff_filter_frame(outlink, frame);
  399. }
  400. s->mode = SILENCE_STOP;
  401. }
  402. return ret;
  403. }
  404. static int query_formats(AVFilterContext *ctx)
  405. {
  406. AVFilterFormats *formats = NULL;
  407. AVFilterChannelLayouts *layouts = NULL;
  408. static const enum AVSampleFormat sample_fmts[] = {
  409. AV_SAMPLE_FMT_DBL, AV_SAMPLE_FMT_NONE
  410. };
  411. int ret;
  412. layouts = ff_all_channel_counts();
  413. if (!layouts)
  414. return AVERROR(ENOMEM);
  415. ret = ff_set_common_channel_layouts(ctx, layouts);
  416. if (ret < 0)
  417. return ret;
  418. formats = ff_make_format_list(sample_fmts);
  419. if (!formats)
  420. return AVERROR(ENOMEM);
  421. ret = ff_set_common_formats(ctx, formats);
  422. if (ret < 0)
  423. return ret;
  424. formats = ff_all_samplerates();
  425. if (!formats)
  426. return AVERROR(ENOMEM);
  427. return ff_set_common_samplerates(ctx, formats);
  428. }
  429. static av_cold void uninit(AVFilterContext *ctx)
  430. {
  431. SilenceRemoveContext *s = ctx->priv;
  432. av_freep(&s->start_holdoff);
  433. av_freep(&s->stop_holdoff);
  434. av_freep(&s->window);
  435. }
  436. static const AVFilterPad silenceremove_inputs[] = {
  437. {
  438. .name = "default",
  439. .type = AVMEDIA_TYPE_AUDIO,
  440. .config_props = config_input,
  441. .filter_frame = filter_frame,
  442. },
  443. { NULL }
  444. };
  445. static const AVFilterPad silenceremove_outputs[] = {
  446. {
  447. .name = "default",
  448. .type = AVMEDIA_TYPE_AUDIO,
  449. .request_frame = request_frame,
  450. },
  451. { NULL }
  452. };
  453. AVFilter ff_af_silenceremove = {
  454. .name = "silenceremove",
  455. .description = NULL_IF_CONFIG_SMALL("Remove silence."),
  456. .priv_size = sizeof(SilenceRemoveContext),
  457. .priv_class = &silenceremove_class,
  458. .init = init,
  459. .uninit = uninit,
  460. .query_formats = query_formats,
  461. .inputs = silenceremove_inputs,
  462. .outputs = silenceremove_outputs,
  463. };