You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

654 lines
20KB

  1. /*
  2. * Audio Mix Filter
  3. * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. /**
  22. * @file
  23. * Audio Mix Filter
  24. *
  25. * Mixes audio from multiple sources into a single output. The channel layout,
  26. * sample rate, and sample format will be the same for all inputs and the
  27. * output.
  28. */
  29. #include "libavutil/attributes.h"
  30. #include "libavutil/audio_fifo.h"
  31. #include "libavutil/avassert.h"
  32. #include "libavutil/avstring.h"
  33. #include "libavutil/channel_layout.h"
  34. #include "libavutil/common.h"
  35. #include "libavutil/eval.h"
  36. #include "libavutil/float_dsp.h"
  37. #include "libavutil/mathematics.h"
  38. #include "libavutil/opt.h"
  39. #include "libavutil/samplefmt.h"
  40. #include "audio.h"
  41. #include "avfilter.h"
  42. #include "filters.h"
  43. #include "formats.h"
  44. #include "internal.h"
  45. #define INPUT_ON 1 /**< input is active */
  46. #define INPUT_EOF 2 /**< input has reached EOF (may still be active) */
  47. #define DURATION_LONGEST 0
  48. #define DURATION_SHORTEST 1
  49. #define DURATION_FIRST 2
  50. typedef struct FrameInfo {
  51. int nb_samples;
  52. int64_t pts;
  53. struct FrameInfo *next;
  54. } FrameInfo;
  55. /**
  56. * Linked list used to store timestamps and frame sizes of all frames in the
  57. * FIFO for the first input.
  58. *
  59. * This is needed to keep timestamps synchronized for the case where multiple
  60. * input frames are pushed to the filter for processing before a frame is
  61. * requested by the output link.
  62. */
  63. typedef struct FrameList {
  64. int nb_frames;
  65. int nb_samples;
  66. FrameInfo *list;
  67. FrameInfo *end;
  68. } FrameList;
  69. static void frame_list_clear(FrameList *frame_list)
  70. {
  71. if (frame_list) {
  72. while (frame_list->list) {
  73. FrameInfo *info = frame_list->list;
  74. frame_list->list = info->next;
  75. av_free(info);
  76. }
  77. frame_list->nb_frames = 0;
  78. frame_list->nb_samples = 0;
  79. frame_list->end = NULL;
  80. }
  81. }
  82. static int frame_list_next_frame_size(FrameList *frame_list)
  83. {
  84. if (!frame_list->list)
  85. return 0;
  86. return frame_list->list->nb_samples;
  87. }
  88. static int64_t frame_list_next_pts(FrameList *frame_list)
  89. {
  90. if (!frame_list->list)
  91. return AV_NOPTS_VALUE;
  92. return frame_list->list->pts;
  93. }
  94. static void frame_list_remove_samples(FrameList *frame_list, int nb_samples)
  95. {
  96. if (nb_samples >= frame_list->nb_samples) {
  97. frame_list_clear(frame_list);
  98. } else {
  99. int samples = nb_samples;
  100. while (samples > 0) {
  101. FrameInfo *info = frame_list->list;
  102. av_assert0(info);
  103. if (info->nb_samples <= samples) {
  104. samples -= info->nb_samples;
  105. frame_list->list = info->next;
  106. if (!frame_list->list)
  107. frame_list->end = NULL;
  108. frame_list->nb_frames--;
  109. frame_list->nb_samples -= info->nb_samples;
  110. av_free(info);
  111. } else {
  112. info->nb_samples -= samples;
  113. info->pts += samples;
  114. frame_list->nb_samples -= samples;
  115. samples = 0;
  116. }
  117. }
  118. }
  119. }
  120. static int frame_list_add_frame(FrameList *frame_list, int nb_samples, int64_t pts)
  121. {
  122. FrameInfo *info = av_malloc(sizeof(*info));
  123. if (!info)
  124. return AVERROR(ENOMEM);
  125. info->nb_samples = nb_samples;
  126. info->pts = pts;
  127. info->next = NULL;
  128. if (!frame_list->list) {
  129. frame_list->list = info;
  130. frame_list->end = info;
  131. } else {
  132. av_assert0(frame_list->end);
  133. frame_list->end->next = info;
  134. frame_list->end = info;
  135. }
  136. frame_list->nb_frames++;
  137. frame_list->nb_samples += nb_samples;
  138. return 0;
  139. }
  140. /* FIXME: use directly links fifo */
  141. typedef struct MixContext {
  142. const AVClass *class; /**< class for AVOptions */
  143. AVFloatDSPContext *fdsp;
  144. int nb_inputs; /**< number of inputs */
  145. int active_inputs; /**< number of input currently active */
  146. int duration_mode; /**< mode for determining duration */
  147. float dropout_transition; /**< transition time when an input drops out */
  148. char *weights_str; /**< string for custom weights for every input */
  149. int normalize; /**< if inputs are scaled */
  150. int nb_channels; /**< number of channels */
  151. int sample_rate; /**< sample rate */
  152. int planar;
  153. AVAudioFifo **fifos; /**< audio fifo for each input */
  154. uint8_t *input_state; /**< current state of each input */
  155. float *input_scale; /**< mixing scale factor for each input */
  156. float *weights; /**< custom weights for every input */
  157. float weight_sum; /**< sum of custom weights for every input */
  158. float *scale_norm; /**< normalization factor for every input */
  159. int64_t next_pts; /**< calculated pts for next output frame */
  160. FrameList *frame_list; /**< list of frame info for the first input */
  161. } MixContext;
  162. #define OFFSET(x) offsetof(MixContext, x)
  163. #define A AV_OPT_FLAG_AUDIO_PARAM
  164. #define F AV_OPT_FLAG_FILTERING_PARAM
  165. #define T AV_OPT_FLAG_RUNTIME_PARAM
  166. static const AVOption amix_options[] = {
  167. { "inputs", "Number of inputs.",
  168. OFFSET(nb_inputs), AV_OPT_TYPE_INT, { .i64 = 2 }, 1, INT16_MAX, A|F },
  169. { "duration", "How to determine the end-of-stream.",
  170. OFFSET(duration_mode), AV_OPT_TYPE_INT, { .i64 = DURATION_LONGEST }, 0, 2, A|F, "duration" },
  171. { "longest", "Duration of longest input.", 0, AV_OPT_TYPE_CONST, { .i64 = DURATION_LONGEST }, 0, 0, A|F, "duration" },
  172. { "shortest", "Duration of shortest input.", 0, AV_OPT_TYPE_CONST, { .i64 = DURATION_SHORTEST }, 0, 0, A|F, "duration" },
  173. { "first", "Duration of first input.", 0, AV_OPT_TYPE_CONST, { .i64 = DURATION_FIRST }, 0, 0, A|F, "duration" },
  174. { "dropout_transition", "Transition time, in seconds, for volume "
  175. "renormalization when an input stream ends.",
  176. OFFSET(dropout_transition), AV_OPT_TYPE_FLOAT, { .dbl = 2.0 }, 0, INT_MAX, A|F },
  177. { "weights", "Set weight for each input.",
  178. OFFSET(weights_str), AV_OPT_TYPE_STRING, {.str="1 1"}, 0, 0, A|F|T },
  179. { "normalize", "Scale inputs",
  180. OFFSET(normalize), AV_OPT_TYPE_BOOL, {.i64=1}, 0, 1, A|F|T },
  181. { NULL }
  182. };
  183. AVFILTER_DEFINE_CLASS(amix);
  184. /**
  185. * Update the scaling factors to apply to each input during mixing.
  186. *
  187. * This balances the full volume range between active inputs and handles
  188. * volume transitions when EOF is encountered on an input but mixing continues
  189. * with the remaining inputs.
  190. */
  191. static void calculate_scales(MixContext *s, int nb_samples)
  192. {
  193. float weight_sum = 0.f;
  194. int i;
  195. for (i = 0; i < s->nb_inputs; i++)
  196. if (s->input_state[i] & INPUT_ON)
  197. weight_sum += FFABS(s->weights[i]);
  198. for (i = 0; i < s->nb_inputs; i++) {
  199. if (s->input_state[i] & INPUT_ON) {
  200. if (s->scale_norm[i] > weight_sum / FFABS(s->weights[i])) {
  201. s->scale_norm[i] -= ((s->weight_sum / FFABS(s->weights[i])) / s->nb_inputs) *
  202. nb_samples / (s->dropout_transition * s->sample_rate);
  203. s->scale_norm[i] = FFMAX(s->scale_norm[i], weight_sum / FFABS(s->weights[i]));
  204. }
  205. }
  206. }
  207. for (i = 0; i < s->nb_inputs; i++) {
  208. if (s->input_state[i] & INPUT_ON) {
  209. if (!s->normalize)
  210. s->input_scale[i] = FFABS(s->weights[i]);
  211. else
  212. s->input_scale[i] = 1.0f / s->scale_norm[i] * FFSIGN(s->weights[i]);
  213. } else {
  214. s->input_scale[i] = 0.0f;
  215. }
  216. }
  217. }
  218. static int config_output(AVFilterLink *outlink)
  219. {
  220. AVFilterContext *ctx = outlink->src;
  221. MixContext *s = ctx->priv;
  222. int i;
  223. char buf[64];
  224. s->planar = av_sample_fmt_is_planar(outlink->format);
  225. s->sample_rate = outlink->sample_rate;
  226. outlink->time_base = (AVRational){ 1, outlink->sample_rate };
  227. s->next_pts = AV_NOPTS_VALUE;
  228. s->frame_list = av_mallocz(sizeof(*s->frame_list));
  229. if (!s->frame_list)
  230. return AVERROR(ENOMEM);
  231. s->fifos = av_mallocz_array(s->nb_inputs, sizeof(*s->fifos));
  232. if (!s->fifos)
  233. return AVERROR(ENOMEM);
  234. s->nb_channels = outlink->channels;
  235. for (i = 0; i < s->nb_inputs; i++) {
  236. s->fifos[i] = av_audio_fifo_alloc(outlink->format, s->nb_channels, 1024);
  237. if (!s->fifos[i])
  238. return AVERROR(ENOMEM);
  239. }
  240. s->input_state = av_malloc(s->nb_inputs);
  241. if (!s->input_state)
  242. return AVERROR(ENOMEM);
  243. memset(s->input_state, INPUT_ON, s->nb_inputs);
  244. s->active_inputs = s->nb_inputs;
  245. s->input_scale = av_mallocz_array(s->nb_inputs, sizeof(*s->input_scale));
  246. s->scale_norm = av_mallocz_array(s->nb_inputs, sizeof(*s->scale_norm));
  247. if (!s->input_scale || !s->scale_norm)
  248. return AVERROR(ENOMEM);
  249. for (i = 0; i < s->nb_inputs; i++)
  250. s->scale_norm[i] = s->weight_sum / FFABS(s->weights[i]);
  251. calculate_scales(s, 0);
  252. av_get_channel_layout_string(buf, sizeof(buf), -1, outlink->channel_layout);
  253. av_log(ctx, AV_LOG_VERBOSE,
  254. "inputs:%d fmt:%s srate:%d cl:%s\n", s->nb_inputs,
  255. av_get_sample_fmt_name(outlink->format), outlink->sample_rate, buf);
  256. return 0;
  257. }
  258. /**
  259. * Read samples from the input FIFOs, mix, and write to the output link.
  260. */
  261. static int output_frame(AVFilterLink *outlink)
  262. {
  263. AVFilterContext *ctx = outlink->src;
  264. MixContext *s = ctx->priv;
  265. AVFrame *out_buf, *in_buf;
  266. int nb_samples, ns, i;
  267. if (s->input_state[0] & INPUT_ON) {
  268. /* first input live: use the corresponding frame size */
  269. nb_samples = frame_list_next_frame_size(s->frame_list);
  270. for (i = 1; i < s->nb_inputs; i++) {
  271. if (s->input_state[i] & INPUT_ON) {
  272. ns = av_audio_fifo_size(s->fifos[i]);
  273. if (ns < nb_samples) {
  274. if (!(s->input_state[i] & INPUT_EOF))
  275. /* unclosed input with not enough samples */
  276. return 0;
  277. /* closed input to drain */
  278. nb_samples = ns;
  279. }
  280. }
  281. }
  282. s->next_pts = frame_list_next_pts(s->frame_list);
  283. } else {
  284. /* first input closed: use the available samples */
  285. nb_samples = INT_MAX;
  286. for (i = 1; i < s->nb_inputs; i++) {
  287. if (s->input_state[i] & INPUT_ON) {
  288. ns = av_audio_fifo_size(s->fifos[i]);
  289. nb_samples = FFMIN(nb_samples, ns);
  290. }
  291. }
  292. if (nb_samples == INT_MAX) {
  293. ff_outlink_set_status(outlink, AVERROR_EOF, s->next_pts);
  294. return 0;
  295. }
  296. }
  297. frame_list_remove_samples(s->frame_list, nb_samples);
  298. calculate_scales(s, nb_samples);
  299. if (nb_samples == 0)
  300. return 0;
  301. out_buf = ff_get_audio_buffer(outlink, nb_samples);
  302. if (!out_buf)
  303. return AVERROR(ENOMEM);
  304. in_buf = ff_get_audio_buffer(outlink, nb_samples);
  305. if (!in_buf) {
  306. av_frame_free(&out_buf);
  307. return AVERROR(ENOMEM);
  308. }
  309. for (i = 0; i < s->nb_inputs; i++) {
  310. if (s->input_state[i] & INPUT_ON) {
  311. int planes, plane_size, p;
  312. av_audio_fifo_read(s->fifos[i], (void **)in_buf->extended_data,
  313. nb_samples);
  314. planes = s->planar ? s->nb_channels : 1;
  315. plane_size = nb_samples * (s->planar ? 1 : s->nb_channels);
  316. plane_size = FFALIGN(plane_size, 16);
  317. if (out_buf->format == AV_SAMPLE_FMT_FLT ||
  318. out_buf->format == AV_SAMPLE_FMT_FLTP) {
  319. for (p = 0; p < planes; p++) {
  320. s->fdsp->vector_fmac_scalar((float *)out_buf->extended_data[p],
  321. (float *) in_buf->extended_data[p],
  322. s->input_scale[i], plane_size);
  323. }
  324. } else {
  325. for (p = 0; p < planes; p++) {
  326. s->fdsp->vector_dmac_scalar((double *)out_buf->extended_data[p],
  327. (double *) in_buf->extended_data[p],
  328. s->input_scale[i], plane_size);
  329. }
  330. }
  331. }
  332. }
  333. av_frame_free(&in_buf);
  334. out_buf->pts = s->next_pts;
  335. if (s->next_pts != AV_NOPTS_VALUE)
  336. s->next_pts += nb_samples;
  337. return ff_filter_frame(outlink, out_buf);
  338. }
  339. /**
  340. * Requests a frame, if needed, from each input link other than the first.
  341. */
  342. static int request_samples(AVFilterContext *ctx, int min_samples)
  343. {
  344. MixContext *s = ctx->priv;
  345. int i;
  346. av_assert0(s->nb_inputs > 1);
  347. for (i = 1; i < s->nb_inputs; i++) {
  348. if (!(s->input_state[i] & INPUT_ON) ||
  349. (s->input_state[i] & INPUT_EOF))
  350. continue;
  351. if (av_audio_fifo_size(s->fifos[i]) >= min_samples)
  352. continue;
  353. ff_inlink_request_frame(ctx->inputs[i]);
  354. }
  355. return output_frame(ctx->outputs[0]);
  356. }
  357. /**
  358. * Calculates the number of active inputs and determines EOF based on the
  359. * duration option.
  360. *
  361. * @return 0 if mixing should continue, or AVERROR_EOF if mixing should stop.
  362. */
  363. static int calc_active_inputs(MixContext *s)
  364. {
  365. int i;
  366. int active_inputs = 0;
  367. for (i = 0; i < s->nb_inputs; i++)
  368. active_inputs += !!(s->input_state[i] & INPUT_ON);
  369. s->active_inputs = active_inputs;
  370. if (!active_inputs ||
  371. (s->duration_mode == DURATION_FIRST && !(s->input_state[0] & INPUT_ON)) ||
  372. (s->duration_mode == DURATION_SHORTEST && active_inputs != s->nb_inputs))
  373. return AVERROR_EOF;
  374. return 0;
  375. }
  376. static int activate(AVFilterContext *ctx)
  377. {
  378. AVFilterLink *outlink = ctx->outputs[0];
  379. MixContext *s = ctx->priv;
  380. AVFrame *buf = NULL;
  381. int i, ret;
  382. FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, ctx);
  383. for (i = 0; i < s->nb_inputs; i++) {
  384. AVFilterLink *inlink = ctx->inputs[i];
  385. if ((ret = ff_inlink_consume_frame(ctx->inputs[i], &buf)) > 0) {
  386. if (i == 0) {
  387. int64_t pts = av_rescale_q(buf->pts, inlink->time_base,
  388. outlink->time_base);
  389. ret = frame_list_add_frame(s->frame_list, buf->nb_samples, pts);
  390. if (ret < 0) {
  391. av_frame_free(&buf);
  392. return ret;
  393. }
  394. }
  395. ret = av_audio_fifo_write(s->fifos[i], (void **)buf->extended_data,
  396. buf->nb_samples);
  397. if (ret < 0) {
  398. av_frame_free(&buf);
  399. return ret;
  400. }
  401. av_frame_free(&buf);
  402. ret = output_frame(outlink);
  403. if (ret < 0)
  404. return ret;
  405. }
  406. }
  407. for (i = 0; i < s->nb_inputs; i++) {
  408. int64_t pts;
  409. int status;
  410. if (ff_inlink_acknowledge_status(ctx->inputs[i], &status, &pts)) {
  411. if (status == AVERROR_EOF) {
  412. if (i == 0) {
  413. s->input_state[i] = 0;
  414. if (s->nb_inputs == 1) {
  415. ff_outlink_set_status(outlink, status, pts);
  416. return 0;
  417. }
  418. } else {
  419. s->input_state[i] |= INPUT_EOF;
  420. if (av_audio_fifo_size(s->fifos[i]) == 0) {
  421. s->input_state[i] = 0;
  422. }
  423. }
  424. }
  425. }
  426. }
  427. if (calc_active_inputs(s)) {
  428. ff_outlink_set_status(outlink, AVERROR_EOF, s->next_pts);
  429. return 0;
  430. }
  431. if (ff_outlink_frame_wanted(outlink)) {
  432. int wanted_samples;
  433. if (!(s->input_state[0] & INPUT_ON))
  434. return request_samples(ctx, 1);
  435. if (s->frame_list->nb_frames == 0) {
  436. ff_inlink_request_frame(ctx->inputs[0]);
  437. return 0;
  438. }
  439. av_assert0(s->frame_list->nb_frames > 0);
  440. wanted_samples = frame_list_next_frame_size(s->frame_list);
  441. return request_samples(ctx, wanted_samples);
  442. }
  443. return 0;
  444. }
  445. static void parse_weights(AVFilterContext *ctx)
  446. {
  447. MixContext *s = ctx->priv;
  448. float last_weight = 1.f;
  449. char *p;
  450. int i;
  451. s->weight_sum = 0.f;
  452. p = s->weights_str;
  453. for (i = 0; i < s->nb_inputs; i++) {
  454. last_weight = av_strtod(p, &p);
  455. s->weights[i] = last_weight;
  456. s->weight_sum += FFABS(last_weight);
  457. if (p && *p) {
  458. p++;
  459. } else {
  460. i++;
  461. break;
  462. }
  463. }
  464. for (; i < s->nb_inputs; i++) {
  465. s->weights[i] = last_weight;
  466. s->weight_sum += FFABS(last_weight);
  467. }
  468. }
  469. static av_cold int init(AVFilterContext *ctx)
  470. {
  471. MixContext *s = ctx->priv;
  472. int i, ret;
  473. for (i = 0; i < s->nb_inputs; i++) {
  474. AVFilterPad pad = { 0 };
  475. pad.type = AVMEDIA_TYPE_AUDIO;
  476. pad.name = av_asprintf("input%d", i);
  477. if (!pad.name)
  478. return AVERROR(ENOMEM);
  479. if ((ret = ff_insert_inpad(ctx, i, &pad)) < 0) {
  480. av_freep(&pad.name);
  481. return ret;
  482. }
  483. }
  484. s->fdsp = avpriv_float_dsp_alloc(0);
  485. if (!s->fdsp)
  486. return AVERROR(ENOMEM);
  487. s->weights = av_mallocz_array(s->nb_inputs, sizeof(*s->weights));
  488. if (!s->weights)
  489. return AVERROR(ENOMEM);
  490. parse_weights(ctx);
  491. return 0;
  492. }
  493. static av_cold void uninit(AVFilterContext *ctx)
  494. {
  495. int i;
  496. MixContext *s = ctx->priv;
  497. if (s->fifos) {
  498. for (i = 0; i < s->nb_inputs; i++)
  499. av_audio_fifo_free(s->fifos[i]);
  500. av_freep(&s->fifos);
  501. }
  502. frame_list_clear(s->frame_list);
  503. av_freep(&s->frame_list);
  504. av_freep(&s->input_state);
  505. av_freep(&s->input_scale);
  506. av_freep(&s->scale_norm);
  507. av_freep(&s->weights);
  508. av_freep(&s->fdsp);
  509. for (i = 0; i < ctx->nb_inputs; i++)
  510. av_freep(&ctx->input_pads[i].name);
  511. }
  512. static int query_formats(AVFilterContext *ctx)
  513. {
  514. static const enum AVSampleFormat sample_fmts[] = {
  515. AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
  516. AV_SAMPLE_FMT_DBL, AV_SAMPLE_FMT_DBLP,
  517. AV_SAMPLE_FMT_NONE
  518. };
  519. int ret;
  520. if ((ret = ff_set_common_formats(ctx, ff_make_format_list(sample_fmts))) < 0 ||
  521. (ret = ff_set_common_samplerates(ctx, ff_all_samplerates())) < 0)
  522. return ret;
  523. return ff_set_common_channel_layouts(ctx, ff_all_channel_counts());
  524. }
  525. static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
  526. char *res, int res_len, int flags)
  527. {
  528. MixContext *s = ctx->priv;
  529. int ret;
  530. ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags);
  531. if (ret < 0)
  532. return ret;
  533. parse_weights(ctx);
  534. for (int i = 0; i < s->nb_inputs; i++)
  535. s->scale_norm[i] = s->weight_sum / FFABS(s->weights[i]);
  536. calculate_scales(s, 0);
  537. return 0;
  538. }
  539. static const AVFilterPad avfilter_af_amix_outputs[] = {
  540. {
  541. .name = "default",
  542. .type = AVMEDIA_TYPE_AUDIO,
  543. .config_props = config_output,
  544. },
  545. { NULL }
  546. };
  547. AVFilter ff_af_amix = {
  548. .name = "amix",
  549. .description = NULL_IF_CONFIG_SMALL("Audio mixing."),
  550. .priv_size = sizeof(MixContext),
  551. .priv_class = &amix_class,
  552. .init = init,
  553. .uninit = uninit,
  554. .activate = activate,
  555. .query_formats = query_formats,
  556. .inputs = NULL,
  557. .outputs = avfilter_af_amix_outputs,
  558. .process_command = process_command,
  559. .flags = AVFILTER_FLAG_DYNAMIC_INPUTS,
  560. };