You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

583 lines
18KB

  1. /*
  2. * Audio Mix Filter
  3. * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. /**
  22. * @file
  23. * Audio Mix Filter
  24. *
  25. * Mixes audio from multiple sources into a single output. The channel layout,
  26. * sample rate, and sample format will be the same for all inputs and the
  27. * output.
  28. */
  29. #include "libavutil/attributes.h"
  30. #include "libavutil/audio_fifo.h"
  31. #include "libavutil/avassert.h"
  32. #include "libavutil/avstring.h"
  33. #include "libavutil/channel_layout.h"
  34. #include "libavutil/common.h"
  35. #include "libavutil/float_dsp.h"
  36. #include "libavutil/mathematics.h"
  37. #include "libavutil/opt.h"
  38. #include "libavutil/samplefmt.h"
  39. #include "audio.h"
  40. #include "avfilter.h"
  41. #include "filters.h"
  42. #include "formats.h"
  43. #include "internal.h"
  /* Per-input state flags; they are OR-ed together in MixContext.input_state[]. */
  #define INPUT_ON          1   /**< input is active */
  #define INPUT_EOF         2   /**< input has reached EOF (may still be active) */

  /* Values accepted by the "duration" option. */
  #define DURATION_LONGEST  0
  #define DURATION_SHORTEST 1
  #define DURATION_FIRST    2
  /**
   * One node of the frame list: size and timestamp of a single queued frame.
   */
  typedef struct FrameInfo {
      int nb_samples;          /**< samples of this frame that are still queued */
      int64_t pts;             /**< timestamp of the first sample still queued */
      struct FrameInfo *next;  /**< following node, NULL for the last one */
  } FrameInfo;

  /**
   * Singly linked list holding the timestamp and size of every frame of the
   * first input that currently sits in its FIFO.
   *
   * Several input frames can be pushed into the filter before the output
   * link asks for one; this bookkeeping keeps the output timestamps in sync
   * with the first input in that situation.
   */
  typedef struct FrameList {
      int nb_frames;    /**< number of nodes in the list */
      int nb_samples;   /**< sum of nb_samples over all nodes */
      FrameInfo *list;  /**< head of the list (oldest frame) */
      FrameInfo *end;   /**< tail of the list (newest frame) */
  } FrameList;
  68. static void frame_list_clear(FrameList *frame_list)
  69. {
  70. if (frame_list) {
  71. while (frame_list->list) {
  72. FrameInfo *info = frame_list->list;
  73. frame_list->list = info->next;
  74. av_free(info);
  75. }
  76. frame_list->nb_frames = 0;
  77. frame_list->nb_samples = 0;
  78. frame_list->end = NULL;
  79. }
  80. }
  81. static int frame_list_next_frame_size(FrameList *frame_list)
  82. {
  83. if (!frame_list->list)
  84. return 0;
  85. return frame_list->list->nb_samples;
  86. }
  87. static int64_t frame_list_next_pts(FrameList *frame_list)
  88. {
  89. if (!frame_list->list)
  90. return AV_NOPTS_VALUE;
  91. return frame_list->list->pts;
  92. }
  93. static void frame_list_remove_samples(FrameList *frame_list, int nb_samples)
  94. {
  95. if (nb_samples >= frame_list->nb_samples) {
  96. frame_list_clear(frame_list);
  97. } else {
  98. int samples = nb_samples;
  99. while (samples > 0) {
  100. FrameInfo *info = frame_list->list;
  101. av_assert0(info);
  102. if (info->nb_samples <= samples) {
  103. samples -= info->nb_samples;
  104. frame_list->list = info->next;
  105. if (!frame_list->list)
  106. frame_list->end = NULL;
  107. frame_list->nb_frames--;
  108. frame_list->nb_samples -= info->nb_samples;
  109. av_free(info);
  110. } else {
  111. info->nb_samples -= samples;
  112. info->pts += samples;
  113. frame_list->nb_samples -= samples;
  114. samples = 0;
  115. }
  116. }
  117. }
  118. }
  119. static int frame_list_add_frame(FrameList *frame_list, int nb_samples, int64_t pts)
  120. {
  121. FrameInfo *info = av_malloc(sizeof(*info));
  122. if (!info)
  123. return AVERROR(ENOMEM);
  124. info->nb_samples = nb_samples;
  125. info->pts = pts;
  126. info->next = NULL;
  127. if (!frame_list->list) {
  128. frame_list->list = info;
  129. frame_list->end = info;
  130. } else {
  131. av_assert0(frame_list->end);
  132. frame_list->end->next = info;
  133. frame_list->end = info;
  134. }
  135. frame_list->nb_frames++;
  136. frame_list->nb_samples += nb_samples;
  137. return 0;
  138. }
  139. /* FIXME: use directly links fifo */
  140. typedef struct MixContext {
  141. const AVClass *class; /**< class for AVOptions */
  142. AVFloatDSPContext *fdsp;
  143. int nb_inputs; /**< number of inputs */
  144. int active_inputs; /**< number of input currently active */
  145. int duration_mode; /**< mode for determining duration */
  146. float dropout_transition; /**< transition time when an input drops out */
  147. int nb_channels; /**< number of channels */
  148. int sample_rate; /**< sample rate */
  149. int planar;
  150. AVAudioFifo **fifos; /**< audio fifo for each input */
  151. uint8_t *input_state; /**< current state of each input */
  152. float *input_scale; /**< mixing scale factor for each input */
  153. float scale_norm; /**< normalization factor for all inputs */
  154. int64_t next_pts; /**< calculated pts for next output frame */
  155. FrameList *frame_list; /**< list of frame info for the first input */
  156. } MixContext;
  157. #define OFFSET(x) offsetof(MixContext, x)
  158. #define A AV_OPT_FLAG_AUDIO_PARAM
  159. #define F AV_OPT_FLAG_FILTERING_PARAM
  160. static const AVOption amix_options[] = {
  161. { "inputs", "Number of inputs.",
  162. OFFSET(nb_inputs), AV_OPT_TYPE_INT, { .i64 = 2 }, 1, 1024, A|F },
  163. { "duration", "How to determine the end-of-stream.",
  164. OFFSET(duration_mode), AV_OPT_TYPE_INT, { .i64 = DURATION_LONGEST }, 0, 2, A|F, "duration" },
  165. { "longest", "Duration of longest input.", 0, AV_OPT_TYPE_CONST, { .i64 = DURATION_LONGEST }, 0, 0, A|F, "duration" },
  166. { "shortest", "Duration of shortest input.", 0, AV_OPT_TYPE_CONST, { .i64 = DURATION_SHORTEST }, 0, 0, A|F, "duration" },
  167. { "first", "Duration of first input.", 0, AV_OPT_TYPE_CONST, { .i64 = DURATION_FIRST }, 0, 0, A|F, "duration" },
  168. { "dropout_transition", "Transition time, in seconds, for volume "
  169. "renormalization when an input stream ends.",
  170. OFFSET(dropout_transition), AV_OPT_TYPE_FLOAT, { .dbl = 2.0 }, 0, INT_MAX, A|F },
  171. { NULL }
  172. };
  173. AVFILTER_DEFINE_CLASS(amix);
  174. /**
  175. * Update the scaling factors to apply to each input during mixing.
  176. *
  177. * This balances the full volume range between active inputs and handles
  178. * volume transitions when EOF is encountered on an input but mixing continues
  179. * with the remaining inputs.
  180. */
  181. static void calculate_scales(MixContext *s, int nb_samples)
  182. {
  183. int i;
  184. if (s->scale_norm > s->active_inputs) {
  185. s->scale_norm -= nb_samples / (s->dropout_transition * s->sample_rate);
  186. s->scale_norm = FFMAX(s->scale_norm, s->active_inputs);
  187. }
  188. for (i = 0; i < s->nb_inputs; i++) {
  189. if (s->input_state[i] & INPUT_ON)
  190. s->input_scale[i] = 1.0f / s->scale_norm;
  191. else
  192. s->input_scale[i] = 0.0f;
  193. }
  194. }
  195. static int config_output(AVFilterLink *outlink)
  196. {
  197. AVFilterContext *ctx = outlink->src;
  198. MixContext *s = ctx->priv;
  199. int i;
  200. char buf[64];
  201. s->planar = av_sample_fmt_is_planar(outlink->format);
  202. s->sample_rate = outlink->sample_rate;
  203. outlink->time_base = (AVRational){ 1, outlink->sample_rate };
  204. s->next_pts = AV_NOPTS_VALUE;
  205. s->frame_list = av_mallocz(sizeof(*s->frame_list));
  206. if (!s->frame_list)
  207. return AVERROR(ENOMEM);
  208. s->fifos = av_mallocz_array(s->nb_inputs, sizeof(*s->fifos));
  209. if (!s->fifos)
  210. return AVERROR(ENOMEM);
  211. s->nb_channels = outlink->channels;
  212. for (i = 0; i < s->nb_inputs; i++) {
  213. s->fifos[i] = av_audio_fifo_alloc(outlink->format, s->nb_channels, 1024);
  214. if (!s->fifos[i])
  215. return AVERROR(ENOMEM);
  216. }
  217. s->input_state = av_malloc(s->nb_inputs);
  218. if (!s->input_state)
  219. return AVERROR(ENOMEM);
  220. memset(s->input_state, INPUT_ON, s->nb_inputs);
  221. s->active_inputs = s->nb_inputs;
  222. s->input_scale = av_mallocz_array(s->nb_inputs, sizeof(*s->input_scale));
  223. if (!s->input_scale)
  224. return AVERROR(ENOMEM);
  225. s->scale_norm = s->active_inputs;
  226. calculate_scales(s, 0);
  227. av_get_channel_layout_string(buf, sizeof(buf), -1, outlink->channel_layout);
  228. av_log(ctx, AV_LOG_VERBOSE,
  229. "inputs:%d fmt:%s srate:%d cl:%s\n", s->nb_inputs,
  230. av_get_sample_fmt_name(outlink->format), outlink->sample_rate, buf);
  231. return 0;
  232. }
  233. /**
  234. * Read samples from the input FIFOs, mix, and write to the output link.
  235. */
  236. static int output_frame(AVFilterLink *outlink)
  237. {
  238. AVFilterContext *ctx = outlink->src;
  239. MixContext *s = ctx->priv;
  240. AVFrame *out_buf, *in_buf;
  241. int nb_samples, ns, i;
  242. if (s->input_state[0] & INPUT_ON) {
  243. /* first input live: use the corresponding frame size */
  244. nb_samples = frame_list_next_frame_size(s->frame_list);
  245. for (i = 1; i < s->nb_inputs; i++) {
  246. if (s->input_state[i] & INPUT_ON) {
  247. ns = av_audio_fifo_size(s->fifos[i]);
  248. if (ns < nb_samples) {
  249. if (!(s->input_state[i] & INPUT_EOF))
  250. /* unclosed input with not enough samples */
  251. return 0;
  252. /* closed input to drain */
  253. nb_samples = ns;
  254. }
  255. }
  256. }
  257. } else {
  258. /* first input closed: use the available samples */
  259. nb_samples = INT_MAX;
  260. for (i = 1; i < s->nb_inputs; i++) {
  261. if (s->input_state[i] & INPUT_ON) {
  262. ns = av_audio_fifo_size(s->fifos[i]);
  263. nb_samples = FFMIN(nb_samples, ns);
  264. }
  265. }
  266. if (nb_samples == INT_MAX) {
  267. ff_outlink_set_status(outlink, AVERROR_EOF, s->next_pts);
  268. return 0;
  269. }
  270. }
  271. s->next_pts = frame_list_next_pts(s->frame_list);
  272. frame_list_remove_samples(s->frame_list, nb_samples);
  273. calculate_scales(s, nb_samples);
  274. if (nb_samples == 0)
  275. return 0;
  276. out_buf = ff_get_audio_buffer(outlink, nb_samples);
  277. if (!out_buf)
  278. return AVERROR(ENOMEM);
  279. in_buf = ff_get_audio_buffer(outlink, nb_samples);
  280. if (!in_buf) {
  281. av_frame_free(&out_buf);
  282. return AVERROR(ENOMEM);
  283. }
  284. for (i = 0; i < s->nb_inputs; i++) {
  285. if (s->input_state[i] & INPUT_ON) {
  286. int planes, plane_size, p;
  287. av_audio_fifo_read(s->fifos[i], (void **)in_buf->extended_data,
  288. nb_samples);
  289. planes = s->planar ? s->nb_channels : 1;
  290. plane_size = nb_samples * (s->planar ? 1 : s->nb_channels);
  291. plane_size = FFALIGN(plane_size, 16);
  292. if (out_buf->format == AV_SAMPLE_FMT_FLT ||
  293. out_buf->format == AV_SAMPLE_FMT_FLTP) {
  294. for (p = 0; p < planes; p++) {
  295. s->fdsp->vector_fmac_scalar((float *)out_buf->extended_data[p],
  296. (float *) in_buf->extended_data[p],
  297. s->input_scale[i], plane_size);
  298. }
  299. } else {
  300. for (p = 0; p < planes; p++) {
  301. s->fdsp->vector_dmac_scalar((double *)out_buf->extended_data[p],
  302. (double *) in_buf->extended_data[p],
  303. s->input_scale[i], plane_size);
  304. }
  305. }
  306. }
  307. }
  308. av_frame_free(&in_buf);
  309. out_buf->pts = s->next_pts;
  310. if (s->next_pts != AV_NOPTS_VALUE)
  311. s->next_pts += nb_samples;
  312. return ff_filter_frame(outlink, out_buf);
  313. }
  314. /**
  315. * Requests a frame, if needed, from each input link other than the first.
  316. */
  317. static int request_samples(AVFilterContext *ctx, int min_samples)
  318. {
  319. MixContext *s = ctx->priv;
  320. int i;
  321. av_assert0(s->nb_inputs > 1);
  322. for (i = 1; i < s->nb_inputs; i++) {
  323. if (!(s->input_state[i] & INPUT_ON) ||
  324. (s->input_state[i] & INPUT_EOF))
  325. continue;
  326. if (av_audio_fifo_size(s->fifos[i]) >= min_samples)
  327. continue;
  328. ff_inlink_request_frame(ctx->inputs[i]);
  329. }
  330. return output_frame(ctx->outputs[0]);
  331. }
  332. /**
  333. * Calculates the number of active inputs and determines EOF based on the
  334. * duration option.
  335. *
  336. * @return 0 if mixing should continue, or AVERROR_EOF if mixing should stop.
  337. */
  338. static int calc_active_inputs(MixContext *s)
  339. {
  340. int i;
  341. int active_inputs = 0;
  342. for (i = 0; i < s->nb_inputs; i++)
  343. active_inputs += !!(s->input_state[i] & INPUT_ON);
  344. s->active_inputs = active_inputs;
  345. if (!active_inputs ||
  346. (s->duration_mode == DURATION_FIRST && !(s->input_state[0] & INPUT_ON)) ||
  347. (s->duration_mode == DURATION_SHORTEST && active_inputs != s->nb_inputs))
  348. return AVERROR_EOF;
  349. return 0;
  350. }
  351. static int activate(AVFilterContext *ctx)
  352. {
  353. AVFilterLink *outlink = ctx->outputs[0];
  354. MixContext *s = ctx->priv;
  355. AVFrame *buf = NULL;
  356. int i, ret;
  357. for (i = 0; i < s->nb_inputs; i++) {
  358. AVFilterLink *inlink = ctx->inputs[i];
  359. if ((ret = ff_inlink_consume_frame(ctx->inputs[i], &buf)) > 0) {
  360. if (i == 0) {
  361. int64_t pts = av_rescale_q(buf->pts, inlink->time_base,
  362. outlink->time_base);
  363. ret = frame_list_add_frame(s->frame_list, buf->nb_samples, pts);
  364. if (ret < 0) {
  365. av_frame_free(&buf);
  366. return ret;
  367. }
  368. }
  369. ret = av_audio_fifo_write(s->fifos[i], (void **)buf->extended_data,
  370. buf->nb_samples);
  371. if (ret < 0) {
  372. av_frame_free(&buf);
  373. return ret;
  374. }
  375. av_frame_free(&buf);
  376. ret = output_frame(outlink);
  377. if (ret < 0)
  378. return ret;
  379. }
  380. }
  381. for (i = 0; i < s->nb_inputs; i++) {
  382. int64_t pts;
  383. int status;
  384. if (ff_inlink_acknowledge_status(ctx->inputs[i], &status, &pts)) {
  385. if (status == AVERROR_EOF) {
  386. if (i == 0) {
  387. s->input_state[i] = 0;
  388. if (s->nb_inputs == 1) {
  389. ff_outlink_set_status(outlink, status, pts);
  390. return 0;
  391. }
  392. } else {
  393. s->input_state[i] |= INPUT_EOF;
  394. if (av_audio_fifo_size(s->fifos[i]) == 0) {
  395. s->input_state[i] = 0;
  396. }
  397. }
  398. }
  399. }
  400. }
  401. if (calc_active_inputs(s)) {
  402. ff_outlink_set_status(outlink, AVERROR_EOF, s->next_pts);
  403. return 0;
  404. }
  405. if (ff_outlink_frame_wanted(outlink)) {
  406. int wanted_samples;
  407. if (!(s->input_state[0] & INPUT_ON))
  408. return request_samples(ctx, 1);
  409. if (s->frame_list->nb_frames == 0) {
  410. ff_inlink_request_frame(ctx->inputs[0]);
  411. return 0;
  412. }
  413. av_assert0(s->frame_list->nb_frames > 0);
  414. wanted_samples = frame_list_next_frame_size(s->frame_list);
  415. return request_samples(ctx, wanted_samples);
  416. }
  417. return 0;
  418. }
  419. static av_cold int init(AVFilterContext *ctx)
  420. {
  421. MixContext *s = ctx->priv;
  422. int i, ret;
  423. for (i = 0; i < s->nb_inputs; i++) {
  424. AVFilterPad pad = { 0 };
  425. pad.type = AVMEDIA_TYPE_AUDIO;
  426. pad.name = av_asprintf("input%d", i);
  427. if (!pad.name)
  428. return AVERROR(ENOMEM);
  429. if ((ret = ff_insert_inpad(ctx, i, &pad)) < 0) {
  430. av_freep(&pad.name);
  431. return ret;
  432. }
  433. }
  434. s->fdsp = avpriv_float_dsp_alloc(0);
  435. if (!s->fdsp)
  436. return AVERROR(ENOMEM);
  437. return 0;
  438. }
  439. static av_cold void uninit(AVFilterContext *ctx)
  440. {
  441. int i;
  442. MixContext *s = ctx->priv;
  443. if (s->fifos) {
  444. for (i = 0; i < s->nb_inputs; i++)
  445. av_audio_fifo_free(s->fifos[i]);
  446. av_freep(&s->fifos);
  447. }
  448. frame_list_clear(s->frame_list);
  449. av_freep(&s->frame_list);
  450. av_freep(&s->input_state);
  451. av_freep(&s->input_scale);
  452. av_freep(&s->fdsp);
  453. for (i = 0; i < ctx->nb_inputs; i++)
  454. av_freep(&ctx->input_pads[i].name);
  455. }
  456. static int query_formats(AVFilterContext *ctx)
  457. {
  458. AVFilterFormats *formats = NULL;
  459. AVFilterChannelLayouts *layouts;
  460. int ret;
  461. layouts = ff_all_channel_counts();
  462. if (!layouts) {
  463. ret = AVERROR(ENOMEM);
  464. goto fail;
  465. }
  466. if ((ret = ff_add_format(&formats, AV_SAMPLE_FMT_FLT )) < 0 ||
  467. (ret = ff_add_format(&formats, AV_SAMPLE_FMT_FLTP)) < 0 ||
  468. (ret = ff_add_format(&formats, AV_SAMPLE_FMT_DBL )) < 0 ||
  469. (ret = ff_add_format(&formats, AV_SAMPLE_FMT_DBLP)) < 0 ||
  470. (ret = ff_set_common_formats (ctx, formats)) < 0 ||
  471. (ret = ff_set_common_channel_layouts(ctx, layouts)) < 0 ||
  472. (ret = ff_set_common_samplerates(ctx, ff_all_samplerates())) < 0)
  473. goto fail;
  474. return 0;
  475. fail:
  476. if (layouts)
  477. av_freep(&layouts->channel_layouts);
  478. av_freep(&layouts);
  479. return ret;
  480. }
  481. static const AVFilterPad avfilter_af_amix_outputs[] = {
  482. {
  483. .name = "default",
  484. .type = AVMEDIA_TYPE_AUDIO,
  485. .config_props = config_output,
  486. },
  487. { NULL }
  488. };
  489. AVFilter ff_af_amix = {
  490. .name = "amix",
  491. .description = NULL_IF_CONFIG_SMALL("Audio mixing."),
  492. .priv_size = sizeof(MixContext),
  493. .priv_class = &amix_class,
  494. .init = init,
  495. .uninit = uninit,
  496. .activate = activate,
  497. .query_formats = query_formats,
  498. .inputs = NULL,
  499. .outputs = avfilter_af_amix_outputs,
  500. .flags = AVFILTER_FLAG_DYNAMIC_INPUTS,
  501. };