You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

551 lines
16KB

  1. /*
  2. * Audio Mix Filter
  3. * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
  4. *
  5. * This file is part of Libav.
  6. *
  7. * Libav is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * Libav is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with Libav; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. /**
  22. * @file
  23. * Audio Mix Filter
  24. *
  25. * Mixes audio from multiple sources into a single output. The channel layout,
  26. * sample rate, and sample format will be the same for all inputs and the
  27. * output.
  28. */
  29. #include "libavutil/audioconvert.h"
  30. #include "libavutil/audio_fifo.h"
  31. #include "libavutil/avassert.h"
  32. #include "libavutil/avstring.h"
  33. #include "libavutil/mathematics.h"
  34. #include "libavutil/opt.h"
  35. #include "libavutil/samplefmt.h"
  36. #include "audio.h"
  37. #include "avfilter.h"
  38. #include "formats.h"
  39. #include "internal.h"
  /* Per-input state values, stored in MixContext.input_state[]. */
  #define INPUT_OFF      0    /**< input has reached EOF */
  #define INPUT_ON       1    /**< input is active */
  #define INPUT_INACTIVE 2    /**< input is on, but is currently inactive */

  /* Values accepted by the "duration" option (MixContext.duration_mode). */
  #define DURATION_LONGEST  0
  #define DURATION_SHORTEST 1
  #define DURATION_FIRST    2
  /**
   * Size and timestamp of one queued frame; a node of a singly linked list.
   */
  typedef struct FrameInfo {
      int               nb_samples; /**< number of samples described by this node */
      int64_t           pts;        /**< timestamp associated with those samples */
      struct FrameInfo *next;       /**< following node, NULL for the last one */
  } FrameInfo;
  51. /**
  52. * Linked list used to store timestamps and frame sizes of all frames in the
  53. * FIFO for the first input.
  54. *
  55. * This is needed to keep timestamps synchronized for the case where multiple
  56. * input frames are pushed to the filter for processing before a frame is
  57. * requested by the output link.
  58. */
  59. typedef struct FrameList {
  60. int nb_frames;
  61. int nb_samples;
  62. FrameInfo *list;
  63. FrameInfo *end;
  64. } FrameList;
  65. static void frame_list_clear(FrameList *frame_list)
  66. {
  67. if (frame_list) {
  68. while (frame_list->list) {
  69. FrameInfo *info = frame_list->list;
  70. frame_list->list = info->next;
  71. av_free(info);
  72. }
  73. frame_list->nb_frames = 0;
  74. frame_list->nb_samples = 0;
  75. frame_list->end = NULL;
  76. }
  77. }
  78. static int frame_list_next_frame_size(FrameList *frame_list)
  79. {
  80. if (!frame_list->list)
  81. return 0;
  82. return frame_list->list->nb_samples;
  83. }
  84. static int64_t frame_list_next_pts(FrameList *frame_list)
  85. {
  86. if (!frame_list->list)
  87. return AV_NOPTS_VALUE;
  88. return frame_list->list->pts;
  89. }
  90. static void frame_list_remove_samples(FrameList *frame_list, int nb_samples)
  91. {
  92. if (nb_samples >= frame_list->nb_samples) {
  93. frame_list_clear(frame_list);
  94. } else {
  95. int samples = nb_samples;
  96. while (samples > 0) {
  97. FrameInfo *info = frame_list->list;
  98. av_assert0(info != NULL);
  99. if (info->nb_samples <= samples) {
  100. samples -= info->nb_samples;
  101. frame_list->list = info->next;
  102. if (!frame_list->list)
  103. frame_list->end = NULL;
  104. frame_list->nb_frames--;
  105. frame_list->nb_samples -= info->nb_samples;
  106. av_free(info);
  107. } else {
  108. info->nb_samples -= samples;
  109. info->pts += samples;
  110. frame_list->nb_samples -= samples;
  111. samples = 0;
  112. }
  113. }
  114. }
  115. }
  116. static int frame_list_add_frame(FrameList *frame_list, int nb_samples, int64_t pts)
  117. {
  118. FrameInfo *info = av_malloc(sizeof(*info));
  119. if (!info)
  120. return AVERROR(ENOMEM);
  121. info->nb_samples = nb_samples;
  122. info->pts = pts;
  123. info->next = NULL;
  124. if (!frame_list->list) {
  125. frame_list->list = info;
  126. frame_list->end = info;
  127. } else {
  128. av_assert0(frame_list->end != NULL);
  129. frame_list->end->next = info;
  130. frame_list->end = info;
  131. }
  132. frame_list->nb_frames++;
  133. frame_list->nb_samples += nb_samples;
  134. return 0;
  135. }
  136. typedef struct MixContext {
  137. const AVClass *class; /**< class for AVOptions */
  138. int nb_inputs; /**< number of inputs */
  139. int active_inputs; /**< number of input currently active */
  140. int duration_mode; /**< mode for determining duration */
  141. float dropout_transition; /**< transition time when an input drops out */
  142. int nb_channels; /**< number of channels */
  143. int sample_rate; /**< sample rate */
  144. AVAudioFifo **fifos; /**< audio fifo for each input */
  145. uint8_t *input_state; /**< current state of each input */
  146. float *input_scale; /**< mixing scale factor for each input */
  147. float scale_norm; /**< normalization factor for all inputs */
  148. int64_t next_pts; /**< calculated pts for next output frame */
  149. FrameList *frame_list; /**< list of frame info for the first input */
  150. } MixContext;
  151. #define OFFSET(x) offsetof(MixContext, x)
  152. #define A AV_OPT_FLAG_AUDIO_PARAM
  153. static const AVOption options[] = {
  154. { "inputs", "Number of inputs.",
  155. OFFSET(nb_inputs), AV_OPT_TYPE_INT, { 2 }, 1, 32, A },
  156. { "duration", "How to determine the end-of-stream.",
  157. OFFSET(duration_mode), AV_OPT_TYPE_INT, { DURATION_LONGEST }, 0, 2, A, "duration" },
  158. { "longest", "Duration of longest input.", 0, AV_OPT_TYPE_CONST, { DURATION_LONGEST }, INT_MIN, INT_MAX, A, "duration" },
  159. { "shortest", "Duration of shortest input.", 0, AV_OPT_TYPE_CONST, { DURATION_SHORTEST }, INT_MIN, INT_MAX, A, "duration" },
  160. { "first", "Duration of first input.", 0, AV_OPT_TYPE_CONST, { DURATION_FIRST }, INT_MIN, INT_MAX, A, "duration" },
  161. { "dropout_transition", "Transition time, in seconds, for volume "
  162. "renormalization when an input stream ends.",
  163. OFFSET(dropout_transition), AV_OPT_TYPE_FLOAT, { 2.0 }, 0, INT_MAX, A },
  164. { NULL },
  165. };
  166. static const AVClass amix_class = {
  167. .class_name = "amix filter",
  168. .item_name = av_default_item_name,
  169. .option = options,
  170. .version = LIBAVUTIL_VERSION_INT,
  171. };
  172. /**
  173. * Update the scaling factors to apply to each input during mixing.
  174. *
  175. * This balances the full volume range between active inputs and handles
  176. * volume transitions when EOF is encountered on an input but mixing continues
  177. * with the remaining inputs.
  178. */
  179. static void calculate_scales(MixContext *s, int nb_samples)
  180. {
  181. int i;
  182. if (s->scale_norm > s->active_inputs) {
  183. s->scale_norm -= nb_samples / (s->dropout_transition * s->sample_rate);
  184. s->scale_norm = FFMAX(s->scale_norm, s->active_inputs);
  185. }
  186. for (i = 0; i < s->nb_inputs; i++) {
  187. if (s->input_state[i] == INPUT_ON)
  188. s->input_scale[i] = 1.0f / s->scale_norm;
  189. else
  190. s->input_scale[i] = 0.0f;
  191. }
  192. }
  193. static int config_output(AVFilterLink *outlink)
  194. {
  195. AVFilterContext *ctx = outlink->src;
  196. MixContext *s = ctx->priv;
  197. int i;
  198. char buf[64];
  199. s->sample_rate = outlink->sample_rate;
  200. outlink->time_base = (AVRational){ 1, outlink->sample_rate };
  201. s->next_pts = AV_NOPTS_VALUE;
  202. s->frame_list = av_mallocz(sizeof(*s->frame_list));
  203. if (!s->frame_list)
  204. return AVERROR(ENOMEM);
  205. s->fifos = av_mallocz(s->nb_inputs * sizeof(*s->fifos));
  206. if (!s->fifos)
  207. return AVERROR(ENOMEM);
  208. s->nb_channels = av_get_channel_layout_nb_channels(outlink->channel_layout);
  209. for (i = 0; i < s->nb_inputs; i++) {
  210. s->fifos[i] = av_audio_fifo_alloc(outlink->format, s->nb_channels, 1024);
  211. if (!s->fifos[i])
  212. return AVERROR(ENOMEM);
  213. }
  214. s->input_state = av_malloc(s->nb_inputs);
  215. if (!s->input_state)
  216. return AVERROR(ENOMEM);
  217. memset(s->input_state, INPUT_ON, s->nb_inputs);
  218. s->active_inputs = s->nb_inputs;
  219. s->input_scale = av_mallocz(s->nb_inputs * sizeof(*s->input_scale));
  220. if (!s->input_scale)
  221. return AVERROR(ENOMEM);
  222. s->scale_norm = s->active_inputs;
  223. calculate_scales(s, 0);
  224. av_get_channel_layout_string(buf, sizeof(buf), -1, outlink->channel_layout);
  225. av_log(ctx, AV_LOG_VERBOSE,
  226. "inputs:%d fmt:%s srate:%"PRId64" cl:%s\n", s->nb_inputs,
  227. av_get_sample_fmt_name(outlink->format), outlink->sample_rate, buf);
  228. return 0;
  229. }
  /* TODO: move optimized version from DSPContext to libavutil */
  /**
   * Multiply-accumulate: add src[k] * mul to dst[k] for the first len
   * elements. Nothing is done when len is zero or negative.
   */
  static void vector_fmac_scalar(float *dst, const float *src, float mul, int len)
  {
      int left;

      for (left = len; left > 0; left--)
          *dst++ += *src++ * mul;
  }
  237. /**
  238. * Read samples from the input FIFOs, mix, and write to the output link.
  239. */
  240. static int output_frame(AVFilterLink *outlink, int nb_samples)
  241. {
  242. AVFilterContext *ctx = outlink->src;
  243. MixContext *s = ctx->priv;
  244. AVFilterBufferRef *out_buf, *in_buf;
  245. int i;
  246. calculate_scales(s, nb_samples);
  247. out_buf = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
  248. if (!out_buf)
  249. return AVERROR(ENOMEM);
  250. in_buf = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
  251. if (!in_buf)
  252. return AVERROR(ENOMEM);
  253. for (i = 0; i < s->nb_inputs; i++) {
  254. if (s->input_state[i] == INPUT_ON) {
  255. av_audio_fifo_read(s->fifos[i], (void **)in_buf->extended_data,
  256. nb_samples);
  257. vector_fmac_scalar((float *)out_buf->extended_data[0],
  258. (float *) in_buf->extended_data[0],
  259. s->input_scale[i], nb_samples * s->nb_channels);
  260. }
  261. }
  262. avfilter_unref_buffer(in_buf);
  263. out_buf->pts = s->next_pts;
  264. if (s->next_pts != AV_NOPTS_VALUE)
  265. s->next_pts += nb_samples;
  266. ff_filter_samples(outlink, out_buf);
  267. return 0;
  268. }
  269. /**
  270. * Returns the smallest number of samples available in the input FIFOs other
  271. * than that of the first input.
  272. */
  273. static int get_available_samples(MixContext *s)
  274. {
  275. int i;
  276. int available_samples = INT_MAX;
  277. av_assert0(s->nb_inputs > 1);
  278. for (i = 1; i < s->nb_inputs; i++) {
  279. int nb_samples;
  280. if (s->input_state[i] == INPUT_OFF)
  281. continue;
  282. nb_samples = av_audio_fifo_size(s->fifos[i]);
  283. available_samples = FFMIN(available_samples, nb_samples);
  284. }
  285. if (available_samples == INT_MAX)
  286. return 0;
  287. return available_samples;
  288. }
  289. /**
  290. * Requests a frame, if needed, from each input link other than the first.
  291. */
  292. static int request_samples(AVFilterContext *ctx, int min_samples)
  293. {
  294. MixContext *s = ctx->priv;
  295. int i, ret;
  296. av_assert0(s->nb_inputs > 1);
  297. for (i = 1; i < s->nb_inputs; i++) {
  298. ret = 0;
  299. if (s->input_state[i] == INPUT_OFF)
  300. continue;
  301. while (!ret && av_audio_fifo_size(s->fifos[i]) < min_samples)
  302. ret = ff_request_frame(ctx->inputs[i]);
  303. if (ret == AVERROR_EOF) {
  304. if (av_audio_fifo_size(s->fifos[i]) == 0) {
  305. s->input_state[i] = INPUT_OFF;
  306. continue;
  307. }
  308. } else if (ret)
  309. return ret;
  310. }
  311. return 0;
  312. }
  313. /**
  314. * Calculates the number of active inputs and determines EOF based on the
  315. * duration option.
  316. *
  317. * @return 0 if mixing should continue, or AVERROR_EOF if mixing should stop.
  318. */
  319. static int calc_active_inputs(MixContext *s)
  320. {
  321. int i;
  322. int active_inputs = 0;
  323. for (i = 0; i < s->nb_inputs; i++)
  324. active_inputs += !!(s->input_state[i] != INPUT_OFF);
  325. s->active_inputs = active_inputs;
  326. if (!active_inputs ||
  327. (s->duration_mode == DURATION_FIRST && s->input_state[0] == INPUT_OFF) ||
  328. (s->duration_mode == DURATION_SHORTEST && active_inputs != s->nb_inputs))
  329. return AVERROR_EOF;
  330. return 0;
  331. }
  332. static int request_frame(AVFilterLink *outlink)
  333. {
  334. AVFilterContext *ctx = outlink->src;
  335. MixContext *s = ctx->priv;
  336. int ret;
  337. int wanted_samples, available_samples;
  338. ret = calc_active_inputs(s);
  339. if (ret < 0)
  340. return ret;
  341. if (s->input_state[0] == INPUT_OFF) {
  342. ret = request_samples(ctx, 1);
  343. if (ret < 0)
  344. return ret;
  345. ret = calc_active_inputs(s);
  346. if (ret < 0)
  347. return ret;
  348. available_samples = get_available_samples(s);
  349. if (!available_samples)
  350. return 0;
  351. return output_frame(outlink, available_samples);
  352. }
  353. if (s->frame_list->nb_frames == 0) {
  354. ret = ff_request_frame(ctx->inputs[0]);
  355. if (ret == AVERROR_EOF) {
  356. s->input_state[0] = INPUT_OFF;
  357. if (s->nb_inputs == 1)
  358. return AVERROR_EOF;
  359. else
  360. return AVERROR(EAGAIN);
  361. } else if (ret)
  362. return ret;
  363. }
  364. av_assert0(s->frame_list->nb_frames > 0);
  365. wanted_samples = frame_list_next_frame_size(s->frame_list);
  366. if (s->active_inputs > 1) {
  367. ret = request_samples(ctx, wanted_samples);
  368. if (ret < 0)
  369. return ret;
  370. ret = calc_active_inputs(s);
  371. if (ret < 0)
  372. return ret;
  373. available_samples = get_available_samples(s);
  374. if (!available_samples)
  375. return 0;
  376. available_samples = FFMIN(available_samples, wanted_samples);
  377. } else {
  378. available_samples = wanted_samples;
  379. }
  380. s->next_pts = frame_list_next_pts(s->frame_list);
  381. frame_list_remove_samples(s->frame_list, available_samples);
  382. return output_frame(outlink, available_samples);
  383. }
  384. static void filter_samples(AVFilterLink *inlink, AVFilterBufferRef *buf)
  385. {
  386. AVFilterContext *ctx = inlink->dst;
  387. MixContext *s = ctx->priv;
  388. AVFilterLink *outlink = ctx->outputs[0];
  389. int i;
  390. for (i = 0; i < ctx->input_count; i++)
  391. if (ctx->inputs[i] == inlink)
  392. break;
  393. if (i >= ctx->input_count) {
  394. av_log(ctx, AV_LOG_ERROR, "unknown input link\n");
  395. return;
  396. }
  397. if (i == 0) {
  398. int64_t pts = av_rescale_q(buf->pts, inlink->time_base,
  399. outlink->time_base);
  400. frame_list_add_frame(s->frame_list, buf->audio->nb_samples, pts);
  401. }
  402. av_audio_fifo_write(s->fifos[i], (void **)buf->extended_data,
  403. buf->audio->nb_samples);
  404. avfilter_unref_buffer(buf);
  405. }
  406. static int init(AVFilterContext *ctx, const char *args, void *opaque)
  407. {
  408. MixContext *s = ctx->priv;
  409. int i, ret;
  410. s->class = &amix_class;
  411. av_opt_set_defaults(s);
  412. if ((ret = av_set_options_string(s, args, "=", ":")) < 0) {
  413. av_log(ctx, AV_LOG_ERROR, "Error parsing options string '%s'.\n", args);
  414. return ret;
  415. }
  416. av_opt_free(s);
  417. for (i = 0; i < s->nb_inputs; i++) {
  418. char name[32];
  419. AVFilterPad pad = { 0 };
  420. snprintf(name, sizeof(name), "input%d", i);
  421. pad.type = AVMEDIA_TYPE_AUDIO;
  422. pad.name = av_strdup(name);
  423. pad.filter_samples = filter_samples;
  424. ff_insert_inpad(ctx, i, &pad);
  425. }
  426. return 0;
  427. }
  428. static void uninit(AVFilterContext *ctx)
  429. {
  430. int i;
  431. MixContext *s = ctx->priv;
  432. if (s->fifos) {
  433. for (i = 0; i < s->nb_inputs; i++)
  434. av_audio_fifo_free(s->fifos[i]);
  435. av_freep(&s->fifos);
  436. }
  437. frame_list_clear(s->frame_list);
  438. av_freep(&s->frame_list);
  439. av_freep(&s->input_state);
  440. av_freep(&s->input_scale);
  441. for (i = 0; i < ctx->input_count; i++)
  442. av_freep(&ctx->input_pads[i].name);
  443. }
  444. static int query_formats(AVFilterContext *ctx)
  445. {
  446. AVFilterFormats *formats = NULL;
  447. ff_add_format(&formats, AV_SAMPLE_FMT_FLT);
  448. ff_set_common_formats(ctx, formats);
  449. ff_set_common_channel_layouts(ctx, ff_all_channel_layouts());
  450. ff_set_common_samplerates(ctx, ff_all_samplerates());
  451. return 0;
  452. }
  453. AVFilter avfilter_af_amix = {
  454. .name = "amix",
  455. .description = NULL_IF_CONFIG_SMALL("Audio mixing."),
  456. .priv_size = sizeof(MixContext),
  457. .init = init,
  458. .uninit = uninit,
  459. .query_formats = query_formats,
  460. .inputs = (const AVFilterPad[]) {{ .name = NULL}},
  461. .outputs = (const AVFilterPad[]) {{ .name = "default",
  462. .type = AVMEDIA_TYPE_AUDIO,
  463. .config_props = config_output,
  464. .request_frame = request_frame },
  465. { .name = NULL}},
  466. };