/*
 * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/**
 * @file
 * tempo scaling audio filter -- an implementation of the WSOLA algorithm
 *
 * Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h
 * from Apprentice Video player by Pavel Koshevoy.
 * https://sourceforge.net/projects/apprenticevideo/
 *
 * An explanation of the SOLA algorithm is available at
 * http://www.surina.net/article/time-and-pitch-scaling.html
 *
 * WSOLA is very similar to SOLA; only one major difference exists between
 * these algorithms.  SOLA shifts audio fragments along the output stream,
 * whereas WSOLA shifts audio fragments along the input stream.
 *
 * The advantage of the WSOLA algorithm is that the overlap region size is
 * always the same, therefore the blending function is constant and
 * can be precomputed.
 */
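
/*
 * Usage sketch (added for clarity, not part of the original header):
 * the filter consumes roughly tempo * N input samples for every N output
 * samples, so e.g.
 *
 *     ffmpeg -i in.wav -af atempo=2.0 out.wav
 *
 * should play the audio twice as fast at the original pitch.  The tempo
 * value is restricted to the [0.5, 100.0] range defined by the
 * YAE_ATEMPO_MIN/YAE_ATEMPO_MAX constants below.
 */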
#include <float.h>
#include "libavcodec/avfft.h"
#include "libavutil/avassert.h"
#include "libavutil/avstring.h"
#include "libavutil/channel_layout.h"
#include "libavutil/eval.h"
#include "libavutil/opt.h"
#include "libavutil/samplefmt.h"
#include "avfilter.h"
#include "audio.h"
#include "internal.h"
/**
 * A fragment of audio waveform
 */
typedef struct AudioFragment {
    // index of the first sample of this fragment in the overall waveform;
    // 0: input sample position
    // 1: output sample position
    int64_t position[2];

    // original packed multi-channel samples:
    uint8_t *data;

    // number of samples in this fragment:
    int nsamples;

    // rDFT transform of the down-mixed mono fragment, used for
    // fast waveform alignment via correlation in frequency domain:
    FFTSample *xdat;
} AudioFragment;

/**
 * Filter state machine states
 */
typedef enum {
    YAE_LOAD_FRAGMENT,
    YAE_ADJUST_POSITION,
    YAE_RELOAD_FRAGMENT,
    YAE_OUTPUT_OVERLAP_ADD,
    YAE_FLUSH_OUTPUT,
} FilterState;
/**
 * Filter state machine
 */
typedef struct ATempoContext {
    const AVClass *class;

    // ring-buffer of input samples, necessary because sometimes the
    // input fragment position may be adjusted backwards:
    uint8_t *buffer;

    // ring-buffer maximum capacity, expressed in sample rate time base:
    int ring;

    // ring-buffer house keeping:
    int size;
    int head;
    int tail;

    // 0: input sample position corresponding to the ring buffer tail
    // 1: output sample position
    int64_t position[2];

    // sample format:
    enum AVSampleFormat format;

    // number of channels:
    int channels;

    // number of bytes to skip from one sample to the next, across multiple
    // channels; stride = (number-of-channels * bits-per-sample-per-channel) / 8
    int stride;

    // fragment window size, power-of-two integer:
    int window;

    // Hann window coefficients, for feathering
    // (blending) the overlapping fragment region:
    float *hann;

    // tempo scaling factor:
    double tempo;

    // a snapshot of previous fragment input and output position values
    // captured when the tempo scale factor was set most recently:
    int64_t origin[2];

    // current/previous fragment ring-buffer:
    AudioFragment frag[2];

    // current fragment index:
    uint64_t nfrag;

    // current state:
    FilterState state;

    // for fast correlation calculation in frequency domain:
    RDFTContext *real_to_complex;
    RDFTContext *complex_to_real;
    FFTSample *correlation;

    // for managing AVFilterPad.request_frame and AVFilterPad.filter_frame
    AVFrame *dst_buffer;
    uint8_t *dst;
    uint8_t *dst_end;
    uint64_t nsamples_in;
    uint64_t nsamples_out;
} ATempoContext;

#define YAE_ATEMPO_MIN 0.5
#define YAE_ATEMPO_MAX 100.0

#define OFFSET(x) offsetof(ATempoContext, x)

static const AVOption atempo_options[] = {
    { "tempo", "set tempo scale factor",
      OFFSET(tempo), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 },
      YAE_ATEMPO_MIN,
      YAE_ATEMPO_MAX,
      AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM },
    { NULL }
};

AVFILTER_DEFINE_CLASS(atempo);
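
// Note (added for clarity): the filter keeps exactly two fragments and
// alternates between frag[0] and frag[1] based on the parity of nfrag;
// incrementing nfrag in yae_advance_to_next_frag() swaps which slot is
// considered "current" and which "previous".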
inline static AudioFragment *yae_curr_frag(ATempoContext *atempo)
{
    return &atempo->frag[atempo->nfrag % 2];
}

inline static AudioFragment *yae_prev_frag(ATempoContext *atempo)
{
    return &atempo->frag[(atempo->nfrag + 1) % 2];
}
/**
 * Reset filter to initial state, do not deallocate existing local buffers.
 */
static void yae_clear(ATempoContext *atempo)
{
    atempo->size = 0;
    atempo->head = 0;
    atempo->tail = 0;

    atempo->nfrag = 0;
    atempo->state = YAE_LOAD_FRAGMENT;

    atempo->position[0] = 0;
    atempo->position[1] = 0;

    atempo->origin[0] = 0;
    atempo->origin[1] = 0;

    atempo->frag[0].position[0] = 0;
    atempo->frag[0].position[1] = 0;
    atempo->frag[0].nsamples    = 0;

    atempo->frag[1].position[0] = 0;
    atempo->frag[1].position[1] = 0;
    atempo->frag[1].nsamples    = 0;

    // shift left position of 1st fragment by half a window
    // so that no re-normalization would be required for
    // the left half of the 1st fragment:
    atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2);
    atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2);

    av_frame_free(&atempo->dst_buffer);
    atempo->dst     = NULL;
    atempo->dst_end = NULL;

    atempo->nsamples_in  = 0;
    atempo->nsamples_out = 0;
}

/**
 * Reset filter to initial state and deallocate all buffers.
 */
static void yae_release_buffers(ATempoContext *atempo)
{
    yae_clear(atempo);

    av_freep(&atempo->frag[0].data);
    av_freep(&atempo->frag[1].data);
    av_freep(&atempo->frag[0].xdat);
    av_freep(&atempo->frag[1].xdat);

    av_freep(&atempo->buffer);
    av_freep(&atempo->hann);
    av_freep(&atempo->correlation);

    av_rdft_end(atempo->real_to_complex);
    atempo->real_to_complex = NULL;

    av_rdft_end(atempo->complex_to_real);
    atempo->complex_to_real = NULL;
}
/* av_realloc is not aligned enough; fortunately, the data does not need to
 * be preserved */
#define RE_MALLOC_OR_FAIL(field, field_size)    \
    do {                                        \
        av_freep(&field);                       \
        field = av_malloc(field_size);          \
        if (!field) {                           \
            yae_release_buffers(atempo);        \
            return AVERROR(ENOMEM);             \
        }                                       \
    } while (0)
/**
 * Prepare filter for processing audio data of given format,
 * sample rate and number of channels.
 */
static int yae_reset(ATempoContext *atempo,
                     enum AVSampleFormat format,
                     int sample_rate,
                     int channels)
{
    const int sample_size = av_get_bytes_per_sample(format);
    uint32_t nlevels = 0;
    uint32_t pot;
    int i;

    atempo->format   = format;
    atempo->channels = channels;
    atempo->stride   = sample_size * channels;

    // pick a segment window size:
    atempo->window = sample_rate / 24;

    // adjust window size to be a power-of-two integer:
    nlevels = av_log2(atempo->window);
    pot = 1 << nlevels;
    av_assert0(pot <= atempo->window);

    if (pot < atempo->window) {
        atempo->window = pot * 2;
        nlevels++;
    }
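
    // Worked example (added for clarity): at 44100 Hz the initial window is
    // 44100 / 24 = 1837 samples; av_log2(1837) == 10 and 1 << 10 == 1024,
    // which is less than 1837, so the window is rounded up to 2048 samples
    // (nlevels == 11).  The rDFT contexts below are created with nlevels + 1
    // levels, i.e. 4096 points -- twice the window -- presumably so the
    // zero-padded correlation avoids circular wrap-around.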
    // initialize audio fragment buffers:
    RE_MALLOC_OR_FAIL(atempo->frag[0].data, atempo->window * atempo->stride);
    RE_MALLOC_OR_FAIL(atempo->frag[1].data, atempo->window * atempo->stride);
    RE_MALLOC_OR_FAIL(atempo->frag[0].xdat, atempo->window * sizeof(FFTComplex));
    RE_MALLOC_OR_FAIL(atempo->frag[1].xdat, atempo->window * sizeof(FFTComplex));

    // initialize rDFT contexts:
    av_rdft_end(atempo->real_to_complex);
    atempo->real_to_complex = NULL;

    av_rdft_end(atempo->complex_to_real);
    atempo->complex_to_real = NULL;

    atempo->real_to_complex = av_rdft_init(nlevels + 1, DFT_R2C);
    if (!atempo->real_to_complex) {
        yae_release_buffers(atempo);
        return AVERROR(ENOMEM);
    }

    atempo->complex_to_real = av_rdft_init(nlevels + 1, IDFT_C2R);
    if (!atempo->complex_to_real) {
        yae_release_buffers(atempo);
        return AVERROR(ENOMEM);
    }

    RE_MALLOC_OR_FAIL(atempo->correlation, atempo->window * sizeof(FFTComplex));

    atempo->ring = atempo->window * 3;
    RE_MALLOC_OR_FAIL(atempo->buffer, atempo->ring * atempo->stride);

    // initialize the Hann window function:
    RE_MALLOC_OR_FAIL(atempo->hann, atempo->window * sizeof(float));
    for (i = 0; i < atempo->window; i++) {
        double t = (double)i / (double)(atempo->window - 1);
        double h = 0.5 * (1.0 - cos(2.0 * M_PI * t));
        atempo->hann[i] = (float)h;
    }

    yae_clear(atempo);
    return 0;
}
static int yae_set_tempo(AVFilterContext *ctx, const char *arg_tempo)
{
    const AudioFragment *prev;
    ATempoContext *atempo = ctx->priv;
    char *tail = NULL;
    double tempo = av_strtod(arg_tempo, &tail);

    if (tail && *tail) {
        av_log(ctx, AV_LOG_ERROR, "Invalid tempo value '%s'\n", arg_tempo);
        return AVERROR(EINVAL);
    }

    if (tempo < YAE_ATEMPO_MIN || tempo > YAE_ATEMPO_MAX) {
        av_log(ctx, AV_LOG_ERROR, "Tempo value %f exceeds [%f, %f] range\n",
               tempo, YAE_ATEMPO_MIN, YAE_ATEMPO_MAX);
        return AVERROR(EINVAL);
    }
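
    // Note (added for clarity): re-anchor the drift origin at the center of
    // the previous fragment, so that yae_adjust_position() measures drift
    // only from the moment the tempo was last changed; without this, a
    // mid-stream tempo change would register as a large spurious drift.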
    prev = yae_prev_frag(atempo);
    atempo->origin[0] = prev->position[0] + atempo->window / 2;
    atempo->origin[1] = prev->position[1] + atempo->window / 2;

    atempo->tempo = tempo;
    return 0;
}
/**
 * A helper macro for initializing complex data buffer with scalar data
 * of a given type.
 */
#define yae_init_xdat(scalar_type, scalar_max)                          \
    do {                                                                \
        const uint8_t *src_end = src +                                  \
            frag->nsamples * atempo->channels * sizeof(scalar_type);    \
                                                                        \
        FFTSample *xdat = frag->xdat;                                   \
        scalar_type tmp;                                                \
                                                                        \
        if (atempo->channels == 1) {                                    \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                *xdat = (FFTSample)tmp;                                 \
            }                                                           \
        } else {                                                        \
            FFTSample s, max, ti, si;                                   \
            int i;                                                      \
                                                                        \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                max = (FFTSample)tmp;                                   \
                s = FFMIN((FFTSample)scalar_max,                        \
                          (FFTSample)fabsf(max));                       \
                                                                        \
                for (i = 1; i < atempo->channels; i++) {                \
                    tmp = *(const scalar_type *)src;                    \
                    src += sizeof(scalar_type);                         \
                                                                        \
                    ti = (FFTSample)tmp;                                \
                    si = FFMIN((FFTSample)scalar_max,                   \
                               (FFTSample)fabsf(ti));                   \
                                                                        \
                    if (s < si) {                                       \
                        s   = si;                                       \
                        max = ti;                                       \
                    }                                                   \
                }                                                       \
                                                                        \
                *xdat = max;                                            \
            }                                                           \
        }                                                               \
    } while (0)
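
// Note (added for clarity): the multi-channel branch above does not average
// the channels; for each sample period it keeps the single channel sample
// with the largest magnitude (magnitudes are clipped to scalar_max for the
// comparison only).  Presumably this preserves transient peaks better than
// a plain mean would, which helps the correlation-based alignment.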
/**
 * Initialize complex data buffer of a given audio fragment
 * with down-mixed mono data of appropriate scalar type.
 */
static void yae_downmix(ATempoContext *atempo, AudioFragment *frag)
{
    // shortcuts:
    const uint8_t *src = frag->data;

    // init complex data buffer used for FFT and correlation:
    memset(frag->xdat, 0, sizeof(FFTComplex) * atempo->window);

    if (atempo->format == AV_SAMPLE_FMT_U8) {
        yae_init_xdat(uint8_t, 127);
    } else if (atempo->format == AV_SAMPLE_FMT_S16) {
        yae_init_xdat(int16_t, 32767);
    } else if (atempo->format == AV_SAMPLE_FMT_S32) {
        yae_init_xdat(int, 2147483647);
    } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
        yae_init_xdat(float, 1);
    } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
        yae_init_xdat(double, 1);
    }
}
/**
 * Populate the internal data buffer on an as-needed basis.
 *
 * @return
 *   0 if requested data was already available or was successfully loaded,
 *   AVERROR(EAGAIN) if more input data is required.
 */
static int yae_load_data(ATempoContext *atempo,
                         const uint8_t **src_ref,
                         const uint8_t *src_end,
                         int64_t stop_here)
{
    // shortcut:
    const uint8_t *src = *src_ref;
    const int read_size = stop_here - atempo->position[0];

    if (stop_here <= atempo->position[0]) {
        return 0;
    }

    // samples are not expected to be skipped, unless tempo is greater than 2:
    av_assert0(read_size <= atempo->ring || atempo->tempo > 2.0);

    while (atempo->position[0] < stop_here && src < src_end) {
        int src_samples = (src_end - src) / atempo->stride;

        // load data piece-wise, in order to avoid complicating the logic:
        int nsamples = FFMIN(read_size, src_samples);
        int na;
        int nb;

        nsamples = FFMIN(nsamples, atempo->ring);
        na = FFMIN(nsamples, atempo->ring - atempo->tail);
        nb = FFMIN(nsamples - na, atempo->ring);

        if (na) {
            uint8_t *a = atempo->buffer + atempo->tail * atempo->stride;
            memcpy(a, src, na * atempo->stride);

            src += na * atempo->stride;
            atempo->position[0] += na;

            atempo->size = FFMIN(atempo->size + na, atempo->ring);
            atempo->tail = (atempo->tail + na) % atempo->ring;
            atempo->head =
                atempo->size < atempo->ring ?
                atempo->tail - atempo->size :
                atempo->tail;
        }

        if (nb) {
            uint8_t *b = atempo->buffer;
            memcpy(b, src, nb * atempo->stride);

            src += nb * atempo->stride;
            atempo->position[0] += nb;

            atempo->size = FFMIN(atempo->size + nb, atempo->ring);
            atempo->tail = (atempo->tail + nb) % atempo->ring;
            atempo->head =
                atempo->size < atempo->ring ?
                atempo->tail - atempo->size :
                atempo->tail;
        }
    }

    // pass back the updated source buffer pointer:
    *src_ref = src;

    // sanity check:
    av_assert0(atempo->position[0] <= stop_here);

    return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN);
}
/**
 * Populate current audio fragment data buffer.
 *
 * @return
 *   0 when the fragment is ready,
 *   AVERROR(EAGAIN) if more input data is required.
 */
static int yae_load_frag(ATempoContext *atempo,
                         const uint8_t **src_ref,
                         const uint8_t *src_end)
{
    // shortcuts:
    AudioFragment *frag = yae_curr_frag(atempo);
    uint8_t *dst;
    int64_t missing, start, zeros;
    uint32_t nsamples;
    const uint8_t *a, *b;
    int i0, i1, n0, n1, na, nb;

    int64_t stop_here = frag->position[0] + atempo->window;
    if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) {
        return AVERROR(EAGAIN);
    }

    // calculate the number of samples we don't have:
    missing =
        stop_here > atempo->position[0] ?
        stop_here - atempo->position[0] : 0;

    nsamples =
        missing < (int64_t)atempo->window ?
        (uint32_t)(atempo->window - missing) : 0;

    // setup the output buffer:
    frag->nsamples = nsamples;
    dst = frag->data;

    start = atempo->position[0] - atempo->size;
    zeros = 0;

    if (frag->position[0] < start) {
        // what we don't have we substitute with zeros:
        zeros = FFMIN(start - frag->position[0], (int64_t)nsamples);
        av_assert0(zeros != nsamples);

        memset(dst, 0, zeros * atempo->stride);
        dst += zeros * atempo->stride;
    }

    if (zeros == nsamples) {
        return 0;
    }

    // get the remaining data from the ring buffer:
    na = (atempo->head < atempo->tail ?
          atempo->tail - atempo->head :
          atempo->ring - atempo->head);

    nb = atempo->head < atempo->tail ? 0 : atempo->tail;

    // sanity check:
    av_assert0(nsamples <= zeros + na + nb);

    a = atempo->buffer + atempo->head * atempo->stride;
    b = atempo->buffer;

    i0 = frag->position[0] + zeros - start;
    i1 = i0 < na ? 0 : i0 - na;

    n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0;
    n1 = nsamples - zeros - n0;

    if (n0) {
        memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride);
        dst += n0 * atempo->stride;
    }

    if (n1) {
        memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride);
    }

    return 0;
}
/**
 * Prepare for loading next audio fragment.
 */
static void yae_advance_to_next_frag(ATempoContext *atempo)
{
    const double fragment_step = atempo->tempo * (double)(atempo->window / 2);

    const AudioFragment *prev;
    AudioFragment       *frag;

    atempo->nfrag++;
    prev = yae_prev_frag(atempo);
    frag = yae_curr_frag(atempo);

    frag->position[0] = prev->position[0] + (int64_t)fragment_step;
    frag->position[1] = prev->position[1] + atempo->window / 2;
    frag->nsamples    = 0;
}
/**
 * Calculate cross-correlation via rDFT.
 *
 * Multiply two vectors of complex numbers (result of real_to_complex rDFT)
 * and transform back via complex_to_real rDFT.  Per the correlation theorem,
 * pointwise multiplication by the complex conjugate in the frequency domain
 * corresponds to cross-correlation in the time domain.
 */
static void yae_xcorr_via_rdft(FFTSample *xcorr,
                               RDFTContext *complex_to_real,
                               const FFTComplex *xa,
                               const FFTComplex *xb,
                               const int window)
{
    FFTComplex *xc = (FFTComplex *)xcorr;
    int i;

    // NOTE: the first element requires special care -- given Y = rDFT(X),
    // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc
    // stores Re(Y[N/2]) in place of Im(Y[0]).
    xc->re = xa->re * xb->re;
    xc->im = xa->im * xb->im;
    xa++;
    xb++;
    xc++;

    for (i = 1; i < window; i++, xa++, xb++, xc++) {
        xc->re = (xa->re * xb->re + xa->im * xb->im);
        xc->im = (xa->im * xb->re - xa->re * xb->im);
    }

    // apply inverse rDFT:
    av_rdft_calc(complex_to_real, xcorr);
}
/**
 * Calculate alignment offset for given fragment
 * relative to the previous fragment.
 *
 * @return alignment offset of current fragment relative to previous.
 */
static int yae_align(AudioFragment *frag,
                     const AudioFragment *prev,
                     const int window,
                     const int delta_max,
                     const int drift,
                     FFTSample *correlation,
                     RDFTContext *complex_to_real)
{
    int best_offset = -drift;
    FFTSample best_metric = -FLT_MAX;
    FFTSample *xcorr;

    int i0;
    int i1;
    int i;

    yae_xcorr_via_rdft(correlation,
                       complex_to_real,
                       (const FFTComplex *)prev->xdat,
                       (const FFTComplex *)frag->xdat,
                       window);

    // identify search window boundaries:
    i0 = FFMAX(window / 2 - delta_max - drift, 0);
    i0 = FFMIN(i0, window);

    i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16);
    i1 = FFMAX(i1, 0);

    // identify cross-correlation peaks within search window:
    xcorr = correlation + i0;

    for (i = i0; i < i1; i++, xcorr++) {
        FFTSample metric = *xcorr;

        // weigh the metric; the (i - i0) * (i1 - i) factor tapers to zero
        // at the search window boundaries, favoring peaks near its center:
        FFTSample drifti = (FFTSample)(drift + i);
        metric *= drifti * (FFTSample)(i - i0) * (FFTSample)(i1 - i);

        if (metric > best_metric) {
            best_metric = metric;
            best_offset = i - window / 2;
        }
    }

    return best_offset;
}
/**
 * Adjust current fragment position for better alignment
 * with previous fragment.
 *
 * @return alignment correction.
 */
static int yae_adjust_position(ATempoContext *atempo)
{
    const AudioFragment *prev = yae_prev_frag(atempo);
    AudioFragment       *frag = yae_curr_frag(atempo);

    const double prev_output_position =
        (double)(prev->position[1] - atempo->origin[1] + atempo->window / 2) *
        atempo->tempo;

    const double ideal_output_position =
        (double)(prev->position[0] - atempo->origin[0] + atempo->window / 2);

    const int drift = (int)(prev_output_position - ideal_output_position);

    const int delta_max  = atempo->window / 2;
    const int correction = yae_align(frag,
                                     prev,
                                     atempo->window,
                                     delta_max,
                                     drift,
                                     atempo->correlation,
                                     atempo->complex_to_real);

    if (correction) {
        // adjust fragment position:
        frag->position[0] -= correction;

        // clear so that the fragment can be reloaded:
        frag->nsamples = 0;
    }

    return correction;
}
/**
 * A helper macro for blending the overlap region of previous
 * and current audio fragment.
 */
#define yae_blend(scalar_type)                                          \
    do {                                                                \
        const scalar_type *aaa = (const scalar_type *)a;                \
        const scalar_type *bbb = (const scalar_type *)b;                \
                                                                        \
        scalar_type *out     = (scalar_type *)dst;                      \
        scalar_type *out_end = (scalar_type *)dst_end;                  \
        int64_t i;                                                      \
                                                                        \
        for (i = 0; i < overlap && out < out_end;                       \
             i++, atempo->position[1]++, wa++, wb++) {                  \
            float w0 = *wa;                                             \
            float w1 = *wb;                                             \
            int j;                                                      \
                                                                        \
            for (j = 0; j < atempo->channels;                           \
                 j++, aaa++, bbb++, out++) {                            \
                float t0 = (float)*aaa;                                 \
                float t1 = (float)*bbb;                                 \
                                                                        \
                *out =                                                  \
                    frag->position[0] + i < 0 ?                         \
                    *aaa :                                              \
                    (scalar_type)(t0 * w0 + t1 * w1);                   \
            }                                                           \
        }                                                               \
        dst = (uint8_t *)out;                                           \
    } while (0)
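
// Note (added for clarity): wa indexes into the second half of the Hann
// window (the fade-out of the previous fragment) and wb into the first half
// (the fade-in of the current fragment).  Because fragments always overlap
// by exactly window / 2 samples, the two weights sum to approximately one
// at every position, which is why no per-sample renormalization is needed.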
/**
 * Blend the overlap region of previous and current audio fragment
 * and output the results to the given destination buffer.
 *
 * @return
 *   0 if the overlap region was completely stored in the dst buffer,
 *   AVERROR(EAGAIN) if more destination buffer space is required.
 */
static int yae_overlap_add(ATempoContext *atempo,
                           uint8_t **dst_ref,
                           uint8_t *dst_end)
{
    // shortcuts:
    const AudioFragment *prev = yae_prev_frag(atempo);
    const AudioFragment *frag = yae_curr_frag(atempo);

    const int64_t start_here = FFMAX(atempo->position[1],
                                     frag->position[1]);

    const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples,
                                    frag->position[1] + frag->nsamples);

    const int64_t overlap = stop_here - start_here;

    const int64_t ia = start_here - prev->position[1];
    const int64_t ib = start_here - frag->position[1];

    const float *wa = atempo->hann + ia;
    const float *wb = atempo->hann + ib;

    const uint8_t *a = prev->data + ia * atempo->stride;
    const uint8_t *b = frag->data + ib * atempo->stride;

    uint8_t *dst = *dst_ref;

    av_assert0(start_here <= stop_here &&
               frag->position[1] <= start_here &&
               overlap <= frag->nsamples);

    if (atempo->format == AV_SAMPLE_FMT_U8) {
        yae_blend(uint8_t);
    } else if (atempo->format == AV_SAMPLE_FMT_S16) {
        yae_blend(int16_t);
    } else if (atempo->format == AV_SAMPLE_FMT_S32) {
        yae_blend(int);
    } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
        yae_blend(float);
    } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
        yae_blend(double);
    }

    // pass back the updated destination buffer pointer:
    *dst_ref = dst;

    return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
}
/**
 * Feed as much data to the filter as it is able to consume
 * and receive as much processed data in the destination buffer
 * as it is able to produce or store.
 */
static void
yae_apply(ATempoContext *atempo,
          const uint8_t **src_ref,
          const uint8_t *src_end,
          uint8_t **dst_ref,
          uint8_t *dst_end)
{
    while (1) {
        if (atempo->state == YAE_LOAD_FRAGMENT) {
            // load additional data for the current fragment:
            if (yae_load_frag(atempo, src_ref, src_end) != 0) {
                break;
            }

            // down-mix to mono:
            yae_downmix(atempo, yae_curr_frag(atempo));

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);

            // must load the second fragment before alignment can start:
            if (!atempo->nfrag) {
                yae_advance_to_next_frag(atempo);
                continue;
            }

            atempo->state = YAE_ADJUST_POSITION;
        }

        if (atempo->state == YAE_ADJUST_POSITION) {
            // adjust position for better alignment:
            if (yae_adjust_position(atempo)) {
                // reload the fragment at the corrected position, so that the
                // Hann window blending would not require normalization:
                atempo->state = YAE_RELOAD_FRAGMENT;
            } else {
                atempo->state = YAE_OUTPUT_OVERLAP_ADD;
            }
        }

        if (atempo->state == YAE_RELOAD_FRAGMENT) {
            // load additional data if necessary due to position adjustment:
            if (yae_load_frag(atempo, src_ref, src_end) != 0) {
                break;
            }

            // down-mix to mono:
            yae_downmix(atempo, yae_curr_frag(atempo));

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);

            atempo->state = YAE_OUTPUT_OVERLAP_ADD;
        }

        if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) {
            // overlap-add and output the result:
            if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
                break;
            }

            // advance to the next fragment, repeat:
            yae_advance_to_next_frag(atempo);
            atempo->state = YAE_LOAD_FRAGMENT;
        }
    }
}
/**
 * Flush any buffered data from the filter.
 *
 * @return
 *   0 if all data was completely stored in the dst buffer,
 *   AVERROR(EAGAIN) if more destination buffer space is required.
 */
static int yae_flush(ATempoContext *atempo,
                     uint8_t **dst_ref,
                     uint8_t *dst_end)
{
    AudioFragment *frag = yae_curr_frag(atempo);
    int64_t overlap_end;
    int64_t start_here;
    int64_t stop_here;
    int64_t offset;

    const uint8_t *src;
    uint8_t *dst;

    int src_size;
    int dst_size;
    int nbytes;

    atempo->state = YAE_FLUSH_OUTPUT;

    if (!atempo->nfrag) {
        // there is nothing to flush:
        return 0;
    }

    if (atempo->position[0] == frag->position[0] + frag->nsamples &&
        atempo->position[1] == frag->position[1] + frag->nsamples) {
        // the current fragment is already flushed:
        return 0;
    }

    if (frag->position[0] + frag->nsamples < atempo->position[0]) {
        // finish loading the current (possibly partial) fragment:
        yae_load_frag(atempo, NULL, NULL);

        if (atempo->nfrag) {
            // down-mix to mono:
            yae_downmix(atempo, frag);

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, frag->xdat);

            // align current fragment to previous fragment:
            if (yae_adjust_position(atempo)) {
                // reload the current fragment due to adjusted position:
                yae_load_frag(atempo, NULL, NULL);
            }
        }
    }

    // flush the overlap region:
    overlap_end = frag->position[1] + FFMIN(atempo->window / 2,
                                            frag->nsamples);

    while (atempo->position[1] < overlap_end) {
        if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
            return AVERROR(EAGAIN);
        }
    }

    // check whether all of the input samples have been consumed:
    if (frag->position[0] + frag->nsamples < atempo->position[0]) {
        yae_advance_to_next_frag(atempo);
        return AVERROR(EAGAIN);
    }

    // flush the remainder of the current fragment:
    start_here = FFMAX(atempo->position[1], overlap_end);
    stop_here  = frag->position[1] + frag->nsamples;
    offset     = start_here - frag->position[1];
    av_assert0(start_here <= stop_here && frag->position[1] <= start_here);

    src = frag->data + offset * atempo->stride;
    dst = (uint8_t *)*dst_ref;

    src_size = (int)(stop_here - start_here) * atempo->stride;
    dst_size = dst_end - dst;
    nbytes = FFMIN(src_size, dst_size);

    memcpy(dst, src, nbytes);
    dst += nbytes;

    atempo->position[1] += (nbytes / atempo->stride);

    // pass back the updated destination buffer pointer:
    *dst_ref = (uint8_t *)dst;

    return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
}
static av_cold int init(AVFilterContext *ctx)
{
    ATempoContext *atempo = ctx->priv;
    atempo->format = AV_SAMPLE_FMT_NONE;
    atempo->state  = YAE_LOAD_FRAGMENT;
    return 0;
}

static av_cold void uninit(AVFilterContext *ctx)
{
    ATempoContext *atempo = ctx->priv;
    yae_release_buffers(atempo);
}
static int query_formats(AVFilterContext *ctx)
{
    AVFilterChannelLayouts *layouts = NULL;
    AVFilterFormats        *formats = NULL;

    // WSOLA necessitates an internal sliding window ring buffer
    // for the incoming audio stream.
    //
    // Planar sample formats are too cumbersome to store in a ring buffer,
    // therefore planar sample formats are not supported.
    //
    static const enum AVSampleFormat sample_fmts[] = {
        AV_SAMPLE_FMT_U8,
        AV_SAMPLE_FMT_S16,
        AV_SAMPLE_FMT_S32,
        AV_SAMPLE_FMT_FLT,
        AV_SAMPLE_FMT_DBL,
        AV_SAMPLE_FMT_NONE
    };
    int ret;

    layouts = ff_all_channel_counts();
    if (!layouts) {
        return AVERROR(ENOMEM);
    }
    ret = ff_set_common_channel_layouts(ctx, layouts);
    if (ret < 0)
        return ret;

    formats = ff_make_format_list(sample_fmts);
    if (!formats) {
        return AVERROR(ENOMEM);
    }
    ret = ff_set_common_formats(ctx, formats);
    if (ret < 0)
        return ret;

    formats = ff_all_samplerates();
    if (!formats) {
        return AVERROR(ENOMEM);
    }
    return ff_set_common_samplerates(ctx, formats);
}
static int config_props(AVFilterLink *inlink)
{
    AVFilterContext  *ctx = inlink->dst;
    ATempoContext *atempo = ctx->priv;

    enum AVSampleFormat format = inlink->format;
    int sample_rate = (int)inlink->sample_rate;

    return yae_reset(atempo, format, sample_rate, inlink->channels);
}
static int push_samples(ATempoContext *atempo,
                        AVFilterLink *outlink,
                        int n_out)
{
    int ret;

    atempo->dst_buffer->sample_rate = outlink->sample_rate;
    atempo->dst_buffer->nb_samples  = n_out;

    // adjust the PTS: derive it from the cumulative number of output
    // samples, rescaled from the 1/sample_rate time base to the output
    // link time base:
    atempo->dst_buffer->pts =
        av_rescale_q(atempo->nsamples_out,
                     (AVRational){ 1, outlink->sample_rate },
                     outlink->time_base);

    ret = ff_filter_frame(outlink, atempo->dst_buffer);
    atempo->dst_buffer = NULL;
    atempo->dst        = NULL;
    atempo->dst_end    = NULL;
    if (ret < 0)
        return ret;

    atempo->nsamples_out += n_out;
    return 0;
}
static int filter_frame(AVFilterLink *inlink, AVFrame *src_buffer)
{
    AVFilterContext  *ctx = inlink->dst;
    ATempoContext *atempo = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];

    int ret = 0;
    int n_in = src_buffer->nb_samples;
    // estimate the number of output samples this input frame will produce:
    int n_out = (int)(0.5 + ((double)n_in) / atempo->tempo);

    const uint8_t *src = src_buffer->data[0];
    const uint8_t *src_end = src + n_in * atempo->stride;

    while (src < src_end) {
        if (!atempo->dst_buffer) {
            atempo->dst_buffer = ff_get_audio_buffer(outlink, n_out);
            if (!atempo->dst_buffer) {
                av_frame_free(&src_buffer);
                return AVERROR(ENOMEM);
            }
            av_frame_copy_props(atempo->dst_buffer, src_buffer);

            atempo->dst = atempo->dst_buffer->data[0];
            atempo->dst_end = atempo->dst + n_out * atempo->stride;
        }

        yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end);

        if (atempo->dst == atempo->dst_end) {
            int n_samples = ((atempo->dst - atempo->dst_buffer->data[0]) /
                             atempo->stride);
            ret = push_samples(atempo, outlink, n_samples);
            if (ret < 0)
                goto end;
        }
    }

    atempo->nsamples_in += n_in;
end:
    av_frame_free(&src_buffer);
    return ret;
}
static int request_frame(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    ATempoContext *atempo = ctx->priv;
    int ret;

    ret = ff_request_frame(ctx->inputs[0]);

    if (ret == AVERROR_EOF) {
        // flush the filter:
        int n_max = atempo->ring;
        int n_out;
        int err = AVERROR(EAGAIN);

        while (err == AVERROR(EAGAIN)) {
            if (!atempo->dst_buffer) {
                atempo->dst_buffer = ff_get_audio_buffer(outlink, n_max);
                if (!atempo->dst_buffer)
                    return AVERROR(ENOMEM);

                atempo->dst = atempo->dst_buffer->data[0];
                atempo->dst_end = atempo->dst + n_max * atempo->stride;
            }

            err = yae_flush(atempo, &atempo->dst, atempo->dst_end);

            n_out = ((atempo->dst - atempo->dst_buffer->data[0]) /
                     atempo->stride);

            if (n_out) {
                ret = push_samples(atempo, outlink, n_out);
                if (ret < 0)
                    return ret;
            }
        }

        av_frame_free(&atempo->dst_buffer);
        atempo->dst     = NULL;
        atempo->dst_end = NULL;

        return AVERROR_EOF;
    }

    return ret;
}
static int process_command(AVFilterContext *ctx,
                           const char *cmd,
                           const char *arg,
                           char *res,
                           int res_len,
                           int flags)
{
    return !strcmp(cmd, "tempo") ? yae_set_tempo(ctx, arg) : AVERROR(ENOSYS);
}
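
// Note (added for clarity): the "tempo" command allows changing the tempo
// while the filter is running, e.g. via the asendcmd filter or any other
// mechanism that ends up calling avfilter_graph_send_command().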
static const AVFilterPad atempo_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_AUDIO,
        .filter_frame = filter_frame,
        .config_props = config_props,
    },
    { NULL }
};

static const AVFilterPad atempo_outputs[] = {
    {
        .name          = "default",
        .request_frame = request_frame,
        .type          = AVMEDIA_TYPE_AUDIO,
    },
    { NULL }
};
AVFilter ff_af_atempo = {
    .name            = "atempo",
    .description     = NULL_IF_CONFIG_SMALL("Adjust audio tempo."),
    .init            = init,
    .uninit          = uninit,
    .query_formats   = query_formats,
    .process_command = process_command,
    .priv_size       = sizeof(ATempoContext),
    .priv_class      = &atempo_class,
    .inputs          = atempo_inputs,
    .outputs         = atempo_outputs,
};