/*
 * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * tempo scaling audio filter -- an implementation of the WSOLA algorithm
 *
 * Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h
 * from Apprentice Video player by Pavel Koshevoy.
 * https://sourceforge.net/projects/apprenticevideo/
 *
 * An explanation of the SOLA algorithm is available at
 * http://www.surina.net/article/time-and-pitch-scaling.html
 *
 * WSOLA is very similar to SOLA; the one major difference between the two
 * is that SOLA shifts audio fragments along the output stream,
 * whereas WSOLA shifts audio fragments along the input stream.
 *
 * The advantage of the WSOLA algorithm is that the overlap region size is
 * always the same, therefore the blending function is constant and
 * can be precomputed.
 */
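
/*
 * For a concrete sense of the numbers (illustrative only, assuming
 * 44.1 kHz input and the window size picked in yae_reset below):
 * fragments are 2048 samples long and consecutive fragments overlap by
 * window/2 = 1024 samples.  Per fragment, the output position always
 * advances by exactly window/2, while the input position advances by
 * tempo * window/2 -- e.g. 1536 samples at tempo=1.5 -- which is how
 * the stream is shortened or stretched without changing pitch.
 */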
#include <float.h>
#include "libavcodec/avfft.h"
#include "libavutil/avassert.h"
#include "libavutil/avstring.h"
#include "libavutil/channel_layout.h"
#include "libavutil/eval.h"
#include "libavutil/opt.h"
#include "libavutil/samplefmt.h"
#include "avfilter.h"
#include "audio.h"
#include "internal.h"

/**
 * A fragment of audio waveform
 */
typedef struct {
    // index of the first sample of this fragment in the overall waveform;
    // 0: input sample position
    // 1: output sample position
    int64_t position[2];

    // original packed multi-channel samples:
    uint8_t *data;

    // number of samples in this fragment:
    int nsamples;

    // rDFT transform of the down-mixed mono fragment, used for
    // fast waveform alignment via correlation in frequency domain:
    FFTSample *xdat;
} AudioFragment;

/**
 * Filter state machine states
 */
typedef enum {
    YAE_LOAD_FRAGMENT,
    YAE_ADJUST_POSITION,
    YAE_RELOAD_FRAGMENT,
    YAE_OUTPUT_OVERLAP_ADD,
    YAE_FLUSH_OUTPUT,
} FilterState;
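
// The states above correspond to one pass of the yae_apply loop:
// LOAD_FRAGMENT fills the current fragment from input, ADJUST_POSITION
// aligns it against the previous fragment (switching to RELOAD_FRAGMENT
// when the position had to be corrected), OUTPUT_OVERLAP_ADD blends and
// emits the overlap region, and FLUSH_OUTPUT drains whatever remains
// at EOF (see yae_flush).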

/**
 * Filter state machine
 */
typedef struct {
    const AVClass *class;

    // ring-buffer of input samples, necessary because sometimes the
    // input fragment position may be adjusted backwards:
    uint8_t *buffer;

    // ring-buffer maximum capacity, expressed in sample rate time base:
    int ring;

    // ring-buffer housekeeping:
    int size;
    int head;
    int tail;

    // 0: input sample position corresponding to the ring buffer tail
    // 1: output sample position
    int64_t position[2];

    // sample format:
    enum AVSampleFormat format;

    // number of channels:
    int channels;

    // number of bytes to skip from one sample to the next, across
    // multiple channels;
    // stride = (number-of-channels * bits-per-sample-per-channel) / 8
    int stride;

    // fragment window size, power-of-two integer:
    int window;

    // Hann window coefficients, for feathering
    // (blending) the overlapping fragment region:
    float *hann;

    // tempo scaling factor:
    double tempo;

    // cumulative alignment drift:
    int drift;

    // current/previous fragment ring-buffer:
    AudioFragment frag[2];

    // current fragment index:
    uint64_t nfrag;

    // current state:
    FilterState state;

    // for fast correlation calculation in frequency domain:
    RDFTContext *real_to_complex;
    RDFTContext *complex_to_real;
    FFTSample *correlation;

    // for managing AVFilterPad.request_frame and AVFilterPad.filter_frame
    AVFrame *dst_buffer;
    uint8_t *dst;
    uint8_t *dst_end;
    uint64_t nsamples_in;
    uint64_t nsamples_out;
} ATempoContext;

#define OFFSET(x) offsetof(ATempoContext, x)

static const AVOption atempo_options[] = {
    { "tempo", "set tempo scale factor",
      OFFSET(tempo), AV_OPT_TYPE_DOUBLE, { .dbl = 1.0 }, 0.5, 2.0,
      AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM },
    { NULL }
};

AVFILTER_DEFINE_CLASS(atempo);
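
/*
 * Example usage, for illustration:
 *
 *   ffmpeg -i input.wav -filter:a "atempo=1.25" output.wav
 *
 * The tempo option is limited to the [0.5, 2.0] range; larger changes
 * can be approximated by chaining instances, e.g. "atempo=2.0,atempo=2.0"
 * for an overall 4x speed-up.
 */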

/**
 * Reset filter to initial state, do not deallocate existing local buffers.
 */
static void yae_clear(ATempoContext *atempo)
{
    atempo->size = 0;
    atempo->head = 0;
    atempo->tail = 0;

    atempo->drift = 0;
    atempo->nfrag = 0;
    atempo->state = YAE_LOAD_FRAGMENT;

    atempo->position[0] = 0;
    atempo->position[1] = 0;

    atempo->frag[0].position[0] = 0;
    atempo->frag[0].position[1] = 0;
    atempo->frag[0].nsamples    = 0;

    atempo->frag[1].position[0] = 0;
    atempo->frag[1].position[1] = 0;
    atempo->frag[1].nsamples    = 0;

    // shift left position of 1st fragment by half a window
    // so that no re-normalization would be required for
    // the left half of the 1st fragment:
    atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2);
    atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2);

    av_frame_free(&atempo->dst_buffer);
    atempo->dst     = NULL;
    atempo->dst_end = NULL;

    atempo->nsamples_in  = 0;
    atempo->nsamples_out = 0;
}

/**
 * Reset filter to initial state and deallocate all buffers.
 */
static void yae_release_buffers(ATempoContext *atempo)
{
    yae_clear(atempo);

    av_freep(&atempo->frag[0].data);
    av_freep(&atempo->frag[1].data);
    av_freep(&atempo->frag[0].xdat);
    av_freep(&atempo->frag[1].xdat);

    av_freep(&atempo->buffer);
    av_freep(&atempo->hann);
    av_freep(&atempo->correlation);

    av_rdft_end(atempo->real_to_complex);
    atempo->real_to_complex = NULL;

    av_rdft_end(atempo->complex_to_real);
    atempo->complex_to_real = NULL;
}

/* av_realloc is not aligned enough; fortunately, the data does not need to
 * be preserved */
#define RE_MALLOC_OR_FAIL(field, field_size)    \
    do {                                        \
        av_freep(&field);                       \
        field = av_malloc(field_size);          \
        if (!field) {                           \
            yae_release_buffers(atempo);        \
            return AVERROR(ENOMEM);             \
        }                                       \
    } while (0)

/**
 * Prepare filter for processing audio data of given format,
 * sample rate and number of channels.
 */
static int yae_reset(ATempoContext *atempo,
                     enum AVSampleFormat format,
                     int sample_rate,
                     int channels)
{
    const int sample_size = av_get_bytes_per_sample(format);
    uint32_t nlevels = 0;
    uint32_t pot;
    int i;

    atempo->format   = format;
    atempo->channels = channels;
    atempo->stride   = sample_size * channels;

    // pick a segment window size:
    atempo->window = sample_rate / 24;

    // adjust window size to be a power-of-two integer:
    nlevels = av_log2(atempo->window);
    pot = 1 << nlevels;
    av_assert0(pot <= atempo->window);

    if (pot < atempo->window) {
        atempo->window = pot * 2;
        nlevels++;
    }
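
    // For example (illustrative, assuming 44100 Hz input):
    // sample_rate / 24 = 1837, av_log2(1837) = 10, so pot = 1024 < 1837
    // and the window is rounded up to 2048 samples with nlevels = 11.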

    // initialize audio fragment buffers:
    RE_MALLOC_OR_FAIL(atempo->frag[0].data, atempo->window * atempo->stride);
    RE_MALLOC_OR_FAIL(atempo->frag[1].data, atempo->window * atempo->stride);
    RE_MALLOC_OR_FAIL(atempo->frag[0].xdat, atempo->window * sizeof(FFTComplex));
    RE_MALLOC_OR_FAIL(atempo->frag[1].xdat, atempo->window * sizeof(FFTComplex));

    // initialize rDFT contexts:
    av_rdft_end(atempo->real_to_complex);
    atempo->real_to_complex = NULL;

    av_rdft_end(atempo->complex_to_real);
    atempo->complex_to_real = NULL;

    atempo->real_to_complex = av_rdft_init(nlevels + 1, DFT_R2C);
    if (!atempo->real_to_complex) {
        yae_release_buffers(atempo);
        return AVERROR(ENOMEM);
    }

    atempo->complex_to_real = av_rdft_init(nlevels + 1, IDFT_C2R);
    if (!atempo->complex_to_real) {
        yae_release_buffers(atempo);
        return AVERROR(ENOMEM);
    }

    RE_MALLOC_OR_FAIL(atempo->correlation, atempo->window * sizeof(FFTComplex));

    atempo->ring = atempo->window * 3;
    RE_MALLOC_OR_FAIL(atempo->buffer, atempo->ring * atempo->stride);

    // initialize the Hann window function:
    RE_MALLOC_OR_FAIL(atempo->hann, atempo->window * sizeof(float));

    for (i = 0; i < atempo->window; i++) {
        double t = (double)i / (double)(atempo->window - 1);
        double h = 0.5 * (1.0 - cos(2.0 * M_PI * t));
        atempo->hann[i] = (float)h;
    }
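
    // NOTE: the Hann window h(t) = 0.5 * (1 - cos(2*pi*t)) rises from 0
    // to 1 and falls back to 0; two copies offset by half a window sum
    // to 1 (up to the discrete sampling of the window), which is what
    // lets the overlap-add blend run without per-sample re-normalization.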

    yae_clear(atempo);
    return 0;
}

static int yae_set_tempo(AVFilterContext *ctx, const char *arg_tempo)
{
    ATempoContext *atempo = ctx->priv;
    char *tail = NULL;
    double tempo = av_strtod(arg_tempo, &tail);

    if (tail && *tail) {
        av_log(ctx, AV_LOG_ERROR, "Invalid tempo value '%s'\n", arg_tempo);
        return AVERROR(EINVAL);
    }

    if (tempo < 0.5 || tempo > 2.0) {
        av_log(ctx, AV_LOG_ERROR, "Tempo value %f is outside [0.5, 2.0] range\n",
               tempo);
        return AVERROR(EINVAL);
    }

    atempo->tempo = tempo;
    return 0;
}

inline static AudioFragment *yae_curr_frag(ATempoContext *atempo)
{
    return &atempo->frag[atempo->nfrag % 2];
}

inline static AudioFragment *yae_prev_frag(ATempoContext *atempo)
{
    return &atempo->frag[(atempo->nfrag + 1) % 2];
}

/**
 * A helper macro for initializing complex data buffer with scalar data
 * of a given type.
 */
#define yae_init_xdat(scalar_type, scalar_max)                          \
    do {                                                                \
        const uint8_t *src_end = src +                                  \
            frag->nsamples * atempo->channels * sizeof(scalar_type);    \
                                                                        \
        FFTSample *xdat = frag->xdat;                                   \
        scalar_type tmp;                                                \
                                                                        \
        if (atempo->channels == 1) {                                    \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                *xdat = (FFTSample)tmp;                                 \
            }                                                           \
        } else {                                                        \
            FFTSample s, max, ti, si;                                   \
            int i;                                                      \
                                                                        \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                max = (FFTSample)tmp;                                   \
                s = FFMIN((FFTSample)scalar_max,                        \
                          (FFTSample)fabsf(max));                       \
                                                                        \
                for (i = 1; i < atempo->channels; i++) {                \
                    tmp = *(const scalar_type *)src;                    \
                    src += sizeof(scalar_type);                         \
                                                                        \
                    ti = (FFTSample)tmp;                                \
                    si = FFMIN((FFTSample)scalar_max,                   \
                               (FFTSample)fabsf(ti));                   \
                                                                        \
                    if (s < si) {                                       \
                        s   = si;                                       \
                        max = ti;                                       \
                    }                                                   \
                }                                                       \
                                                                        \
                *xdat = max;                                            \
            }                                                           \
        }                                                               \
    } while (0)
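
// Note that the multi-channel branch above does not average the channels;
// for each sample period it keeps the single signed value with the largest
// magnitude across all channels.  This preserves transients in the mono
// down-mix that drives the alignment correlation.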

/**
 * Initialize complex data buffer of a given audio fragment
 * with down-mixed mono data of appropriate scalar type.
 */
static void yae_downmix(ATempoContext *atempo, AudioFragment *frag)
{
    // shortcuts:
    const uint8_t *src = frag->data;

    // init complex data buffer used for FFT and correlation:
    memset(frag->xdat, 0, sizeof(FFTComplex) * atempo->window);

    if (atempo->format == AV_SAMPLE_FMT_U8) {
        yae_init_xdat(uint8_t, 127);
    } else if (atempo->format == AV_SAMPLE_FMT_S16) {
        yae_init_xdat(int16_t, 32767);
    } else if (atempo->format == AV_SAMPLE_FMT_S32) {
        yae_init_xdat(int, 2147483647);
    } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
        yae_init_xdat(float, 1);
    } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
        yae_init_xdat(double, 1);
    }
}

/**
 * Populate the internal data buffer on an as-needed basis.
 *
 * @return
 *   0 if requested data was already available or was successfully loaded,
 *   AVERROR(EAGAIN) if more input data is required.
 */
static int yae_load_data(ATempoContext *atempo,
                         const uint8_t **src_ref,
                         const uint8_t *src_end,
                         int64_t stop_here)
{
    // shortcut:
    const uint8_t *src = *src_ref;
    const int read_size = stop_here - atempo->position[0];

    if (stop_here <= atempo->position[0]) {
        return 0;
    }

    // samples are not expected to be skipped:
    av_assert0(read_size <= atempo->ring);

    while (atempo->position[0] < stop_here && src < src_end) {
        int src_samples = (src_end - src) / atempo->stride;

        // load data piece-wise, in order to avoid complicating the logic:
        int nsamples = FFMIN(read_size, src_samples);
        int na;
        int nb;

        nsamples = FFMIN(nsamples, atempo->ring);
        na = FFMIN(nsamples, atempo->ring - atempo->tail);
        nb = FFMIN(nsamples - na, atempo->ring);

        if (na) {
            uint8_t *a = atempo->buffer + atempo->tail * atempo->stride;
            memcpy(a, src, na * atempo->stride);

            src += na * atempo->stride;
            atempo->position[0] += na;

            atempo->size = FFMIN(atempo->size + na, atempo->ring);
            atempo->tail = (atempo->tail + na) % atempo->ring;
            atempo->head =
                atempo->size < atempo->ring ?
                atempo->tail - atempo->size :
                atempo->tail;
        }

        if (nb) {
            uint8_t *b = atempo->buffer;
            memcpy(b, src, nb * atempo->stride);

            src += nb * atempo->stride;
            atempo->position[0] += nb;

            atempo->size = FFMIN(atempo->size + nb, atempo->ring);
            atempo->tail = (atempo->tail + nb) % atempo->ring;
            atempo->head =
                atempo->size < atempo->ring ?
                atempo->tail - atempo->size :
                atempo->tail;
        }
    }

    // pass back the updated source buffer pointer:
    *src_ref = src;

    // sanity check:
    av_assert0(atempo->position[0] <= stop_here);

    return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN);
}

/**
 * Populate current audio fragment data buffer.
 *
 * @return
 *   0 when the fragment is ready,
 *   AVERROR(EAGAIN) if more input data is required.
 */
static int yae_load_frag(ATempoContext *atempo,
                         const uint8_t **src_ref,
                         const uint8_t *src_end)
{
    // shortcuts:
    AudioFragment *frag = yae_curr_frag(atempo);
    uint8_t *dst;
    int64_t missing, start, zeros;
    uint32_t nsamples;
    const uint8_t *a, *b;
    int i0, i1, n0, n1, na, nb;

    int64_t stop_here = frag->position[0] + atempo->window;
    if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) {
        return AVERROR(EAGAIN);
    }

    // calculate the number of samples we don't have:
    missing =
        stop_here > atempo->position[0] ?
        stop_here - atempo->position[0] : 0;

    nsamples =
        missing < (int64_t)atempo->window ?
        (uint32_t)(atempo->window - missing) : 0;

    // setup the output buffer:
    frag->nsamples = nsamples;
    dst = frag->data;

    start = atempo->position[0] - atempo->size;
    zeros = 0;

    if (frag->position[0] < start) {
        // what we don't have we substitute with zeros:
        zeros = FFMIN(start - frag->position[0], (int64_t)nsamples);
        av_assert0(zeros != nsamples);

        memset(dst, 0, zeros * atempo->stride);
        dst += zeros * atempo->stride;
    }

    if (zeros == nsamples) {
        return 0;
    }

    // get the remaining data from the ring buffer:
    na = (atempo->head < atempo->tail ?
          atempo->tail - atempo->head :
          atempo->ring - atempo->head);

    nb = atempo->head < atempo->tail ? 0 : atempo->tail;

    // sanity check:
    av_assert0(nsamples <= zeros + na + nb);

    a = atempo->buffer + atempo->head * atempo->stride;
    b = atempo->buffer;

    i0 = frag->position[0] + zeros - start;
    i1 = i0 < na ? 0 : i0 - na;

    n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0;
    n1 = nsamples - zeros - n0;

    if (n0) {
        memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride);
        dst += n0 * atempo->stride;
    }

    if (n1) {
        memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride);
    }

    return 0;
}

/**
 * Prepare for loading next audio fragment.
 */
static void yae_advance_to_next_frag(ATempoContext *atempo)
{
    const double fragment_step = atempo->tempo * (double)(atempo->window / 2);

    const AudioFragment *prev;
    AudioFragment       *frag;

    atempo->nfrag++;
    prev = yae_prev_frag(atempo);
    frag = yae_curr_frag(atempo);

    frag->position[0] = prev->position[0] + (int64_t)fragment_step;
    frag->position[1] = prev->position[1] + atempo->window / 2;
    frag->nsamples    = 0;
}
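
// To make the stepping concrete (illustrative numbers, assuming a
// 2048-sample window): at tempo=2.0 the input position advances by a
// full window (2048 samples) per fragment while the output position
// advances by window/2 (1024), yielding half as many output samples;
// at tempo=0.5 the input advances by only 512, stretching the output.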

/**
 * Calculate cross-correlation via rDFT.
 *
 * Multiply two vectors of complex numbers (result of real_to_complex rDFT)
 * and transform back via complex_to_real rDFT.
 */
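/* The loop below computes xa[i] * conj(xb[i]); by the correlation
 * theorem the inverse transform of that product is the circular
 * cross-correlation of the two time-domain fragments, so a single
 * inverse rDFT evaluates every candidate alignment offset at once. */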
static void yae_xcorr_via_rdft(FFTSample *xcorr,
                               RDFTContext *complex_to_real,
                               const FFTComplex *xa,
                               const FFTComplex *xb,
                               const int window)
{
    FFTComplex *xc = (FFTComplex *)xcorr;
    int i;

    // NOTE: first element requires special care -- given Y = rDFT(X),
    // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc
    // stores Re(Y[N/2]) in place of Im(Y[0]).
    xc->re = xa->re * xb->re;
    xc->im = xa->im * xb->im;
    xa++;
    xb++;
    xc++;

    for (i = 1; i < window; i++, xa++, xb++, xc++) {
        xc->re = (xa->re * xb->re + xa->im * xb->im);
        xc->im = (xa->im * xb->re - xa->re * xb->im);
    }

    // apply inverse rDFT:
    av_rdft_calc(complex_to_real, xcorr);
}

/**
 * Calculate alignment offset for given fragment
 * relative to the previous fragment.
 *
 * @return alignment offset of current fragment relative to previous.
 */
static int yae_align(AudioFragment *frag,
                     const AudioFragment *prev,
                     const int window,
                     const int delta_max,
                     const int drift,
                     FFTSample *correlation,
                     RDFTContext *complex_to_real)
{
    int best_offset = -drift;
    FFTSample best_metric = -FLT_MAX;
    FFTSample *xcorr;

    int i0;
    int i1;
    int i;

    yae_xcorr_via_rdft(correlation,
                       complex_to_real,
                       (const FFTComplex *)prev->xdat,
                       (const FFTComplex *)frag->xdat,
                       window);

    // identify search window boundaries:
    i0 = FFMAX(window / 2 - delta_max - drift, 0);
    i0 = FFMIN(i0, window);

    i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16);
    i1 = FFMAX(i1, 0);

    // identify cross-correlation peaks within search window:
    xcorr = correlation + i0;

    for (i = i0; i < i1; i++, xcorr++) {
        FFTSample metric = *xcorr;

        // normalize: weight the raw correlation so that candidate offsets
        // near the edges of the search window are de-emphasized, folding
        // the cumulative drift into the weighting:
        FFTSample drifti = (FFTSample)(drift + i);
        metric *= drifti * (FFTSample)(i - i0) * (FFTSample)(i1 - i);

        if (metric > best_metric) {
            best_metric = metric;
            best_offset = i - window / 2;
        }
    }

    return best_offset;
}

/**
 * Adjust current fragment position for better alignment
 * with previous fragment.
 *
 * @return alignment correction.
 */
static int yae_adjust_position(ATempoContext *atempo)
{
    const AudioFragment *prev = yae_prev_frag(atempo);
    AudioFragment       *frag = yae_curr_frag(atempo);

    const int delta_max = atempo->window / 2;
    const int correction = yae_align(frag,
                                     prev,
                                     atempo->window,
                                     delta_max,
                                     atempo->drift,
                                     atempo->correlation,
                                     atempo->complex_to_real);

    if (correction) {
        // adjust fragment position:
        frag->position[0] -= correction;

        // clear so that the fragment can be reloaded:
        frag->nsamples = 0;

        // update cumulative correction drift counter:
        atempo->drift += correction;
    }

    return correction;
}

/**
 * A helper macro for blending the overlap region of previous
 * and current audio fragment.
 */
#define yae_blend(scalar_type)                                          \
    do {                                                                \
        const scalar_type *aaa = (const scalar_type *)a;                \
        const scalar_type *bbb = (const scalar_type *)b;                \
                                                                        \
        scalar_type *out     = (scalar_type *)dst;                      \
        scalar_type *out_end = (scalar_type *)dst_end;                  \
        int64_t i;                                                      \
                                                                        \
        for (i = 0; i < overlap && out < out_end;                       \
             i++, atempo->position[1]++, wa++, wb++) {                  \
            float w0 = *wa;                                             \
            float w1 = *wb;                                             \
            int j;                                                      \
                                                                        \
            for (j = 0; j < atempo->channels;                           \
                 j++, aaa++, bbb++, out++) {                            \
                float t0 = (float)*aaa;                                 \
                float t1 = (float)*bbb;                                 \
                                                                        \
                *out =                                                  \
                    frag->position[0] + i < 0 ?                         \
                    *aaa :                                              \
                    (scalar_type)(t0 * w0 + t1 * w1);                   \
            }                                                           \
        }                                                               \
        dst = (uint8_t *)out;                                           \
    } while (0)

/**
 * Blend the overlap region of previous and current audio fragment
 * and output the results to the given destination buffer.
 *
 * @return
 *   0 if the overlap region was completely stored in the dst buffer,
 *   AVERROR(EAGAIN) if more destination buffer space is required.
 */
static int yae_overlap_add(ATempoContext *atempo,
                           uint8_t **dst_ref,
                           uint8_t *dst_end)
{
    // shortcuts:
    const AudioFragment *prev = yae_prev_frag(atempo);
    const AudioFragment *frag = yae_curr_frag(atempo);

    const int64_t start_here = FFMAX(atempo->position[1],
                                     frag->position[1]);

    const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples,
                                    frag->position[1] + frag->nsamples);

    const int64_t overlap = stop_here - start_here;

    const int64_t ia = start_here - prev->position[1];
    const int64_t ib = start_here - frag->position[1];

    const float *wa = atempo->hann + ia;
    const float *wb = atempo->hann + ib;

    const uint8_t *a = prev->data + ia * atempo->stride;
    const uint8_t *b = frag->data + ib * atempo->stride;

    uint8_t *dst = *dst_ref;

    av_assert0(start_here <= stop_here &&
               frag->position[1] <= start_here &&
               overlap <= frag->nsamples);

    if (atempo->format == AV_SAMPLE_FMT_U8) {
        yae_blend(uint8_t);
    } else if (atempo->format == AV_SAMPLE_FMT_S16) {
        yae_blend(int16_t);
    } else if (atempo->format == AV_SAMPLE_FMT_S32) {
        yae_blend(int);
    } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
        yae_blend(float);
    } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
        yae_blend(double);
    }

    // pass back the updated destination buffer pointer:
    *dst_ref = dst;

    return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
}

/**
 * Feed as much data to the filter as it is able to consume
 * and receive as much processed data in the destination buffer
 * as it is able to produce or store.
 */
static void
yae_apply(ATempoContext *atempo,
          const uint8_t **src_ref,
          const uint8_t *src_end,
          uint8_t **dst_ref,
          uint8_t *dst_end)
{
    while (1) {
        if (atempo->state == YAE_LOAD_FRAGMENT) {
            // load additional data for the current fragment:
            if (yae_load_frag(atempo, src_ref, src_end) != 0) {
                break;
            }

            // down-mix to mono:
            yae_downmix(atempo, yae_curr_frag(atempo));

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);

            // must load the second fragment before alignment can start:
            if (!atempo->nfrag) {
                yae_advance_to_next_frag(atempo);
                continue;
            }

            atempo->state = YAE_ADJUST_POSITION;
        }

        if (atempo->state == YAE_ADJUST_POSITION) {
            // adjust position for better alignment:
            if (yae_adjust_position(atempo)) {
                // reload the fragment at the corrected position, so that the
                // Hann window blending would not require normalization:
                atempo->state = YAE_RELOAD_FRAGMENT;
            } else {
                atempo->state = YAE_OUTPUT_OVERLAP_ADD;
            }
        }

        if (atempo->state == YAE_RELOAD_FRAGMENT) {
            // load additional data if necessary due to position adjustment:
            if (yae_load_frag(atempo, src_ref, src_end) != 0) {
                break;
            }

            // down-mix to mono:
            yae_downmix(atempo, yae_curr_frag(atempo));

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);

            atempo->state = YAE_OUTPUT_OVERLAP_ADD;
        }

        if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) {
            // overlap-add and output the result:
            if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
                break;
            }

            // advance to the next fragment, repeat:
            yae_advance_to_next_frag(atempo);
            atempo->state = YAE_LOAD_FRAGMENT;
        }
    }
}

/**
 * Flush any buffered data from the filter.
 *
 * @return
 *   0 if all data was completely stored in the dst buffer,
 *   AVERROR(EAGAIN) if more destination buffer space is required.
 */
static int yae_flush(ATempoContext *atempo,
                     uint8_t **dst_ref,
                     uint8_t *dst_end)
{
    AudioFragment *frag = yae_curr_frag(atempo);
    int64_t overlap_end;
    int64_t start_here;
    int64_t stop_here;
    int64_t offset;

    const uint8_t *src;
    uint8_t *dst;

    int src_size;
    int dst_size;
    int nbytes;

    atempo->state = YAE_FLUSH_OUTPUT;

    if (atempo->position[0] == frag->position[0] + frag->nsamples &&
        atempo->position[1] == frag->position[1] + frag->nsamples) {
        // the current fragment is already flushed:
        return 0;
    }

    if (frag->position[0] + frag->nsamples < atempo->position[0]) {
        // finish loading the current (possibly partial) fragment:
        yae_load_frag(atempo, NULL, NULL);

        if (atempo->nfrag) {
            // down-mix to mono:
            yae_downmix(atempo, frag);

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, frag->xdat);

            // align current fragment to previous fragment:
            if (yae_adjust_position(atempo)) {
                // reload the current fragment due to adjusted position:
                yae_load_frag(atempo, NULL, NULL);
            }
        }
    }

    // flush the overlap region:
    overlap_end = frag->position[1] + FFMIN(atempo->window / 2,
                                            frag->nsamples);

    while (atempo->position[1] < overlap_end) {
        if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
            return AVERROR(EAGAIN);
        }
    }

    // flush the remainder of the current fragment:
    start_here = FFMAX(atempo->position[1], overlap_end);
    stop_here  = frag->position[1] + frag->nsamples;
    offset     = start_here - frag->position[1];
    av_assert0(start_here <= stop_here && frag->position[1] <= start_here);

    src = frag->data + offset * atempo->stride;
    dst = (uint8_t *)*dst_ref;

    src_size = (int)(stop_here - start_here) * atempo->stride;
    dst_size = dst_end - dst;
    nbytes = FFMIN(src_size, dst_size);

    memcpy(dst, src, nbytes);
    dst += nbytes;

    atempo->position[1] += (nbytes / atempo->stride);

    // pass back the updated destination buffer pointer:
    *dst_ref = (uint8_t *)dst;

    return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
}

static av_cold int init(AVFilterContext *ctx)
{
    ATempoContext *atempo = ctx->priv;
    atempo->format = AV_SAMPLE_FMT_NONE;
    atempo->state  = YAE_LOAD_FRAGMENT;
    return 0;
}

static av_cold void uninit(AVFilterContext *ctx)
{
    ATempoContext *atempo = ctx->priv;
    yae_release_buffers(atempo);
}

static int query_formats(AVFilterContext *ctx)
{
    AVFilterChannelLayouts *layouts = NULL;
    AVFilterFormats        *formats = NULL;

    // WSOLA necessitates an internal sliding window ring buffer
    // for incoming audio stream.
    //
    // Planar sample formats are too cumbersome to store in a ring buffer,
    // therefore planar sample formats are not supported.
    //
    static const enum AVSampleFormat sample_fmts[] = {
        AV_SAMPLE_FMT_U8,
        AV_SAMPLE_FMT_S16,
        AV_SAMPLE_FMT_S32,
        AV_SAMPLE_FMT_FLT,
        AV_SAMPLE_FMT_DBL,
        AV_SAMPLE_FMT_NONE
    };

    layouts = ff_all_channel_layouts();
    if (!layouts) {
        return AVERROR(ENOMEM);
    }
    ff_set_common_channel_layouts(ctx, layouts);

    formats = ff_make_format_list(sample_fmts);
    if (!formats) {
        return AVERROR(ENOMEM);
    }
    ff_set_common_formats(ctx, formats);

    formats = ff_all_samplerates();
    if (!formats) {
        return AVERROR(ENOMEM);
    }
    ff_set_common_samplerates(ctx, formats);

    return 0;
}

static int config_props(AVFilterLink *inlink)
{
    AVFilterContext  *ctx = inlink->dst;
    ATempoContext *atempo = ctx->priv;

    enum AVSampleFormat format = inlink->format;
    int sample_rate = (int)inlink->sample_rate;
    int channels = av_get_channel_layout_nb_channels(inlink->channel_layout);

    ctx->outputs[0]->flags |= FF_LINK_FLAG_REQUEST_LOOP;

    return yae_reset(atempo, format, sample_rate, channels);
}

static int push_samples(ATempoContext *atempo,
                        AVFilterLink *outlink,
                        int n_out)
{
    int ret;

    atempo->dst_buffer->sample_rate = outlink->sample_rate;
    atempo->dst_buffer->nb_samples  = n_out;

    // adjust the PTS:
    atempo->dst_buffer->pts =
        av_rescale_q(atempo->nsamples_out,
                     (AVRational){ 1, outlink->sample_rate },
                     outlink->time_base);

    ret = ff_filter_frame(outlink, atempo->dst_buffer);
    if (ret < 0)
        return ret;
    atempo->dst_buffer = NULL;
    atempo->dst        = NULL;
    atempo->dst_end    = NULL;

    atempo->nsamples_out += n_out;
    return 0;
}

static int filter_frame(AVFilterLink *inlink, AVFrame *src_buffer)
{
    AVFilterContext  *ctx = inlink->dst;
    ATempoContext *atempo = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];

    int ret = 0;
    int n_in = src_buffer->nb_samples;
    int n_out = (int)(0.5 + ((double)n_in) / atempo->tempo);

    const uint8_t *src = src_buffer->data[0];
    const uint8_t *src_end = src + n_in * atempo->stride;

    while (src < src_end) {
        if (!atempo->dst_buffer) {
            atempo->dst_buffer = ff_get_audio_buffer(outlink, n_out);
            if (!atempo->dst_buffer) {
                ret = AVERROR(ENOMEM);
                goto end;
            }
            av_frame_copy_props(atempo->dst_buffer, src_buffer);

            atempo->dst = atempo->dst_buffer->data[0];
            atempo->dst_end = atempo->dst + n_out * atempo->stride;
        }

        yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end);

        if (atempo->dst == atempo->dst_end) {
            ret = push_samples(atempo, outlink, n_out);
            if (ret < 0)
                goto end;
        }
    }

    atempo->nsamples_in += n_in;
end:
    av_frame_free(&src_buffer);
    return ret;
}

static int request_frame(AVFilterLink *outlink)
{
    AVFilterContext  *ctx = outlink->src;
    ATempoContext *atempo = ctx->priv;
    int ret;

    ret = ff_request_frame(ctx->inputs[0]);

    if (ret == AVERROR_EOF) {
        // flush the filter:
        int n_max = atempo->ring;
        int n_out;
        int err = AVERROR(EAGAIN);

        while (err == AVERROR(EAGAIN)) {
            if (!atempo->dst_buffer) {
                atempo->dst_buffer = ff_get_audio_buffer(outlink, n_max);
                if (!atempo->dst_buffer)
                    return AVERROR(ENOMEM);

                atempo->dst = atempo->dst_buffer->data[0];
                atempo->dst_end = atempo->dst + n_max * atempo->stride;
            }

            err = yae_flush(atempo, &atempo->dst, atempo->dst_end);

            n_out = ((atempo->dst - atempo->dst_buffer->data[0]) /
                     atempo->stride);

            if (n_out) {
                ret = push_samples(atempo, outlink, n_out);
            }
        }

        av_frame_free(&atempo->dst_buffer);
        atempo->dst     = NULL;
        atempo->dst_end = NULL;

        return AVERROR_EOF;
    }

    return ret;
}

static int process_command(AVFilterContext *ctx,
                           const char *cmd,
                           const char *arg,
                           char *res,
                           int res_len,
                           int flags)
{
    return !strcmp(cmd, "tempo") ? yae_set_tempo(ctx, arg) : AVERROR(ENOSYS);
}

static const AVFilterPad atempo_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_AUDIO,
        .filter_frame = filter_frame,
        .config_props = config_props,
    },
    { NULL }
};

static const AVFilterPad atempo_outputs[] = {
    {
        .name          = "default",
        .request_frame = request_frame,
        .type          = AVMEDIA_TYPE_AUDIO,
    },
    { NULL }
};

AVFilter avfilter_af_atempo = {
    .name            = "atempo",
    .description     = NULL_IF_CONFIG_SMALL("Adjust audio tempo."),
    .init            = init,
    .uninit          = uninit,
    .query_formats   = query_formats,
    .process_command = process_command,
    .priv_size       = sizeof(ATempoContext),
    .priv_class      = &atempo_class,
    .inputs          = atempo_inputs,
    .outputs         = atempo_outputs,
};