/*
 * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * tempo scaling audio filter -- an implementation of the WSOLA algorithm
 *
 * Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h
 * from Apprentice Video player by Pavel Koshevoy.
 * https://sourceforge.net/projects/apprenticevideo/
 *
 * An explanation of the SOLA algorithm is available at
 * http://www.surina.net/article/time-and-pitch-scaling.html
 *
 * WSOLA is very similar to SOLA; only one major difference exists between
 * these algorithms.  SOLA shifts audio fragments along the output stream,
 * whereas WSOLA shifts audio fragments along the input stream.
 *
 * The advantage of the WSOLA algorithm is that the overlap region size is
 * always the same, therefore the blending function is constant and
 * can be precomputed.
 */
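/*
 * Example (assuming an ffmpeg build with this filter enabled):
 * speed playback up by 25% without changing the pitch:
 *
 *   ffmpeg -i input.wav -filter:a "atempo=1.25" output.wav
 *
 * Values outside [0.5, 2.0] are rejected by yae_set_tempo() below;
 * larger changes require chaining several atempo instances.
 */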
#include <float.h>
#include "libavcodec/avfft.h"
#include "libavutil/audioconvert.h"
#include "libavutil/avassert.h"
#include "libavutil/avstring.h"
#include "libavutil/eval.h"
#include "libavutil/opt.h"
#include "libavutil/samplefmt.h"
#include "avfilter.h"
#include "audio.h"
#include "internal.h"
/**
 * A fragment of audio waveform
 */
typedef struct {
    // index of the first sample of this fragment in the overall waveform;
    // 0: input sample position
    // 1: output sample position
    int64_t position[2];

    // original packed multi-channel samples:
    uint8_t *data;

    // number of samples in this fragment:
    int nsamples;

    // rDFT transform of the down-mixed mono fragment, used for
    // fast waveform alignment via correlation in frequency domain:
    FFTSample *xdat;
} AudioFragment;
/**
 * Filter state machine states
 */
typedef enum {
    YAE_LOAD_FRAGMENT,
    YAE_ADJUST_POSITION,
    YAE_RELOAD_FRAGMENT,
    YAE_OUTPUT_OVERLAP_ADD,
    YAE_FLUSH_OUTPUT,
} FilterState;
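/*
 * State transitions, as driven by yae_apply() and yae_flush() below:
 *
 *   YAE_LOAD_FRAGMENT      --> YAE_ADJUST_POSITION
 *   YAE_ADJUST_POSITION    --> YAE_RELOAD_FRAGMENT    (position corrected)
 *   YAE_ADJUST_POSITION    --> YAE_OUTPUT_OVERLAP_ADD (already aligned)
 *   YAE_RELOAD_FRAGMENT    --> YAE_OUTPUT_OVERLAP_ADD
 *   YAE_OUTPUT_OVERLAP_ADD --> YAE_LOAD_FRAGMENT      (next fragment)
 *
 * YAE_FLUSH_OUTPUT is entered only at end-of-stream, from yae_flush().
 */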
/**
 * Filter state machine
 */
typedef struct {
    // ring-buffer of input samples, necessary because sometimes the
    // input fragment position may be adjusted backwards:
    uint8_t *buffer;

    // ring-buffer maximum capacity, expressed in samples
    // (i.e. in the 1/sample_rate time base):
    int ring;

    // ring-buffer housekeeping:
    int size;
    int head;
    int tail;

    // 0: input sample position corresponding to the ring buffer tail
    // 1: output sample position
    int64_t position[2];

    // sample format:
    enum AVSampleFormat format;

    // number of channels:
    int channels;

    // row of bytes to skip from one sample to the next, across multiple channels;
    // stride = (number-of-channels * bits-per-sample-per-channel) / 8
    int stride;

    // fragment window size, power-of-two integer:
    int window;

    // Hann window coefficients, for feathering
    // (blending) the overlapping fragment region:
    float *hann;

    // tempo scaling factor:
    double tempo;

    // cumulative alignment drift:
    int drift;

    // current/previous fragment ring-buffer:
    AudioFragment frag[2];

    // current fragment index:
    uint64_t nfrag;

    // current state:
    FilterState state;

    // for fast correlation calculation in frequency domain:
    RDFTContext *real_to_complex;
    RDFTContext *complex_to_real;
    FFTSample *correlation;

    // for managing AVFilterPad.request_frame and AVFilterPad.filter_samples
    int request_fulfilled;
    AVFilterBufferRef *dst_buffer;
    uint8_t *dst;
    uint8_t *dst_end;
    uint64_t nsamples_in;
    uint64_t nsamples_out;
} ATempoContext;
/**
 * Reset filter to initial state, do not deallocate existing local buffers.
 */
static void yae_clear(ATempoContext *atempo)
{
    atempo->size = 0;
    atempo->head = 0;
    atempo->tail = 0;

    atempo->drift = 0;
    atempo->nfrag = 0;
    atempo->state = YAE_LOAD_FRAGMENT;

    atempo->position[0] = 0;
    atempo->position[1] = 0;

    atempo->frag[0].position[0] = 0;
    atempo->frag[0].position[1] = 0;
    atempo->frag[0].nsamples = 0;

    atempo->frag[1].position[0] = 0;
    atempo->frag[1].position[1] = 0;
    atempo->frag[1].nsamples = 0;

    // shift left position of 1st fragment by half a window
    // so that no re-normalization would be required for
    // the left half of the 1st fragment:
    atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2);
    atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2);

    avfilter_unref_bufferp(&atempo->dst_buffer);
    atempo->dst = NULL;
    atempo->dst_end = NULL;

    atempo->request_fulfilled = 0;
    atempo->nsamples_in = 0;
    atempo->nsamples_out = 0;
}
/**
 * Reset filter to initial state and deallocate all buffers.
 */
static void yae_release_buffers(ATempoContext *atempo)
{
    yae_clear(atempo);

    av_freep(&atempo->frag[0].data);
    av_freep(&atempo->frag[1].data);
    av_freep(&atempo->frag[0].xdat);
    av_freep(&atempo->frag[1].xdat);

    av_freep(&atempo->buffer);
    av_freep(&atempo->hann);
    av_freep(&atempo->correlation);

    av_rdft_end(atempo->real_to_complex);
    atempo->real_to_complex = NULL;

    av_rdft_end(atempo->complex_to_real);
    atempo->complex_to_real = NULL;
}
#define REALLOC_OR_FAIL(field, field_size)                  \
    do {                                                    \
        void *new_field = av_realloc(field, (field_size));  \
        if (!new_field) {                                   \
            yae_release_buffers(atempo);                    \
            return AVERROR(ENOMEM);                         \
        }                                                   \
        field = new_field;                                  \
    } while (0)
/**
 * Prepare filter for processing audio data of given format,
 * sample rate and number of channels.
 */
static int yae_reset(ATempoContext *atempo,
                     enum AVSampleFormat format,
                     int sample_rate,
                     int channels)
{
    const int sample_size = av_get_bytes_per_sample(format);
    uint32_t nlevels = 0;
    uint32_t pot;
    int i;

    atempo->format = format;
    atempo->channels = channels;
    atempo->stride = sample_size * channels;

    // pick a segment window size:
    atempo->window = sample_rate / 24;

    // adjust window size to be a power-of-two integer:
    nlevels = av_log2(atempo->window);
    pot = 1 << nlevels;
    av_assert0(pot <= atempo->window);

    if (pot < atempo->window) {
        atempo->window = pot * 2;
        nlevels++;
    }
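    // e.g. at 44100 Hz: window = 44100 / 24 = 1837, av_log2(1837) = 10,
    // and 1 << 10 = 1024 < 1837, so the window is rounded up to 2048
    // samples (about 46 ms) and nlevels becomes 11.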
    // initialize audio fragment buffers:
    REALLOC_OR_FAIL(atempo->frag[0].data, atempo->window * atempo->stride);
    REALLOC_OR_FAIL(atempo->frag[1].data, atempo->window * atempo->stride);
    REALLOC_OR_FAIL(atempo->frag[0].xdat, atempo->window * sizeof(FFTComplex));
    REALLOC_OR_FAIL(atempo->frag[1].xdat, atempo->window * sizeof(FFTComplex));

    // initialize rDFT contexts:
    av_rdft_end(atempo->real_to_complex);
    atempo->real_to_complex = NULL;

    av_rdft_end(atempo->complex_to_real);
    atempo->complex_to_real = NULL;

    atempo->real_to_complex = av_rdft_init(nlevels + 1, DFT_R2C);
    if (!atempo->real_to_complex) {
        yae_release_buffers(atempo);
        return AVERROR(ENOMEM);
    }

    atempo->complex_to_real = av_rdft_init(nlevels + 1, IDFT_C2R);
    if (!atempo->complex_to_real) {
        yae_release_buffers(atempo);
        return AVERROR(ENOMEM);
    }

    REALLOC_OR_FAIL(atempo->correlation, atempo->window * sizeof(FFTComplex));

    atempo->ring = atempo->window * 3;
    REALLOC_OR_FAIL(atempo->buffer, atempo->ring * atempo->stride);

    // initialize the Hann window function:
    REALLOC_OR_FAIL(atempo->hann, atempo->window * sizeof(float));
    for (i = 0; i < atempo->window; i++) {
        double t = (double)i / (double)(atempo->window - 1);
        double h = 0.5 * (1.0 - cos(2.0 * M_PI * t));
        atempo->hann[i] = (float)h;
    }

    yae_clear(atempo);
    return 0;
}
static int yae_set_tempo(AVFilterContext *ctx, const char *arg_tempo)
{
    ATempoContext *atempo = ctx->priv;
    char *tail = NULL;
    double tempo = av_strtod(arg_tempo, &tail);

    if (tail && *tail) {
        av_log(ctx, AV_LOG_ERROR, "Invalid tempo value '%s'\n", arg_tempo);
        return AVERROR(EINVAL);
    }

    if (tempo < 0.5 || tempo > 2.0) {
        av_log(ctx, AV_LOG_ERROR, "Tempo value %f is outside the [0.5, 2.0] range\n",
               tempo);
        return AVERROR(EINVAL);
    }

    atempo->tempo = tempo;
    return 0;
}
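// the filter keeps only two fragments alive at any time; frag[] acts as a
// two-slot ping-pong buffer indexed by the low bit of the fragment counter: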
inline static AudioFragment *yae_curr_frag(ATempoContext *atempo)
{
    return &atempo->frag[atempo->nfrag % 2];
}

inline static AudioFragment *yae_prev_frag(ATempoContext *atempo)
{
    return &atempo->frag[(atempo->nfrag + 1) % 2];
}
/**
 * A helper macro for initializing complex data buffer with scalar data
 * of a given type.
 */
#define yae_init_xdat(scalar_type, scalar_max)                          \
    do {                                                                \
        const uint8_t *src_end = src +                                  \
            frag->nsamples * atempo->channels * sizeof(scalar_type);    \
                                                                        \
        FFTSample *xdat = frag->xdat;                                   \
        scalar_type tmp;                                                \
                                                                        \
        if (atempo->channels == 1) {                                    \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                *xdat = (FFTSample)tmp;                                 \
            }                                                           \
        } else {                                                        \
            FFTSample s, max, ti, si;                                   \
            int i;                                                      \
                                                                        \
            for (; src < src_end; xdat++) {                             \
                tmp = *(const scalar_type *)src;                        \
                src += sizeof(scalar_type);                             \
                                                                        \
                max = (FFTSample)tmp;                                   \
                s = FFMIN((FFTSample)scalar_max,                        \
                          (FFTSample)fabsf(max));                       \
                                                                        \
                for (i = 1; i < atempo->channels; i++) {                \
                    tmp = *(const scalar_type *)src;                    \
                    src += sizeof(scalar_type);                         \
                                                                        \
                    ti = (FFTSample)tmp;                                \
                    si = FFMIN((FFTSample)scalar_max,                   \
                               (FFTSample)fabsf(ti));                   \
                                                                        \
                    if (s < si) {                                       \
                        s = si;                                         \
                        max = ti;                                       \
                    }                                                   \
                }                                                       \
                                                                        \
                *xdat = max;                                            \
            }                                                           \
        }                                                               \
    } while (0)
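/*
 * NOTE: for multi-channel input the "down-mix" above does not average the
 * channels; for each sample position it keeps the sample of the channel
 * with the largest magnitude, which preserves the transient peaks that
 * drive the waveform alignment.
 */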
/**
 * Initialize complex data buffer of a given audio fragment
 * with down-mixed mono data of appropriate scalar type.
 */
static void yae_downmix(ATempoContext *atempo, AudioFragment *frag)
{
    // shortcuts:
    const uint8_t *src = frag->data;

    // init complex data buffer used for FFT and correlation:
    memset(frag->xdat, 0, sizeof(FFTComplex) * atempo->window);

    if (atempo->format == AV_SAMPLE_FMT_U8) {
        yae_init_xdat(uint8_t, 127);
    } else if (atempo->format == AV_SAMPLE_FMT_S16) {
        yae_init_xdat(int16_t, 32767);
    } else if (atempo->format == AV_SAMPLE_FMT_S32) {
        yae_init_xdat(int, 2147483647);
    } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
        yae_init_xdat(float, 1);
    } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
        yae_init_xdat(double, 1);
    }
}
/**
 * Populate the internal data buffer on an as-needed basis.
 *
 * @return
 *   0 if requested data was already available or was successfully loaded,
 *   AVERROR(EAGAIN) if more input data is required.
 */
static int yae_load_data(ATempoContext *atempo,
                         const uint8_t **src_ref,
                         const uint8_t *src_end,
                         int64_t stop_here)
{
    // shortcut:
    const uint8_t *src = *src_ref;
    const int read_size = stop_here - atempo->position[0];

    if (stop_here <= atempo->position[0]) {
        return 0;
    }

    // samples are not expected to be skipped:
    av_assert0(read_size <= atempo->ring);

    while (atempo->position[0] < stop_here && src < src_end) {
        int src_samples = (src_end - src) / atempo->stride;

        // load data piece-wise, in order to avoid complicating the logic:
        int nsamples = FFMIN(read_size, src_samples);
        int na;
        int nb;

        nsamples = FFMIN(nsamples, atempo->ring);
        // na: contiguous chunk up to the end of the ring buffer,
        // nb: the remainder, wrapped around to the start of the ring:
        na = FFMIN(nsamples, atempo->ring - atempo->tail);
        nb = FFMIN(nsamples - na, atempo->ring);

        if (na) {
            uint8_t *a = atempo->buffer + atempo->tail * atempo->stride;
            memcpy(a, src, na * atempo->stride);

            src += na * atempo->stride;
            atempo->position[0] += na;

            atempo->size = FFMIN(atempo->size + na, atempo->ring);
            atempo->tail = (atempo->tail + na) % atempo->ring;
            atempo->head =
                atempo->size < atempo->ring ?
                atempo->tail - atempo->size :
                atempo->tail;
        }

        if (nb) {
            uint8_t *b = atempo->buffer;
            memcpy(b, src, nb * atempo->stride);

            src += nb * atempo->stride;
            atempo->position[0] += nb;

            atempo->size = FFMIN(atempo->size + nb, atempo->ring);
            atempo->tail = (atempo->tail + nb) % atempo->ring;
            atempo->head =
                atempo->size < atempo->ring ?
                atempo->tail - atempo->size :
                atempo->tail;
        }
    }

    // pass back the updated source buffer pointer:
    *src_ref = src;

    // sanity check:
    av_assert0(atempo->position[0] <= stop_here);

    return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN);
}
/**
 * Populate current audio fragment data buffer.
 *
 * @return
 *   0 when the fragment is ready,
 *   AVERROR(EAGAIN) if more input data is required.
 */
static int yae_load_frag(ATempoContext *atempo,
                         const uint8_t **src_ref,
                         const uint8_t *src_end)
{
    // shortcuts:
    AudioFragment *frag = yae_curr_frag(atempo);
    uint8_t *dst;
    int64_t missing, start, zeros;
    uint32_t nsamples;
    const uint8_t *a, *b;
    int i0, i1, n0, n1, na, nb;

    int64_t stop_here = frag->position[0] + atempo->window;
    if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) {
        return AVERROR(EAGAIN);
    }

    // calculate the number of samples we don't have:
    missing =
        stop_here > atempo->position[0] ?
        stop_here - atempo->position[0] : 0;

    nsamples =
        missing < (int64_t)atempo->window ?
        (uint32_t)(atempo->window - missing) : 0;

    // setup the output buffer:
    frag->nsamples = nsamples;
    dst = frag->data;

    start = atempo->position[0] - atempo->size;
    zeros = 0;

    if (frag->position[0] < start) {
        // what we don't have we substitute with zeros:
        zeros = FFMIN(start - frag->position[0], (int64_t)nsamples);
        av_assert0(zeros != nsamples);

        memset(dst, 0, zeros * atempo->stride);
        dst += zeros * atempo->stride;
    }

    if (zeros == nsamples) {
        return 0;
    }

    // get the remaining data from the ring buffer:
    na = (atempo->head < atempo->tail ?
          atempo->tail - atempo->head :
          atempo->ring - atempo->head);

    nb = atempo->head < atempo->tail ? 0 : atempo->tail;

    // sanity check:
    av_assert0(nsamples <= zeros + na + nb);

    a = atempo->buffer + atempo->head * atempo->stride;
    b = atempo->buffer;

    i0 = frag->position[0] + zeros - start;
    i1 = i0 < na ? 0 : i0 - na;

    n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0;
    n1 = nsamples - zeros - n0;

    if (n0) {
        memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride);
        dst += n0 * atempo->stride;
    }

    if (n1) {
        memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride);
        dst += n1 * atempo->stride;
    }

    return 0;
}
/**
 * Prepare for loading next audio fragment.
 */
static void yae_advance_to_next_frag(ATempoContext *atempo)
{
    const double fragment_step = atempo->tempo * (double)(atempo->window / 2);

    const AudioFragment *prev;
    AudioFragment *frag;

    atempo->nfrag++;
    prev = yae_prev_frag(atempo);
    frag = yae_curr_frag(atempo);

    frag->position[0] = prev->position[0] + (int64_t)fragment_step;
    frag->position[1] = prev->position[1] + atempo->window / 2;
    frag->nsamples = 0;
}
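/*
 * This is where the tempo scaling actually happens: the input position
 * advances by tempo * window/2 samples per fragment, while the output
 * position always advances by window/2.  E.g. with tempo = 2.0 and a
 * 2048-sample window, each new fragment consumes 2048 input samples but
 * contributes only 1024 output samples, halving the duration.
 */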
/**
 * Calculate cross-correlation via rDFT.
 *
 * Multiply two vectors of complex numbers (result of real_to_complex rDFT)
 * and transform back via complex_to_real rDFT.
 */
static void yae_xcorr_via_rdft(FFTSample *xcorr,
                               RDFTContext *complex_to_real,
                               const FFTComplex *xa,
                               const FFTComplex *xb,
                               const int window)
{
    FFTComplex *xc = (FFTComplex *)xcorr;
    int i;

    // NOTE: first element requires special care -- given Y = rDFT(X),
    // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc
    // stores Re(Y[N/2]) in place of Im(Y[0]).
    xc->re = xa->re * xb->re;
    xc->im = xa->im * xb->im;
    xa++;
    xb++;
    xc++;

    for (i = 1; i < window; i++, xa++, xb++, xc++) {
        xc->re = (xa->re * xb->re + xa->im * xb->im);
        xc->im = (xa->im * xb->re - xa->re * xb->im);
    }

    // apply inverse rDFT:
    av_rdft_calc(complex_to_real, xcorr);
}
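/*
 * The loop above computes xa[i] * conj(xb[i]); by the correlation theorem,
 * the inverse transform of that product is the circular cross-correlation
 * of the two fragments, so a single rDFT round-trip replaces an O(N^2)
 * time-domain correlation with an O(N log N) one.
 */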
/**
 * Calculate alignment offset for given fragment
 * relative to the previous fragment.
 *
 * @return alignment offset of current fragment relative to previous.
 */
static int yae_align(AudioFragment *frag,
                     const AudioFragment *prev,
                     const int window,
                     const int delta_max,
                     const int drift,
                     FFTSample *correlation,
                     RDFTContext *complex_to_real)
{
    int best_offset = -drift;
    FFTSample best_metric = -FLT_MAX;
    FFTSample *xcorr;

    int i0;
    int i1;
    int i;

    yae_xcorr_via_rdft(correlation,
                       complex_to_real,
                       (const FFTComplex *)prev->xdat,
                       (const FFTComplex *)frag->xdat,
                       window);

    // identify search window boundaries:
    i0 = FFMAX(window / 2 - delta_max - drift, 0);
    i0 = FFMIN(i0, window);

    i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16);
    i1 = FFMAX(i1, 0);

    // identify cross-correlation peaks within search window:
    xcorr = correlation + i0;

    for (i = i0; i < i1; i++, xcorr++) {
        FFTSample metric = *xcorr;

        // normalize: the (i - i0) * (i1 - i) weighting tapers the metric
        // to zero at the search window boundaries, favoring peaks away
        // from the edges:
        FFTSample drifti = (FFTSample)(drift + i);
        metric *= drifti * (FFTSample)(i - i0) * (FFTSample)(i1 - i);

        if (metric > best_metric) {
            best_metric = metric;
            best_offset = i - window / 2;
        }
    }

    return best_offset;
}
/**
 * Adjust current fragment position for better alignment
 * with previous fragment.
 *
 * @return alignment correction.
 */
static int yae_adjust_position(ATempoContext *atempo)
{
    const AudioFragment *prev = yae_prev_frag(atempo);
    AudioFragment *frag = yae_curr_frag(atempo);

    const int delta_max = atempo->window / 2;
    const int correction = yae_align(frag,
                                     prev,
                                     atempo->window,
                                     delta_max,
                                     atempo->drift,
                                     atempo->correlation,
                                     atempo->complex_to_real);

    if (correction) {
        // adjust fragment position:
        frag->position[0] -= correction;

        // clear so that the fragment can be reloaded:
        frag->nsamples = 0;

        // update cumulative correction drift counter:
        atempo->drift += correction;
    }

    return correction;
}
/**
 * A helper macro for blending the overlap region of previous
 * and current audio fragment.
 */
#define yae_blend(scalar_type)                                          \
    do {                                                                \
        const scalar_type *aaa = (const scalar_type *)a;                \
        const scalar_type *bbb = (const scalar_type *)b;                \
                                                                        \
        scalar_type *out     = (scalar_type *)dst;                      \
        scalar_type *out_end = (scalar_type *)dst_end;                  \
        int64_t i;                                                      \
                                                                        \
        for (i = 0; i < overlap && out < out_end;                       \
             i++, atempo->position[1]++, wa++, wb++) {                  \
            float w0 = *wa;                                             \
            float w1 = *wb;                                             \
            int j;                                                      \
                                                                        \
            for (j = 0; j < atempo->channels;                           \
                 j++, aaa++, bbb++, out++) {                            \
                float t0 = (float)*aaa;                                 \
                float t1 = (float)*bbb;                                 \
                                                                        \
                *out =                                                  \
                    frag->position[0] + i < 0 ?                         \
                    *aaa :                                              \
                    (scalar_type)(t0 * w0 + t1 * w1);                   \
            }                                                           \
        }                                                               \
        dst = (uint8_t *)out;                                           \
    } while (0)
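/*
 * The blend works without re-normalization because wa indexes into the
 * falling half of the Hann window and wb into the rising half, half a
 * window apart; for any overlap position the two weights sum to
 * (approximately) one, so the crossfade preserves the signal level.
 */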
/**
 * Blend the overlap region of previous and current audio fragment
 * and output the result to the given destination buffer.
 *
 * @return
 *   0 if the overlap region was completely stored in the dst buffer,
 *   AVERROR(EAGAIN) if more destination buffer space is required.
 */
static int yae_overlap_add(ATempoContext *atempo,
                           uint8_t **dst_ref,
                           uint8_t *dst_end)
{
    // shortcuts:
    const AudioFragment *prev = yae_prev_frag(atempo);
    const AudioFragment *frag = yae_curr_frag(atempo);

    const int64_t start_here = FFMAX(atempo->position[1],
                                     frag->position[1]);

    const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples,
                                    frag->position[1] + frag->nsamples);

    const int64_t overlap = stop_here - start_here;

    const int64_t ia = start_here - prev->position[1];
    const int64_t ib = start_here - frag->position[1];

    const float *wa = atempo->hann + ia;
    const float *wb = atempo->hann + ib;

    const uint8_t *a = prev->data + ia * atempo->stride;
    const uint8_t *b = frag->data + ib * atempo->stride;

    uint8_t *dst = *dst_ref;

    av_assert0(start_here <= stop_here &&
               frag->position[1] <= start_here &&
               overlap <= frag->nsamples);

    if (atempo->format == AV_SAMPLE_FMT_U8) {
        yae_blend(uint8_t);
    } else if (atempo->format == AV_SAMPLE_FMT_S16) {
        yae_blend(int16_t);
    } else if (atempo->format == AV_SAMPLE_FMT_S32) {
        yae_blend(int);
    } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
        yae_blend(float);
    } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
        yae_blend(double);
    }

    // pass back the updated destination buffer pointer:
    *dst_ref = dst;

    return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
}
/**
 * Feed as much data to the filter as it is able to consume
 * and receive as much processed data in the destination buffer
 * as it is able to produce or store.
 */
static void
yae_apply(ATempoContext *atempo,
          const uint8_t **src_ref,
          const uint8_t *src_end,
          uint8_t **dst_ref,
          uint8_t *dst_end)
{
    while (1) {
        if (atempo->state == YAE_LOAD_FRAGMENT) {
            // load additional data for the current fragment:
            if (yae_load_frag(atempo, src_ref, src_end) != 0) {
                break;
            }

            // down-mix to mono:
            yae_downmix(atempo, yae_curr_frag(atempo));

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);

            // must load the second fragment before alignment can start:
            if (!atempo->nfrag) {
                yae_advance_to_next_frag(atempo);
                continue;
            }

            atempo->state = YAE_ADJUST_POSITION;
        }

        if (atempo->state == YAE_ADJUST_POSITION) {
            // adjust position for better alignment:
            if (yae_adjust_position(atempo)) {
                // reload the fragment at the corrected position, so that the
                // Hann window blending would not require normalization:
                atempo->state = YAE_RELOAD_FRAGMENT;
            } else {
                atempo->state = YAE_OUTPUT_OVERLAP_ADD;
            }
        }

        if (atempo->state == YAE_RELOAD_FRAGMENT) {
            // load additional data if necessary due to position adjustment:
            if (yae_load_frag(atempo, src_ref, src_end) != 0) {
                break;
            }

            // down-mix to mono:
            yae_downmix(atempo, yae_curr_frag(atempo));

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);

            atempo->state = YAE_OUTPUT_OVERLAP_ADD;
        }

        if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) {
            // overlap-add and output the result:
            if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
                break;
            }

            // advance to the next fragment, repeat:
            yae_advance_to_next_frag(atempo);
            atempo->state = YAE_LOAD_FRAGMENT;
        }
    }
}
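/*
 * NOTE: the loop above exits only via the break statements, i.e. when
 * yae_load_frag() runs out of input or yae_overlap_add() runs out of
 * destination buffer space; the caller simply retries once more data
 * or more space becomes available.
 */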
/**
 * Flush any buffered data from the filter.
 *
 * @return
 *   0 if all data was completely stored in the dst buffer,
 *   AVERROR(EAGAIN) if more destination buffer space is required.
 */
static int yae_flush(ATempoContext *atempo,
                     uint8_t **dst_ref,
                     uint8_t *dst_end)
{
    AudioFragment *frag = yae_curr_frag(atempo);
    int64_t overlap_end;
    int64_t start_here;
    int64_t stop_here;
    int64_t offset;

    const uint8_t *src;
    uint8_t *dst;

    int src_size;
    int dst_size;
    int nbytes;

    atempo->state = YAE_FLUSH_OUTPUT;

    if (atempo->position[0] == frag->position[0] + frag->nsamples &&
        atempo->position[1] == frag->position[1] + frag->nsamples) {
        // the current fragment is already flushed:
        return 0;
    }

    if (frag->position[0] + frag->nsamples < atempo->position[0]) {
        // finish loading the current (possibly partial) fragment:
        yae_load_frag(atempo, NULL, NULL);

        if (atempo->nfrag) {
            // down-mix to mono:
            yae_downmix(atempo, frag);

            // apply rDFT:
            av_rdft_calc(atempo->real_to_complex, frag->xdat);

            // align current fragment to previous fragment:
            if (yae_adjust_position(atempo)) {
                // reload the current fragment due to adjusted position:
                yae_load_frag(atempo, NULL, NULL);
            }
        }
    }

    // flush the overlap region:
    overlap_end = frag->position[1] + FFMIN(atempo->window / 2,
                                            frag->nsamples);

    while (atempo->position[1] < overlap_end) {
        if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
            return AVERROR(EAGAIN);
        }
    }

    // flush the remainder of the current fragment:
    start_here = FFMAX(atempo->position[1], overlap_end);
    stop_here = frag->position[1] + frag->nsamples;
    offset = start_here - frag->position[1];
    av_assert0(start_here <= stop_here && frag->position[1] <= start_here);

    src = frag->data + offset * atempo->stride;
    dst = (uint8_t *)*dst_ref;

    src_size = (int)(stop_here - start_here) * atempo->stride;
    dst_size = dst_end - dst;
    nbytes = FFMIN(src_size, dst_size);

    memcpy(dst, src, nbytes);
    dst += nbytes;

    atempo->position[1] += (nbytes / atempo->stride);

    // pass back the updated destination buffer pointer:
    *dst_ref = (uint8_t *)dst;

    return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
}
static av_cold int init(AVFilterContext *ctx, const char *args)
{
    ATempoContext *atempo = ctx->priv;

    // NOTE: this assumes that the caller has memset ctx->priv to 0:
    atempo->format = AV_SAMPLE_FMT_NONE;
    atempo->tempo = 1.0;
    atempo->state = YAE_LOAD_FRAGMENT;

    return args ? yae_set_tempo(ctx, args) : 0;
}
static av_cold void uninit(AVFilterContext *ctx)
{
    ATempoContext *atempo = ctx->priv;
    yae_release_buffers(atempo);
}
static int query_formats(AVFilterContext *ctx)
{
    AVFilterChannelLayouts *layouts = NULL;
    AVFilterFormats *formats = NULL;

    // WSOLA necessitates an internal sliding-window ring buffer
    // for the incoming audio stream.
    //
    // Planar sample formats are too cumbersome to store in a ring buffer,
    // therefore planar sample formats are not supported.
    //
    enum AVSampleFormat sample_fmts[] = {
        AV_SAMPLE_FMT_U8,
        AV_SAMPLE_FMT_S16,
        AV_SAMPLE_FMT_S32,
        AV_SAMPLE_FMT_FLT,
        AV_SAMPLE_FMT_DBL,
        AV_SAMPLE_FMT_NONE
    };

    layouts = ff_all_channel_layouts();
    if (!layouts) {
        return AVERROR(ENOMEM);
    }
    ff_set_common_channel_layouts(ctx, layouts);

    formats = ff_make_format_list(sample_fmts);
    if (!formats) {
        return AVERROR(ENOMEM);
    }
    ff_set_common_formats(ctx, formats);

    formats = ff_all_samplerates();
    if (!formats) {
        return AVERROR(ENOMEM);
    }
    ff_set_common_samplerates(ctx, formats);

    return 0;
}
static int config_props(AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    ATempoContext *atempo = ctx->priv;

    enum AVSampleFormat format = inlink->format;
    int sample_rate = (int)inlink->sample_rate;
    int channels = av_get_channel_layout_nb_channels(inlink->channel_layout);

    return yae_reset(atempo, format, sample_rate, channels);
}
static void push_samples(ATempoContext *atempo,
                         AVFilterLink *outlink,
                         int n_out)
{
    atempo->dst_buffer->audio->sample_rate = outlink->sample_rate;
    atempo->dst_buffer->audio->nb_samples = n_out;

    // adjust the PTS:
    atempo->dst_buffer->pts =
        av_rescale_q(atempo->nsamples_out,
                     (AVRational){ 1, outlink->sample_rate },
                     outlink->time_base);

    ff_filter_samples(outlink, atempo->dst_buffer);
    atempo->dst_buffer = NULL;
    atempo->dst = NULL;
    atempo->dst_end = NULL;

    atempo->nsamples_out += n_out;
}
static int filter_samples(AVFilterLink *inlink,
                          AVFilterBufferRef *src_buffer)
{
    AVFilterContext *ctx = inlink->dst;
    ATempoContext *atempo = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];

    int n_in = src_buffer->audio->nb_samples;
    int n_out = (int)(0.5 + ((double)n_in) / atempo->tempo);

    const uint8_t *src = src_buffer->data[0];
    const uint8_t *src_end = src + n_in * atempo->stride;

    while (src < src_end) {
        if (!atempo->dst_buffer) {
            atempo->dst_buffer = ff_get_audio_buffer(outlink,
                                                     AV_PERM_WRITE,
                                                     n_out);
            avfilter_copy_buffer_ref_props(atempo->dst_buffer, src_buffer);

            atempo->dst = atempo->dst_buffer->data[0];
            atempo->dst_end = atempo->dst + n_out * atempo->stride;
        }

        yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end);

        if (atempo->dst == atempo->dst_end) {
            push_samples(atempo, outlink, n_out);
            atempo->request_fulfilled = 1;
        }
    }

    atempo->nsamples_in += n_in;
    avfilter_unref_bufferp(&src_buffer);

    return 0;
}
static int request_frame(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    ATempoContext *atempo = ctx->priv;
    int ret;

    atempo->request_fulfilled = 0;
    do {
        ret = ff_request_frame(ctx->inputs[0]);
    } while (!atempo->request_fulfilled && ret >= 0);

    if (ret == AVERROR_EOF) {
        // flush the filter:
        int n_max = atempo->ring;
        int n_out;
        int err = AVERROR(EAGAIN);

        while (err == AVERROR(EAGAIN)) {
            if (!atempo->dst_buffer) {
                atempo->dst_buffer = ff_get_audio_buffer(outlink,
                                                         AV_PERM_WRITE,
                                                         n_max);

                atempo->dst = atempo->dst_buffer->data[0];
                atempo->dst_end = atempo->dst + n_max * atempo->stride;
            }

            err = yae_flush(atempo, &atempo->dst, atempo->dst_end);

            n_out = ((atempo->dst - atempo->dst_buffer->data[0]) /
                     atempo->stride);

            if (n_out) {
                push_samples(atempo, outlink, n_out);
            }
        }

        avfilter_unref_bufferp(&atempo->dst_buffer);
        atempo->dst = NULL;
        atempo->dst_end = NULL;

        return AVERROR_EOF;
    }

    return ret;
}
static int process_command(AVFilterContext *ctx,
                           const char *cmd,
                           const char *arg,
                           char *res,
                           int res_len,
                           int flags)
{
    return !strcmp(cmd, "tempo") ? yae_set_tempo(ctx, arg) : AVERROR(ENOSYS);
}
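/*
 * process_command makes the tempo adjustable while the filter is running;
 * the "tempo" command can be delivered, for example, through the generic
 * sendcmd/asendcmd filters or any other mechanism that invokes
 * avfilter_process_command() on this filter instance.
 */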
AVFilter avfilter_af_atempo = {
    .name            = "atempo",
    .description     = NULL_IF_CONFIG_SMALL("Adjust audio tempo."),
    .init            = init,
    .uninit          = uninit,
    .query_formats   = query_formats,
    .process_command = process_command,
    .priv_size       = sizeof(ATempoContext),

    .inputs = (const AVFilterPad[]) {
        { .name           = "default",
          .type           = AVMEDIA_TYPE_AUDIO,
          .filter_samples = filter_samples,
          .config_props   = config_props,
          .min_perms      = AV_PERM_READ, },
        { .name = NULL }
    },

    .outputs = (const AVFilterPad[]) {
        { .name          = "default",
          .request_frame = request_frame,
          .type          = AVMEDIA_TYPE_AUDIO, },
        { .name = NULL }
    },
};