You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1637 lines
47KB

  1. /*
  2. * Copyright (c) 2018 Gregor Richards
  3. * Copyright (c) 2017 Mozilla
  4. * Copyright (c) 2005-2009 Xiph.Org Foundation
  5. * Copyright (c) 2007-2008 CSIRO
  6. * Copyright (c) 2008-2011 Octasic Inc.
  7. * Copyright (c) Jean-Marc Valin
  8. * Copyright (c) 2019 Paul B Mahol
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted provided that the following conditions
  12. * are met:
  13. *
  14. * - Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. *
  17. * - Redistributions in binary form must reproduce the above copyright
  18. * notice, this list of conditions and the following disclaimer in the
  19. * documentation and/or other materials provided with the distribution.
  20. *
  21. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22. * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  24. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
  25. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  26. * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  27. * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  28. * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  29. * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  30. * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  31. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  32. */
  33. #include <float.h>
  34. #include "libavutil/avassert.h"
  35. #include "libavutil/avstring.h"
  36. #include "libavutil/float_dsp.h"
  37. #include "libavutil/mem_internal.h"
  38. #include "libavutil/opt.h"
  39. #include "libavutil/tx.h"
  40. #include "avfilter.h"
  41. #include "audio.h"
  42. #include "filters.h"
  43. #include "formats.h"
#define FRAME_SIZE_SHIFT 2
#define FRAME_SIZE (120<<FRAME_SIZE_SHIFT)  /* 480 samples = 10 ms at 48 kHz (the only rate accepted) */
#define WINDOW_SIZE (2*FRAME_SIZE)          /* 50%-overlapped analysis/synthesis window */
#define FREQ_SIZE (FRAME_SIZE + 1)          /* number of non-redundant FFT bins kept */
#define PITCH_MIN_PERIOD 60                 /* pitch search range, in samples */
#define PITCH_MAX_PERIOD 768
#define PITCH_FRAME_SIZE 960
#define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE)  /* input history kept for pitch search */
#define SQUARE(x) ((x)*(x))
#define NB_BANDS 22                         /* frequency bands, edges in eband5ms[] */
#define CEPS_MEM 8                          /* cepstral history length, in frames */
#define NB_DELTA_CEPS 6                     /* cepstral coefficients used for delta features */
#define NB_FEATURES (NB_BANDS+3*NB_DELTA_CEPS+2)  /* size of the RNN input feature vector */
#define WEIGHTS_SCALE (1.f/256)             /* model files store weights as integers scaled by 256 */
#define MAX_NEURONS 128                     /* upper bound on any layer size (enforced when parsing) */
#define ACTIVATION_TANH 0
#define ACTIVATION_SIGMOID 1
#define ACTIVATION_RELU 2
#define Q15ONE 1.0f                         /* unity gain (Q15 legacy name from the fixed-point origin) */
/* Weights of one fully connected layer. Sizes follow the parser in
 * rnnoise_model_from_file(): bias has nb_neurons entries, input_weights
 * has nb_inputs * nb_neurons entries. */
typedef struct DenseLayer {
    const float *bias;
    const float *input_weights;
    int nb_inputs;
    int nb_neurons;
    int activation;   /* one of ACTIVATION_* */
} DenseLayer;
/* Weights of one GRU layer. As parsed by INPUT_GRU(): bias holds
 * nb_neurons * 3 entries (update/reset/candidate gates); input_weights and
 * recurrent_weights are 3-gate matrices stored transposed and padded to a
 * multiple of 4 rows (see INPUT_ARRAY3). */
typedef struct GRULayer {
    const float *bias;
    const float *input_weights;
    const float *recurrent_weights;
    int nb_inputs;
    int nb_neurons;
    int activation;   /* one of ACTIVATION_* */
} GRULayer;
/* A complete parsed model: dense input layer, three GRUs (VAD, noise,
 * denoise) and two dense output layers. Each *_size field mirrors the
 * corresponding layer's nb_neurons (set while parsing). */
typedef struct RNNModel {
    int input_dense_size;
    const DenseLayer *input_dense;

    int vad_gru_size;
    const GRULayer *vad_gru;

    int noise_gru_size;
    const GRULayer *noise_gru;

    int denoise_gru_size;
    const GRULayer *denoise_gru;

    int denoise_output_size;
    const DenseLayer *denoise_output;

    int vad_output_size;   /* must be 1 (checked by the parser) */
    const DenseLayer *vad_output;
} RNNModel;
/* Recurrent (hidden) state vectors for one model instance. Allocated in
 * config_input(), each padded to FFALIGN(size, 16) floats. */
typedef struct RNNState {
    float *vad_gru_state;
    float *noise_gru_state;
    float *denoise_gru_state;
    RNNModel *model;
} RNNState;
/* Per-channel denoiser state. */
typedef struct DenoiseState {
    float analysis_mem[FRAME_SIZE];           /* previous input frame, left half of the analysis window */
    float cepstral_mem[CEPS_MEM][NB_BANDS];   /* ring buffer of recent cepstral feature frames */
    int memid;                                /* write index into cepstral_mem */
    DECLARE_ALIGNED(32, float, synthesis_mem)[FRAME_SIZE];  /* overlap-add tail from the previous frame */
    float pitch_buf[PITCH_BUF_SIZE];          /* sliding input history for pitch analysis */
    float pitch_enh_buf[PITCH_BUF_SIZE];      /* NOTE(review): not used in this chunk — presumably pitch enhancement */
    float last_gain;                          /* previous pitch gain, continuity hint for remove_doubling() */
    int last_period;                          /* previous pitch period, ditto */
    float mem_hp_x[2];                        /* biquad state (presumably the input high-pass — confirm at call site) */
    float lastg[NB_BANDS];                    /* previous per-band gains (used by code outside this chunk) */
    float history[FRAME_SIZE];                /* unprocessed input frame; dry signal for the mix in frame_synthesis() */
    RNNState rnn[2];
    AVTXContext *tx, *txi;                    /* forward / inverse FFT contexts */
    av_tx_fn tx_fn, txi_fn;
} DenoiseState;
/* Filter private context. */
typedef struct AudioRNNContext {
    const AVClass *class;

    char *model_name;   /* model file name (filter option; loading code is outside this chunk) */
    float mix;          /* dry/wet output mix factor, applied in frame_synthesis() */

    int channels;
    DenoiseState *st;   /* one state per channel */

    DECLARE_ALIGNED(32, float, window)[WINDOW_SIZE];  /* analysis/synthesis window (filled outside this chunk) */
    DECLARE_ALIGNED(32, float, dct_table)[FFALIGN(NB_BANDS, 4)][FFALIGN(NB_BANDS, 4)];  /* DCT basis used by dct() */

    RNNModel *model[2]; /* loaded models; only slot 0 is referenced in this chunk */

    AVFloatDSPContext *fdsp;
} AudioRNNContext;
/* Activation ids as stored in the model file; mapped onto the internal
 * ACTIVATION_* values by INPUT_ACTIVATION() while parsing. */
#define F_ACTIVATION_TANH 0
#define F_ACTIVATION_SIGMOID 1
#define F_ACTIVATION_RELU 2
  128. static void rnnoise_model_free(RNNModel *model)
  129. {
  130. #define FREE_MAYBE(ptr) do { if (ptr) free(ptr); } while (0)
  131. #define FREE_DENSE(name) do { \
  132. if (model->name) { \
  133. av_free((void *) model->name->input_weights); \
  134. av_free((void *) model->name->bias); \
  135. av_free((void *) model->name); \
  136. } \
  137. } while (0)
  138. #define FREE_GRU(name) do { \
  139. if (model->name) { \
  140. av_free((void *) model->name->input_weights); \
  141. av_free((void *) model->name->recurrent_weights); \
  142. av_free((void *) model->name->bias); \
  143. av_free((void *) model->name); \
  144. } \
  145. } while (0)
  146. if (!model)
  147. return;
  148. FREE_DENSE(input_dense);
  149. FREE_GRU(vad_gru);
  150. FREE_GRU(noise_gru);
  151. FREE_GRU(denoise_gru);
  152. FREE_DENSE(denoise_output);
  153. FREE_DENSE(vad_output);
  154. av_free(model);
  155. }
/* Parses a model from a text dump in "rnnoise-nu" format, version 1.
 * Weights are stored as small integers (scaled by WEIGHTS_SCALE when the
 * model is evaluated elsewhere in this file). On any parse or allocation
 * failure, all partially built state is released with rnnoise_model_free()
 * and a negative AVERROR is returned; on success *rnn receives the model
 * and 0 is returned. */
static int rnnoise_model_from_file(FILE *f, RNNModel **rnn)
{
    RNNModel *ret = NULL;
    DenseLayer *input_dense;
    GRULayer *vad_gru;
    GRULayer *noise_gru;
    GRULayer *denoise_gru;
    DenseLayer *denoise_output;
    DenseLayer *vad_output;
    int in;

    if (fscanf(f, "rnnoise-nu model file version %d\n", &in) != 1 || in != 1)
        return AVERROR_INVALIDDATA;

    ret = av_calloc(1, sizeof(RNNModel));
    if (!ret)
        return AVERROR(ENOMEM);

/* Allocates one zeroed layer struct and records it in ret so that
 * rnnoise_model_free(ret) can clean up on later failure. */
#define ALLOC_LAYER(type, name) \
    name = av_calloc(1, sizeof(type)); \
    if (!name) { \
        rnnoise_model_free(ret); \
        return AVERROR(ENOMEM); \
    } \
    ret->name = name

    ALLOC_LAYER(DenseLayer, input_dense);
    ALLOC_LAYER(GRULayer, vad_gru);
    ALLOC_LAYER(GRULayer, noise_gru);
    ALLOC_LAYER(GRULayer, denoise_gru);
    ALLOC_LAYER(DenseLayer, denoise_output);
    ALLOC_LAYER(DenseLayer, vad_output);

/* Reads one integer, rejecting anything outside [0, 128] (MAX_NEURONS). */
#define INPUT_VAL(name) do { \
    if (fscanf(f, "%d", &in) != 1 || in < 0 || in > 128) { \
        rnnoise_model_free(ret); \
        return AVERROR(EINVAL); \
    } \
    name = in; \
    } while (0)

/* Reads a file-format activation id and maps it to ACTIVATION_*;
 * unknown values fall back to tanh. */
#define INPUT_ACTIVATION(name) do { \
    int activation; \
    INPUT_VAL(activation); \
    switch (activation) { \
    case F_ACTIVATION_SIGMOID: \
        name = ACTIVATION_SIGMOID; \
        break; \
    case F_ACTIVATION_RELU: \
        name = ACTIVATION_RELU; \
        break; \
    default: \
        name = ACTIVATION_TANH; \
    } \
    } while (0)

/* Reads len integers into a freshly allocated float array. */
#define INPUT_ARRAY(name, len) do { \
    float *values = av_calloc((len), sizeof(float)); \
    if (!values) { \
        rnnoise_model_free(ret); \
        return AVERROR(ENOMEM); \
    } \
    name = values; \
    for (int i = 0; i < (len); i++) { \
        if (fscanf(f, "%d", &in) != 1) { \
            rnnoise_model_free(ret); \
            return AVERROR(EINVAL); \
        } \
        values[i] = in; \
    } \
    } while (0)

/* Reads a len0 x len1 x len2 weight tensor, storing it transposed with
 * len0 padded up to a multiple of 4 (layout expected by the SIMD-friendly
 * GRU evaluation elsewhere in this file). */
#define INPUT_ARRAY3(name, len0, len1, len2) do { \
    float *values = av_calloc(FFALIGN((len0), 4) * FFALIGN((len1), 4) * (len2), sizeof(float)); \
    if (!values) { \
        rnnoise_model_free(ret); \
        return AVERROR(ENOMEM); \
    } \
    name = values; \
    for (int k = 0; k < (len0); k++) { \
        for (int i = 0; i < (len2); i++) { \
            for (int j = 0; j < (len1); j++) { \
                if (fscanf(f, "%d", &in) != 1) { \
                    rnnoise_model_free(ret); \
                    return AVERROR(EINVAL); \
                } \
                values[j * (len2) * FFALIGN((len0), 4) + i * FFALIGN((len0), 4) + k] = in; \
            } \
        } \
    } \
    } while (0)

/* Skips the remainder of the current input line. */
#define NEW_LINE() do { \
    int c; \
    while ((c = fgetc(f)) != EOF) { \
        if (c == '\n') \
            break; \
    } \
    } while (0)

/* Reads a full dense layer: sizes, activation, weights, bias. */
#define INPUT_DENSE(name) do { \
    INPUT_VAL(name->nb_inputs); \
    INPUT_VAL(name->nb_neurons); \
    ret->name ## _size = name->nb_neurons; \
    INPUT_ACTIVATION(name->activation); \
    NEW_LINE(); \
    INPUT_ARRAY(name->input_weights, name->nb_inputs * name->nb_neurons); \
    NEW_LINE(); \
    INPUT_ARRAY(name->bias, name->nb_neurons); \
    NEW_LINE(); \
    } while (0)

/* Reads a full GRU layer: sizes, activation, 3-gate input and recurrent
 * weight tensors, 3-gate bias. */
#define INPUT_GRU(name) do { \
    INPUT_VAL(name->nb_inputs); \
    INPUT_VAL(name->nb_neurons); \
    ret->name ## _size = name->nb_neurons; \
    INPUT_ACTIVATION(name->activation); \
    NEW_LINE(); \
    INPUT_ARRAY3(name->input_weights, name->nb_inputs, name->nb_neurons, 3); \
    NEW_LINE(); \
    INPUT_ARRAY3(name->recurrent_weights, name->nb_neurons, name->nb_neurons, 3); \
    NEW_LINE(); \
    INPUT_ARRAY(name->bias, name->nb_neurons * 3); \
    NEW_LINE(); \
    } while (0)

    INPUT_DENSE(input_dense);
    INPUT_GRU(vad_gru);
    INPUT_GRU(noise_gru);
    INPUT_GRU(denoise_gru);
    INPUT_DENSE(denoise_output);
    INPUT_DENSE(vad_output);

    /* The VAD output must be a single probability. */
    if (vad_output->nb_neurons != 1) {
        rnnoise_model_free(ret);
        return AVERROR(EINVAL);
    }

    *rnn = ret;

    return 0;
}
  283. static int query_formats(AVFilterContext *ctx)
  284. {
  285. AVFilterFormats *formats = NULL;
  286. AVFilterChannelLayouts *layouts = NULL;
  287. static const enum AVSampleFormat sample_fmts[] = {
  288. AV_SAMPLE_FMT_FLTP,
  289. AV_SAMPLE_FMT_NONE
  290. };
  291. int ret, sample_rates[] = { 48000, -1 };
  292. formats = ff_make_format_list(sample_fmts);
  293. if (!formats)
  294. return AVERROR(ENOMEM);
  295. ret = ff_set_common_formats(ctx, formats);
  296. if (ret < 0)
  297. return ret;
  298. layouts = ff_all_channel_counts();
  299. if (!layouts)
  300. return AVERROR(ENOMEM);
  301. ret = ff_set_common_channel_layouts(ctx, layouts);
  302. if (ret < 0)
  303. return ret;
  304. formats = ff_make_format_list(sample_rates);
  305. if (!formats)
  306. return AVERROR(ENOMEM);
  307. return ff_set_common_samplerates(ctx, formats);
  308. }
  309. static int config_input(AVFilterLink *inlink)
  310. {
  311. AVFilterContext *ctx = inlink->dst;
  312. AudioRNNContext *s = ctx->priv;
  313. int ret;
  314. s->channels = inlink->channels;
  315. if (!s->st)
  316. s->st = av_calloc(s->channels, sizeof(DenoiseState));
  317. if (!s->st)
  318. return AVERROR(ENOMEM);
  319. for (int i = 0; i < s->channels; i++) {
  320. DenoiseState *st = &s->st[i];
  321. st->rnn[0].model = s->model[0];
  322. st->rnn[0].vad_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->vad_gru_size, 16));
  323. st->rnn[0].noise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->noise_gru_size, 16));
  324. st->rnn[0].denoise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->denoise_gru_size, 16));
  325. if (!st->rnn[0].vad_gru_state ||
  326. !st->rnn[0].noise_gru_state ||
  327. !st->rnn[0].denoise_gru_state)
  328. return AVERROR(ENOMEM);
  329. }
  330. for (int i = 0; i < s->channels; i++) {
  331. DenoiseState *st = &s->st[i];
  332. if (!st->tx)
  333. ret = av_tx_init(&st->tx, &st->tx_fn, AV_TX_FLOAT_FFT, 0, WINDOW_SIZE, NULL, 0);
  334. if (ret < 0)
  335. return ret;
  336. if (!st->txi)
  337. ret = av_tx_init(&st->txi, &st->txi_fn, AV_TX_FLOAT_FFT, 1, WINDOW_SIZE, NULL, 0);
  338. if (ret < 0)
  339. return ret;
  340. }
  341. return 0;
  342. }
  343. static void biquad(float *y, float mem[2], const float *x,
  344. const float *b, const float *a, int N)
  345. {
  346. for (int i = 0; i < N; i++) {
  347. float xi, yi;
  348. xi = x[i];
  349. yi = x[i] + mem[0];
  350. mem[0] = mem[1] + (b[0]*xi - a[0]*yi);
  351. mem[1] = (b[1]*xi - a[1]*yi);
  352. y[i] = yi;
  353. }
  354. }
/* memmove/memset/memcpy wrappers sized in elements instead of bytes.
 * The "+ 0*((dst)-(src))" term adds nothing at runtime; it is a
 * compile-time check that dst and src point to compatible types. */
#define RNN_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
#define RNN_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))
#define RNN_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
/* Forward FFT of a real WINDOW_SIZE signal: packs `in` into complex with
 * zero imaginary parts, transforms, and keeps only the first FREQ_SIZE
 * bins (the rest are redundant for real input). */
static void forward_transform(DenoiseState *st, AVComplexFloat *out, const float *in)
{
    AVComplexFloat x[WINDOW_SIZE];
    AVComplexFloat y[WINDOW_SIZE];

    for (int i = 0; i < WINDOW_SIZE; i++) {
        x[i].re = in[i];
        x[i].im = 0;
    }
    st->tx_fn(st->tx, y, x, sizeof(float));
    RNN_COPY(out, y, FREQ_SIZE);
}
/* Inverse FFT back to a real signal: rebuilds the full Hermitian-symmetric
 * spectrum from the FREQ_SIZE stored bins (x[i] = conj(x[WINDOW_SIZE-i])),
 * transforms, and normalizes by WINDOW_SIZE (the transform is unscaled). */
static void inverse_transform(DenoiseState *st, float *out, const AVComplexFloat *in)
{
    AVComplexFloat x[WINDOW_SIZE];
    AVComplexFloat y[WINDOW_SIZE];

    RNN_COPY(x, in, FREQ_SIZE);
    for (int i = FREQ_SIZE; i < WINDOW_SIZE; i++) {
        x[i].re = x[WINDOW_SIZE - i].re;
        x[i].im = -x[WINDOW_SIZE - i].im;
    }
    st->txi_fn(st->txi, y, x, sizeof(float));
    for (int i = 0; i < WINDOW_SIZE; i++)
        out[i] = y[i].re / WINDOW_SIZE;
}
/* Band edges (NB_BANDS of them); each value is scaled by
 * << FRAME_SIZE_SHIFT (i.e. x4) into FFT-bin indices when used. */
static const uint8_t eband5ms[] = {
/*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k 9.6 12k 15.6 20k*/
    0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
};
  386. static void compute_band_energy(float *bandE, const AVComplexFloat *X)
  387. {
  388. float sum[NB_BANDS] = {0};
  389. for (int i = 0; i < NB_BANDS - 1; i++) {
  390. int band_size;
  391. band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
  392. for (int j = 0; j < band_size; j++) {
  393. float tmp, frac = (float)j / band_size;
  394. tmp = SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].re);
  395. tmp += SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].im);
  396. sum[i] += (1.f - frac) * tmp;
  397. sum[i + 1] += frac * tmp;
  398. }
  399. }
  400. sum[0] *= 2;
  401. sum[NB_BANDS - 1] *= 2;
  402. for (int i = 0; i < NB_BANDS; i++)
  403. bandE[i] = sum[i];
  404. }
  405. static void compute_band_corr(float *bandE, const AVComplexFloat *X, const AVComplexFloat *P)
  406. {
  407. float sum[NB_BANDS] = { 0 };
  408. for (int i = 0; i < NB_BANDS - 1; i++) {
  409. int band_size;
  410. band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
  411. for (int j = 0; j < band_size; j++) {
  412. float tmp, frac = (float)j / band_size;
  413. tmp = X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re;
  414. tmp += X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im;
  415. sum[i] += (1 - frac) * tmp;
  416. sum[i + 1] += frac * tmp;
  417. }
  418. }
  419. sum[0] *= 2;
  420. sum[NB_BANDS-1] *= 2;
  421. for (int i = 0; i < NB_BANDS; i++)
  422. bandE[i] = sum[i];
  423. }
/* Analyzes one input frame: forms the 50%-overlapped window from the
 * previous frame (analysis_mem) and the new one, applies the window,
 * transforms to the spectrum X and computes band energies Ex. Updates
 * analysis_mem with the current frame for the next call. */
static void frame_analysis(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, float *Ex, const float *in)
{
    LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);

    RNN_COPY(x, st->analysis_mem, FRAME_SIZE);
    RNN_COPY(x + FRAME_SIZE, in, FRAME_SIZE);
    RNN_COPY(st->analysis_mem, in, FRAME_SIZE);
    s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);
    forward_transform(st, X, x);
    compute_band_energy(Ex, X);
}
/* Synthesizes one output frame from the processed spectrum y: inverse
 * transform, window, overlap-add with the previous frame's tail
 * (synthesis_mem), then crossfade with the dry signal in st->history:
 * out = wet*mix + dry*(1 - max(mix, 0)). Note that for mix < 0 the dry
 * term keeps full weight while the wet term goes negative. */
static void frame_synthesis(AudioRNNContext *s, DenoiseState *st, float *out, const AVComplexFloat *y)
{
    LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);
    const float *src = st->history;
    const float mix = s->mix;
    const float imix = 1.f - FFMAX(mix, 0.f);

    inverse_transform(st, x, y);
    s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);
    s->fdsp->vector_fmac_scalar(x, st->synthesis_mem, 1.f, FRAME_SIZE);
    RNN_COPY(out, x, FRAME_SIZE);
    RNN_COPY(st->synthesis_mem, &x[FRAME_SIZE], FRAME_SIZE);

    for (int n = 0; n < FRAME_SIZE; n++)
        out[n] = out[n] * mix + src[n] * imix;
}
/* Four-lag correlation kernel (from Opus/CELT): accumulates into sum[0..3]
 * the dot products of x[0..len-1] against y shifted by 0, 1, 2 and 3.
 * The main loop consumes four x samples per iteration while rotating the
 * four cached y values (y_0..y_3); the three trailing `if (j++ < len)`
 * blocks handle the len % 4 leftover samples, continuing the rotation.
 * y[0..2] are read unconditionally, and up to len+3 y values in total. */
static inline void xcorr_kernel(const float *x, const float *y, float sum[4], int len)
{
    float y_0, y_1, y_2, y_3 = 0;
    int j;

    y_0 = *y++;
    y_1 = *y++;
    y_2 = *y++;
    for (j = 0; j < len - 3; j += 4) {
        float tmp;

        tmp = *x++;
        y_3 = *y++;
        sum[0] += tmp * y_0;
        sum[1] += tmp * y_1;
        sum[2] += tmp * y_2;
        sum[3] += tmp * y_3;
        tmp = *x++;
        y_0 = *y++;
        sum[0] += tmp * y_1;
        sum[1] += tmp * y_2;
        sum[2] += tmp * y_3;
        sum[3] += tmp * y_0;
        tmp = *x++;
        y_1 = *y++;
        sum[0] += tmp * y_2;
        sum[1] += tmp * y_3;
        sum[2] += tmp * y_0;
        sum[3] += tmp * y_1;
        tmp = *x++;
        y_2 = *y++;
        sum[0] += tmp * y_3;
        sum[1] += tmp * y_0;
        sum[2] += tmp * y_1;
        sum[3] += tmp * y_2;
    }
    /* Up to three tail samples; each block advances the y rotation by one. */
    if (j++ < len) {
        float tmp = *x++;

        y_3 = *y++;
        sum[0] += tmp * y_0;
        sum[1] += tmp * y_1;
        sum[2] += tmp * y_2;
        sum[3] += tmp * y_3;
    }
    if (j++ < len) {
        float tmp=*x++;

        y_0 = *y++;
        sum[0] += tmp * y_1;
        sum[1] += tmp * y_2;
        sum[2] += tmp * y_3;
        sum[3] += tmp * y_0;
    }
    if (j < len) {
        float tmp=*x++;

        y_1 = *y++;
        sum[0] += tmp * y_2;
        sum[1] += tmp * y_3;
        sum[2] += tmp * y_0;
        sum[3] += tmp * y_1;
    }
}
  507. static inline float celt_inner_prod(const float *x,
  508. const float *y, int N)
  509. {
  510. float xy = 0.f;
  511. for (int i = 0; i < N; i++)
  512. xy += x[i] * y[i];
  513. return xy;
  514. }
  515. static void celt_pitch_xcorr(const float *x, const float *y,
  516. float *xcorr, int len, int max_pitch)
  517. {
  518. int i;
  519. for (i = 0; i < max_pitch - 3; i += 4) {
  520. float sum[4] = { 0, 0, 0, 0};
  521. xcorr_kernel(x, y + i, sum, len);
  522. xcorr[i] = sum[0];
  523. xcorr[i + 1] = sum[1];
  524. xcorr[i + 2] = sum[2];
  525. xcorr[i + 3] = sum[3];
  526. }
  527. /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
  528. for (; i < max_pitch; i++) {
  529. xcorr[i] = celt_inner_prod(x, y + i, len);
  530. }
  531. }
/* Autocorrelation ac[0..lag] of x[0..n-1] (from Opus/CELT). When
 * overlap > 0 the first and last `overlap` samples are tapered by
 * `window` into the local copy xx (n must then fit PITCH_BUF_SIZE>>1,
 * which holds for the only caller, pitch_downsample()). The bulk of each
 * lag is computed via celt_pitch_xcorr() over the first fastN samples,
 * with the remaining tail summed directly. The return value (shift) is a
 * fixed-point legacy and is always 0 in this float port. */
static int celt_autocorr(const float *x,   /*  in: [0...n-1] samples x   */
                         float *ac,        /* out: [0...lag-1] ac values */
                         const float *window,
                         int overlap,
                         int lag,
                         int n)
{
    int fastN = n - lag;
    int shift;
    const float *xptr;
    float xx[PITCH_BUF_SIZE>>1];

    if (overlap == 0) {
        xptr = x;
    } else {
        for (int i = 0; i < n; i++)
            xx[i] = x[i];
        for (int i = 0; i < overlap; i++) {
            xx[i] = x[i] * window[i];
            xx[n-i-1] = x[n-i-1] * window[i];
        }
        xptr = xx;
    }
    shift = 0;
    celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1);
    for (int k = 0; k <= lag; k++) {
        float d = 0.f;

        for (int i = k + fastN; i < n; i++)
            d += xptr[i] * xptr[i-k];
        ac[k] += d;
    }

    return shift;
}
/* Levinson-Durbin recursion: derives p LPC coefficients from the
 * autocorrelation values ac[0..p]. The inner loop updates the coefficient
 * array in place in symmetric pairs (j, i-1-j), which is order-sensitive.
 * If ac[0] == 0 (silent input) the coefficients stay zeroed. Iteration
 * stops early once the residual error drops below .001 * ac[0]
 * (~30 dB of prediction gain). */
static void celt_lpc(float *lpc, /* out: [0...p-1] LPC coefficients      */
                     const float *ac,  /* in:  [0...p] autocorrelation values  */
                     int p)
{
    float r, error = ac[0];

    RNN_CLEAR(lpc, p);
    if (ac[0] != 0) {
        for (int i = 0; i < p; i++) {
            /* Sum up this iteration's reflection coefficient */
            float rr = 0;
            for (int j = 0; j < i; j++)
                rr += (lpc[j] * ac[i - j]);
            rr += ac[i + 1];
            r = -rr/error;
            /*  Update LPC coefficients and total error */
            lpc[i] = r;
            for (int j = 0; j < (i + 1) >> 1; j++) {
                float tmp1, tmp2;
                tmp1 = lpc[j];
                tmp2 = lpc[i-1-j];
                lpc[j]     = tmp1 + (r*tmp2);
                lpc[i-1-j] = tmp2 + (r*tmp1);
            }

            error = error - (r * r *error);
            /* Bail out once we get 30 dB gain */
            if (error < .001f * ac[0])
                break;
        }
    }
}
  594. static void celt_fir5(const float *x,
  595. const float *num,
  596. float *y,
  597. int N,
  598. float *mem)
  599. {
  600. float num0, num1, num2, num3, num4;
  601. float mem0, mem1, mem2, mem3, mem4;
  602. num0 = num[0];
  603. num1 = num[1];
  604. num2 = num[2];
  605. num3 = num[3];
  606. num4 = num[4];
  607. mem0 = mem[0];
  608. mem1 = mem[1];
  609. mem2 = mem[2];
  610. mem3 = mem[3];
  611. mem4 = mem[4];
  612. for (int i = 0; i < N; i++) {
  613. float sum = x[i];
  614. sum += (num0*mem0);
  615. sum += (num1*mem1);
  616. sum += (num2*mem2);
  617. sum += (num3*mem3);
  618. sum += (num4*mem4);
  619. mem4 = mem3;
  620. mem3 = mem2;
  621. mem2 = mem1;
  622. mem1 = mem0;
  623. mem0 = x[i];
  624. y[i] = sum;
  625. }
  626. mem[0] = mem0;
  627. mem[1] = mem1;
  628. mem[2] = mem2;
  629. mem[3] = mem3;
  630. mem[4] = mem4;
  631. }
/* Downsamples the pitch buffer by 2 with a [.25 .5 .25] smoothing kernel
 * (summing a second channel in when C == 2), then applies a 4th-order LPC
 * whitening-style filter: autocorrelation with a small noise floor and lag
 * window, Levinson-Durbin, coefficients shrunk by a 0.9^i taper, and an
 * extra zero folded in (lpc2 = lpc * (1 + c1*z^-1) with the implicit
 * leading 1 contributing the +.8 and c1 terms). x_lp is filtered in place
 * by celt_fir5(). */
static void pitch_downsample(float *x[], float *x_lp,
                             int len, int C)
{
    float ac[5];
    float tmp=Q15ONE;
    float lpc[4], mem[5]={0,0,0,0,0};
    float lpc2[5];
    float c1 = .8f;

    for (int i = 1; i < len >> 1; i++)
        x_lp[i] = .5f * (.5f * (x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]);
    x_lp[0] = .5f * (.5f * (x[0][1])+x[0][0]);
    if (C==2) {
        for (int i = 1; i < len >> 1; i++)
            x_lp[i] += (.5f * (.5f * (x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]));
        x_lp[0] += .5f * (.5f * (x[1][1])+x[1][0]);
    }

    celt_autocorr(x_lp, ac, NULL, 0, 4, len>>1);

    /* Noise floor -40 dB */
    ac[0] *= 1.0001f;
    /* Lag windowing */
    for (int i = 1; i <= 4; i++) {
        /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
        ac[i] -= ac[i]*(.008f*i)*(.008f*i);
    }

    celt_lpc(lpc, ac, 4);
    for (int i = 0; i < 4; i++) {
        tmp = .9f * tmp;
        lpc[i] = (lpc[i] * tmp);
    }
    /* Add a zero */
    lpc2[0] = lpc[0] + .8f;
    lpc2[1] = lpc[1] + (c1 * lpc[0]);
    lpc2[2] = lpc[2] + (c1 * lpc[1]);
    lpc2[3] = lpc[3] + (c1 * lpc[2]);
    lpc2[4] = (c1 * lpc[3]);
    celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);
}
  680. static float compute_pitch_gain(float xy, float xx, float yy)
  681. {
  682. return xy / sqrtf(1.f + xx * yy);
  683. }
/* For a candidate sub-period T/k, the secondary correlation lag checked by
 * remove_doubling() is T*second_check[k]/k (table from Opus/CELT). */
static const uint8_t second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
/* Period-doubling removal (from Opus/CELT): given an initial pitch
 * estimate *T0_, checks every sub-period T0/k (k = 2..15) and switches to
 * it when its normalized correlation beats an adaptive threshold, biased
 * toward continuity with the previous frame (prev_period/prev_gain) and
 * against very short periods. Everything runs on a 2x-downsampled view
 * (all periods are halved on entry and doubled on exit, with a final
 * +/-1 offset refinement). Returns the pitch gain; updates *T0_. */
static float remove_doubling(float *x, int maxperiod, int minperiod, int N,
                             int *T0_, int prev_period, float prev_gain)
{
    int k, i, T, T0;
    float g, g0;
    float pg;
    float xy,xx,yy,xy2;
    float xcorr[3];
    float best_xy, best_yy;
    int offset;
    int minperiod0;
    float yy_lookup[PITCH_MAX_PERIOD+1];

    minperiod0 = minperiod;
    maxperiod /= 2;
    minperiod /= 2;
    *T0_ /= 2;
    prev_period /= 2;
    N /= 2;
    x += maxperiod;          /* x[-i] now indexes into the history */
    if (*T0_>=maxperiod)
        *T0_=maxperiod-1;

    T = T0 = *T0_;
    dual_inner_prod(x, x, x-T0, N, &xx, &xy);
    /* yy_lookup[i]: energy of the length-N window starting i samples back,
     * maintained incrementally as the window slides. */
    yy_lookup[0] = xx;
    yy=xx;
    for (i = 1; i <= maxperiod; i++) {
        yy = yy+(x[-i] * x[-i])-(x[N-i] * x[N-i]);
        yy_lookup[i] = FFMAX(0, yy);
    }
    yy = yy_lookup[T0];
    best_xy = xy;
    best_yy = yy;
    g = g0 = compute_pitch_gain(xy, xx, yy);
    /* Look for any pitch at T/k */
    for (k = 2; k <= 15; k++) {
        int T1, T1b;
        float g1;
        float cont=0;
        float thresh;
        T1 = (2*T0+k)/(2*k);   /* round(T0/k) */
        if (T1 < minperiod)
            break;
        /* Look for another strong correlation at T1b */
        if (k==2) {
            if (T1+T0>maxperiod)
                T1b = T0;
            else
                T1b = T0+T1;
        } else {
            T1b = (2*second_check[k]*T0+k)/(2*k);
        }
        dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
        xy = .5f * (xy + xy2);
        yy = .5f * (yy_lookup[T1] + yy_lookup[T1b]);
        g1 = compute_pitch_gain(xy, xx, yy);
        /* Continuity bonus when T1 is close to the previous period. */
        if (FFABS(T1-prev_period)<=1)
            cont = prev_gain;
        else if (FFABS(T1-prev_period)<=2 && 5 * k * k < T0)
            cont = prev_gain * .5f;
        else
            cont = 0;
        thresh = FFMAX(.3f, (.7f * g0) - cont);
        /* Bias against very high pitch (very short period) to avoid false-positives
           due to short-term correlation */
        /* NOTE(review): the second branch is unreachable (T1 < 2*minperiod
         * implies T1 < 3*minperiod); same ordering as the reference Opus
         * implementation, kept as-is. */
        if (T1<3*minperiod)
            thresh = FFMAX(.4f, (.85f * g0) - cont);
        else if (T1<2*minperiod)
            thresh = FFMAX(.5f, (.9f * g0) - cont);
        if (g1 > thresh) {
            best_xy = xy;
            best_yy = yy;
            T = T1;
            g = g1;
        }
    }
    best_xy = FFMAX(0, best_xy);
    if (best_yy <= best_xy)
        pg = Q15ONE;
    else
        pg = best_xy/(best_yy + 1);
    /* +/-1 offset refinement from three neighboring correlations. */
    for (k = 0; k < 3; k++)
        xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
    if ((xcorr[2]-xcorr[0]) > .7f * (xcorr[1]-xcorr[0]))
        offset = 1;
    else if ((xcorr[0]-xcorr[2]) > (.7f * (xcorr[1] - xcorr[2])))
        offset = -1;
    else
        offset = 0;
    if (pg > g)
        pg = g;
    *T0_ = 2*T+offset;

    if (*T0_<minperiod0)
        *T0_=minperiod0;
    return pg;
}
/* Scans the lag axis and keeps the two lags with the highest normalized
 * correlation xcorr^2 / Syy. Comparisons are done cross-multiplied
 * (num * best_den > best_num * Syy) to avoid divisions; Syy, the energy of
 * the length-len window at the current lag, is updated incrementally as
 * the window slides. xcorr is scaled by 1e-12 before squaring so the
 * square neither underflows nor overflows (see comment below). Results
 * land in best_pitch[0] (best) and best_pitch[1] (runner-up). */
static void find_best_pitch(float *xcorr, float *y, int len,
                            int max_pitch, int *best_pitch)
{
    float best_num[2];
    float best_den[2];
    float Syy = 1.f;

    best_num[0] = -1;
    best_num[1] = -1;
    best_den[0] = 0;
    best_den[1] = 0;
    best_pitch[0] = 0;
    best_pitch[1] = 1;

    for (int j = 0; j < len; j++)
        Syy += y[j] * y[j];

    for (int i = 0; i < max_pitch; i++) {
        if (xcorr[i]>0) {
            float num;
            float xcorr16;

            xcorr16 = xcorr[i];
            /* Considering the range of xcorr16, this should avoid both underflows
               and overflows (inf) when squaring xcorr16 */
            xcorr16 *= 1e-12f;
            num = xcorr16 * xcorr16;
            if ((num * best_den[1]) > (best_num[1] * Syy)) {
                if ((num * best_den[0]) > (best_num[0] * Syy)) {
                    /* New best: demote the previous best to runner-up. */
                    best_num[1] = best_num[0];
                    best_den[1] = best_den[0];
                    best_pitch[1] = best_pitch[0];
                    best_num[0] = num;
                    best_den[0] = Syy;
                    best_pitch[0] = i;
                } else {
                    best_num[1] = num;
                    best_den[1] = Syy;
                    best_pitch[1] = i;
                }
            }
        }
        Syy += y[i+len]*y[i+len] - y[i] * y[i];
        Syy = FFMAX(1, Syy);
    }
}
/* Two-stage pitch search (from Opus/CELT) on the already 2x-downsampled
 * signal: a coarse pass at 4x decimation over all lags, then a 2x-rate
 * pass restricted to lags within +/-2 of the two best coarse candidates,
 * finally a 3-point pseudo-interpolation picking a -1/0/+1 offset.
 * The result (*pitch) is a lag at the 2x-downsampled rate. */
static void pitch_search(const float *x_lp, float *y,
                         int len, int max_pitch, int *pitch)
{
    int lag;
    int best_pitch[2]={0,0};
    int offset;
    float x_lp4[WINDOW_SIZE];
    float y_lp4[WINDOW_SIZE];
    float xcorr[WINDOW_SIZE];

    lag = len+max_pitch;

    /* Downsample by 2 again */
    for (int j = 0; j < len >> 2; j++)
        x_lp4[j] = x_lp[2*j];
    for (int j = 0; j < lag >> 2; j++)
        y_lp4[j] = y[2*j];

    /* Coarse search with 4x decimation */
    celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);

    find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch);

    /* Finer search with 2x decimation */
    for (int i = 0; i < max_pitch >> 1; i++) {
        float sum;
        xcorr[i] = 0;
        if (FFABS(i-2*best_pitch[0])>2 && FFABS(i-2*best_pitch[1])>2)
            continue;
        sum = celt_inner_prod(x_lp, y+i, len>>1);
        xcorr[i] = FFMAX(-1, sum);
    }

    find_best_pitch(xcorr, y, len>>1, max_pitch>>1, best_pitch);

    /* Refine by pseudo-interpolation */
    if (best_pitch[0] > 0 && best_pitch[0] < (max_pitch >> 1) - 1) {
        float a, b, c;

        a = xcorr[best_pitch[0] - 1];
        b = xcorr[best_pitch[0]];
        c = xcorr[best_pitch[0] + 1];
        if (c - a > .7f * (b - a))
            offset = 1;
        else if (a - c > .7f * (b-c))
            offset = -1;
        else
            offset = 0;
    } else {
        offset = 0;
    }

    *pitch = 2 * best_pitch[0] - offset;
}
/* 22-point DCT of the band vector `in`, computed as dot products against
 * the precomputed basis rows in s->dct_table (padded to FFALIGN(22, 4)
 * for the SIMD scalarproduct), scaled by sqrt(2/22) for normalization. */
static void dct(AudioRNNContext *s, float *out, const float *in)
{
    for (int i = 0; i < NB_BANDS; i++) {
        float sum;

        sum = s->fdsp->scalarproduct_float(in, s->dct_table[i], FFALIGN(NB_BANDS, 4));
        out[i] = sum * sqrtf(2.f / 22);
    }
}
/* Builds the NB_FEATURES (42) RNN input features for one frame and, as
 * side outputs, the signal spectrum X with band energies Ex, the
 * pitch-delayed spectrum P with band energies Ep, and the normalized
 * band correlation Exp. Feature layout: [0..21] band cepstrum,
 * [22..27] first delta cepstrum, [28..33] second delta (acceleration),
 * [34..39] DCT of the pitch correlation, [40] pitch period, [41] spectral
 * variability. Returns 1 (with features zeroed) when the frame is near
 * silence so the recurrent state is not polluted, 0 otherwise. */
static int compute_frame_features(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, AVComplexFloat *P,
                                  float *Ex, float *Ep, float *Exp, float *features, const float *in)
{
    float E = 0;
    float *ceps_0, *ceps_1, *ceps_2;
    float spec_variability = 0;
    LOCAL_ALIGNED_32(float, Ly, [NB_BANDS]);
    LOCAL_ALIGNED_32(float, p, [WINDOW_SIZE]);
    float pitch_buf[PITCH_BUF_SIZE>>1];
    int pitch_index;
    float gain;
    float *(pre[1]);
    float tmp[NB_BANDS];
    float follow, logMax;

    frame_analysis(s, st, X, Ex, in);
    /* Slide the pitch history and append the new frame. */
    RNN_MOVE(st->pitch_buf, &st->pitch_buf[FRAME_SIZE], PITCH_BUF_SIZE-FRAME_SIZE);
    RNN_COPY(&st->pitch_buf[PITCH_BUF_SIZE-FRAME_SIZE], in, FRAME_SIZE);
    pre[0] = &st->pitch_buf[0];
    pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1);
    pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE,
                 PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index);
    pitch_index = PITCH_MAX_PERIOD-pitch_index;

    gain = remove_doubling(pitch_buf, PITCH_MAX_PERIOD, PITCH_MIN_PERIOD,
                           PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain);
    st->last_period = pitch_index;
    st->last_gain = gain;

    /* Spectrum of the signal delayed by one pitch period. */
    for (int i = 0; i < WINDOW_SIZE; i++)
        p[i] = st->pitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i];

    s->fdsp->vector_fmul(p, p, s->window, WINDOW_SIZE);
    forward_transform(st, P, p);
    compute_band_energy(Ep, P);
    compute_band_corr(Exp, X, P);

    /* Normalize the correlation and keep its first 6 DCT coefficients. */
    for (int i = 0; i < NB_BANDS; i++)
        Exp[i] = Exp[i] / sqrtf(.001f+Ex[i]*Ep[i]);

    dct(s, tmp, Exp);

    for (int i = 0; i < NB_DELTA_CEPS; i++)
        features[NB_BANDS+2*NB_DELTA_CEPS+i] = tmp[i];

    features[NB_BANDS+2*NB_DELTA_CEPS] -= 1.3;
    features[NB_BANDS+2*NB_DELTA_CEPS+1] -= 0.9;
    features[NB_BANDS+3*NB_DELTA_CEPS] = .01*(pitch_index-300);

    /* Log band energies with an absolute floor (logMax-7) and a decaying
     * "follow" floor so isolated weak bands do not dominate. */
    logMax = -2;
    follow = -2;
    for (int i = 0; i < NB_BANDS; i++) {
        Ly[i] = log10f(1e-2f + Ex[i]);
        Ly[i] = FFMAX(logMax-7, FFMAX(follow-1.5, Ly[i]));
        logMax = FFMAX(logMax, Ly[i]);
        follow = FFMAX(follow-1.5, Ly[i]);
        E += Ex[i];
    }

    if (E < 0.04f) {
        /* If there's no audio, avoid messing up the state. */
        RNN_CLEAR(features, NB_FEATURES);
        return 1;
    }

    dct(s, features, Ly);
    features[0] -= 12;
    features[1] -= 4;
    /* Ring buffer of the last CEPS_MEM cepstra for delta features. */
    ceps_0 = st->cepstral_mem[st->memid];
    ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1];
    ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2];

    for (int i = 0; i < NB_BANDS; i++)
        ceps_0[i] = features[i];

    st->memid++;
    for (int i = 0; i < NB_DELTA_CEPS; i++) {
        features[i] = ceps_0[i] + ceps_1[i] + ceps_2[i];
        features[NB_BANDS+i] = ceps_0[i] - ceps_2[i];
        features[NB_BANDS+NB_DELTA_CEPS+i] =  ceps_0[i] - 2*ceps_1[i] + ceps_2[i];
    }
    /* Spectral variability features. */
    if (st->memid == CEPS_MEM)
        st->memid = 0;

    /* Mean over history frames of the distance to their nearest other
     * frame in the cepstral ring buffer. */
    for (int i = 0; i < CEPS_MEM; i++) {
        float mindist = 1e15f;
        for (int j = 0; j < CEPS_MEM; j++) {
            float dist = 0.f;
            for (int k = 0; k < NB_BANDS; k++) {
                float tmp;

                tmp = st->cepstral_mem[i][k] - st->cepstral_mem[j][k];
                dist += tmp*tmp;
            }

            if (j != i)
                mindist = FFMIN(mindist, dist);
        }

        spec_variability += mindist;
    }

    features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1;

    return 0;
}
  966. static void interp_band_gain(float *g, const float *bandE)
  967. {
  968. memset(g, 0, sizeof(*g) * FREQ_SIZE);
  969. for (int i = 0; i < NB_BANDS - 1; i++) {
  970. const int band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
  971. for (int j = 0; j < band_size; j++) {
  972. float frac = (float)j / band_size;
  973. g[(eband5ms[i] << FRAME_SIZE_SHIFT) + j] = (1.f - frac) * bandE[i] + frac * bandE[i + 1];
  974. }
  975. }
  976. }
  977. static void pitch_filter(AVComplexFloat *X, const AVComplexFloat *P, const float *Ex, const float *Ep,
  978. const float *Exp, const float *g)
  979. {
  980. float newE[NB_BANDS];
  981. float r[NB_BANDS];
  982. float norm[NB_BANDS];
  983. float rf[FREQ_SIZE] = {0};
  984. float normf[FREQ_SIZE]={0};
  985. for (int i = 0; i < NB_BANDS; i++) {
  986. if (Exp[i]>g[i]) r[i] = 1;
  987. else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i])));
  988. r[i] = sqrtf(av_clipf(r[i], 0, 1));
  989. r[i] *= sqrtf(Ex[i]/(1e-8+Ep[i]));
  990. }
  991. interp_band_gain(rf, r);
  992. for (int i = 0; i < FREQ_SIZE; i++) {
  993. X[i].re += rf[i]*P[i].re;
  994. X[i].im += rf[i]*P[i].im;
  995. }
  996. compute_band_energy(newE, X);
  997. for (int i = 0; i < NB_BANDS; i++) {
  998. norm[i] = sqrtf(Ex[i] / (1e-8+newE[i]));
  999. }
  1000. interp_band_gain(normf, norm);
  1001. for (int i = 0; i < FREQ_SIZE; i++) {
  1002. X[i].re *= normf[i];
  1003. X[i].im *= normf[i];
  1004. }
  1005. }
  1006. static const float tansig_table[201] = {
  1007. 0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
  1008. 0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
  1009. 0.379949f, 0.413644f, 0.446244f, 0.477700f, 0.507977f,
  1010. 0.537050f, 0.564900f, 0.591519f, 0.616909f, 0.641077f,
  1011. 0.664037f, 0.685809f, 0.706419f, 0.725897f, 0.744277f,
  1012. 0.761594f, 0.777888f, 0.793199f, 0.807569f, 0.821040f,
  1013. 0.833655f, 0.845456f, 0.856485f, 0.866784f, 0.876393f,
  1014. 0.885352f, 0.893698f, 0.901468f, 0.908698f, 0.915420f,
  1015. 0.921669f, 0.927473f, 0.932862f, 0.937863f, 0.942503f,
  1016. 0.946806f, 0.950795f, 0.954492f, 0.957917f, 0.961090f,
  1017. 0.964028f, 0.966747f, 0.969265f, 0.971594f, 0.973749f,
  1018. 0.975743f, 0.977587f, 0.979293f, 0.980869f, 0.982327f,
  1019. 0.983675f, 0.984921f, 0.986072f, 0.987136f, 0.988119f,
  1020. 0.989027f, 0.989867f, 0.990642f, 0.991359f, 0.992020f,
  1021. 0.992631f, 0.993196f, 0.993718f, 0.994199f, 0.994644f,
  1022. 0.995055f, 0.995434f, 0.995784f, 0.996108f, 0.996407f,
  1023. 0.996682f, 0.996937f, 0.997172f, 0.997389f, 0.997590f,
  1024. 0.997775f, 0.997946f, 0.998104f, 0.998249f, 0.998384f,
  1025. 0.998508f, 0.998623f, 0.998728f, 0.998826f, 0.998916f,
  1026. 0.999000f, 0.999076f, 0.999147f, 0.999213f, 0.999273f,
  1027. 0.999329f, 0.999381f, 0.999428f, 0.999472f, 0.999513f,
  1028. 0.999550f, 0.999585f, 0.999617f, 0.999646f, 0.999673f,
  1029. 0.999699f, 0.999722f, 0.999743f, 0.999763f, 0.999781f,
  1030. 0.999798f, 0.999813f, 0.999828f, 0.999841f, 0.999853f,
  1031. 0.999865f, 0.999875f, 0.999885f, 0.999893f, 0.999902f,
  1032. 0.999909f, 0.999916f, 0.999923f, 0.999929f, 0.999934f,
  1033. 0.999939f, 0.999944f, 0.999948f, 0.999952f, 0.999956f,
  1034. 0.999959f, 0.999962f, 0.999965f, 0.999968f, 0.999970f,
  1035. 0.999973f, 0.999975f, 0.999977f, 0.999978f, 0.999980f,
  1036. 0.999982f, 0.999983f, 0.999984f, 0.999986f, 0.999987f,
  1037. 0.999988f, 0.999989f, 0.999990f, 0.999990f, 0.999991f,
  1038. 0.999992f, 0.999992f, 0.999993f, 0.999994f, 0.999994f,
  1039. 0.999994f, 0.999995f, 0.999995f, 0.999996f, 0.999996f,
  1040. 0.999996f, 0.999997f, 0.999997f, 0.999997f, 0.999997f,
  1041. 0.999997f, 0.999998f, 0.999998f, 0.999998f, 0.999998f,
  1042. 0.999998f, 0.999998f, 0.999999f, 0.999999f, 0.999999f,
  1043. 0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
  1044. 0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
  1045. 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
  1046. 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
  1047. 1.000000f,
  1048. };
  1049. static inline float tansig_approx(float x)
  1050. {
  1051. float y, dy;
  1052. float sign=1;
  1053. int i;
  1054. /* Tests are reversed to catch NaNs */
  1055. if (!(x<8))
  1056. return 1;
  1057. if (!(x>-8))
  1058. return -1;
  1059. /* Another check in case of -ffast-math */
  1060. if (isnan(x))
  1061. return 0;
  1062. if (x < 0) {
  1063. x=-x;
  1064. sign=-1;
  1065. }
  1066. i = (int)floor(.5f+25*x);
  1067. x -= .04f*i;
  1068. y = tansig_table[i];
  1069. dy = 1-y*y;
  1070. y = y + x*dy*(1 - y*x);
  1071. return sign*y;
  1072. }
  1073. static inline float sigmoid_approx(float x)
  1074. {
  1075. return .5f + .5f*tansig_approx(.5f*x);
  1076. }
  1077. static void compute_dense(const DenseLayer *layer, float *output, const float *input)
  1078. {
  1079. const int N = layer->nb_neurons, M = layer->nb_inputs, stride = N;
  1080. for (int i = 0; i < N; i++) {
  1081. /* Compute update gate. */
  1082. float sum = layer->bias[i];
  1083. for (int j = 0; j < M; j++)
  1084. sum += layer->input_weights[j * stride + i] * input[j];
  1085. output[i] = WEIGHTS_SCALE * sum;
  1086. }
  1087. if (layer->activation == ACTIVATION_SIGMOID) {
  1088. for (int i = 0; i < N; i++)
  1089. output[i] = sigmoid_approx(output[i]);
  1090. } else if (layer->activation == ACTIVATION_TANH) {
  1091. for (int i = 0; i < N; i++)
  1092. output[i] = tansig_approx(output[i]);
  1093. } else if (layer->activation == ACTIVATION_RELU) {
  1094. for (int i = 0; i < N; i++)
  1095. output[i] = FFMAX(0, output[i]);
  1096. } else {
  1097. av_assert0(0);
  1098. }
  1099. }
  1100. static void compute_gru(AudioRNNContext *s, const GRULayer *gru, float *state, const float *input)
  1101. {
  1102. LOCAL_ALIGNED_32(float, z, [MAX_NEURONS]);
  1103. LOCAL_ALIGNED_32(float, r, [MAX_NEURONS]);
  1104. LOCAL_ALIGNED_32(float, h, [MAX_NEURONS]);
  1105. const int M = gru->nb_inputs;
  1106. const int N = gru->nb_neurons;
  1107. const int AN = FFALIGN(N, 4);
  1108. const int AM = FFALIGN(M, 4);
  1109. const int stride = 3 * AN, istride = 3 * AM;
  1110. for (int i = 0; i < N; i++) {
  1111. /* Compute update gate. */
  1112. float sum = gru->bias[i];
  1113. sum += s->fdsp->scalarproduct_float(gru->input_weights + i * istride, input, AM);
  1114. sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + i * stride, state, AN);
  1115. z[i] = sigmoid_approx(WEIGHTS_SCALE * sum);
  1116. }
  1117. for (int i = 0; i < N; i++) {
  1118. /* Compute reset gate. */
  1119. float sum = gru->bias[N + i];
  1120. sum += s->fdsp->scalarproduct_float(gru->input_weights + AM + i * istride, input, AM);
  1121. sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + AN + i * stride, state, AN);
  1122. r[i] = sigmoid_approx(WEIGHTS_SCALE * sum);
  1123. }
  1124. for (int i = 0; i < N; i++) {
  1125. /* Compute output. */
  1126. float sum = gru->bias[2 * N + i];
  1127. sum += s->fdsp->scalarproduct_float(gru->input_weights + 2 * AM + i * istride, input, AM);
  1128. for (int j = 0; j < N; j++)
  1129. sum += gru->recurrent_weights[2 * AN + i * stride + j] * state[j] * r[j];
  1130. if (gru->activation == ACTIVATION_SIGMOID)
  1131. sum = sigmoid_approx(WEIGHTS_SCALE * sum);
  1132. else if (gru->activation == ACTIVATION_TANH)
  1133. sum = tansig_approx(WEIGHTS_SCALE * sum);
  1134. else if (gru->activation == ACTIVATION_RELU)
  1135. sum = FFMAX(0, WEIGHTS_SCALE * sum);
  1136. else
  1137. av_assert0(0);
  1138. h[i] = z[i] * state[i] + (1.f - z[i]) * sum;
  1139. }
  1140. RNN_COPY(state, h, N);
  1141. }
  1142. #define INPUT_SIZE 42
  1143. static void compute_rnn(AudioRNNContext *s, RNNState *rnn, float *gains, float *vad, const float *input)
  1144. {
  1145. LOCAL_ALIGNED_32(float, dense_out, [MAX_NEURONS]);
  1146. LOCAL_ALIGNED_32(float, noise_input, [MAX_NEURONS * 3]);
  1147. LOCAL_ALIGNED_32(float, denoise_input, [MAX_NEURONS * 3]);
  1148. compute_dense(rnn->model->input_dense, dense_out, input);
  1149. compute_gru(s, rnn->model->vad_gru, rnn->vad_gru_state, dense_out);
  1150. compute_dense(rnn->model->vad_output, vad, rnn->vad_gru_state);
  1151. memcpy(noise_input, dense_out, rnn->model->input_dense_size * sizeof(float));
  1152. memcpy(noise_input + rnn->model->input_dense_size,
  1153. rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));
  1154. memcpy(noise_input + rnn->model->input_dense_size + rnn->model->vad_gru_size,
  1155. input, INPUT_SIZE * sizeof(float));
  1156. compute_gru(s, rnn->model->noise_gru, rnn->noise_gru_state, noise_input);
  1157. memcpy(denoise_input, rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));
  1158. memcpy(denoise_input + rnn->model->vad_gru_size,
  1159. rnn->noise_gru_state, rnn->model->noise_gru_size * sizeof(float));
  1160. memcpy(denoise_input + rnn->model->vad_gru_size + rnn->model->noise_gru_size,
  1161. input, INPUT_SIZE * sizeof(float));
  1162. compute_gru(s, rnn->model->denoise_gru, rnn->denoise_gru_state, denoise_input);
  1163. compute_dense(rnn->model->denoise_output, gains, rnn->denoise_gru_state);
  1164. }
  1165. static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in,
  1166. int disabled)
  1167. {
  1168. AVComplexFloat X[FREQ_SIZE];
  1169. AVComplexFloat P[WINDOW_SIZE];
  1170. float x[FRAME_SIZE];
  1171. float Ex[NB_BANDS], Ep[NB_BANDS];
  1172. LOCAL_ALIGNED_32(float, Exp, [NB_BANDS]);
  1173. float features[NB_FEATURES];
  1174. float g[NB_BANDS];
  1175. float gf[FREQ_SIZE];
  1176. float vad_prob = 0;
  1177. float *history = st->history;
  1178. static const float a_hp[2] = {-1.99599, 0.99600};
  1179. static const float b_hp[2] = {-2, 1};
  1180. int silence;
  1181. biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
  1182. silence = compute_frame_features(s, st, X, P, Ex, Ep, Exp, features, x);
  1183. if (!silence && !disabled) {
  1184. compute_rnn(s, &st->rnn[0], g, &vad_prob, features);
  1185. pitch_filter(X, P, Ex, Ep, Exp, g);
  1186. for (int i = 0; i < NB_BANDS; i++) {
  1187. float alpha = .6f;
  1188. g[i] = FFMAX(g[i], alpha * st->lastg[i]);
  1189. st->lastg[i] = g[i];
  1190. }
  1191. interp_band_gain(gf, g);
  1192. for (int i = 0; i < FREQ_SIZE; i++) {
  1193. X[i].re *= gf[i];
  1194. X[i].im *= gf[i];
  1195. }
  1196. }
  1197. frame_synthesis(s, st, out, X);
  1198. memcpy(history, in, FRAME_SIZE * sizeof(*history));
  1199. return vad_prob;
  1200. }
  1201. typedef struct ThreadData {
  1202. AVFrame *in, *out;
  1203. } ThreadData;
  1204. static int rnnoise_channels(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
  1205. {
  1206. AudioRNNContext *s = ctx->priv;
  1207. ThreadData *td = arg;
  1208. AVFrame *in = td->in;
  1209. AVFrame *out = td->out;
  1210. const int start = (out->channels * jobnr) / nb_jobs;
  1211. const int end = (out->channels * (jobnr+1)) / nb_jobs;
  1212. for (int ch = start; ch < end; ch++) {
  1213. rnnoise_channel(s, &s->st[ch],
  1214. (float *)out->extended_data[ch],
  1215. (const float *)in->extended_data[ch],
  1216. ctx->is_disabled);
  1217. }
  1218. return 0;
  1219. }
  1220. static int filter_frame(AVFilterLink *inlink, AVFrame *in)
  1221. {
  1222. AVFilterContext *ctx = inlink->dst;
  1223. AVFilterLink *outlink = ctx->outputs[0];
  1224. AVFrame *out = NULL;
  1225. ThreadData td;
  1226. out = ff_get_audio_buffer(outlink, FRAME_SIZE);
  1227. if (!out) {
  1228. av_frame_free(&in);
  1229. return AVERROR(ENOMEM);
  1230. }
  1231. out->pts = in->pts;
  1232. td.in = in; td.out = out;
  1233. ctx->internal->execute(ctx, rnnoise_channels, &td, NULL, FFMIN(outlink->channels,
  1234. ff_filter_get_nb_threads(ctx)));
  1235. av_frame_free(&in);
  1236. return ff_filter_frame(outlink, out);
  1237. }
  1238. static int activate(AVFilterContext *ctx)
  1239. {
  1240. AVFilterLink *inlink = ctx->inputs[0];
  1241. AVFilterLink *outlink = ctx->outputs[0];
  1242. AVFrame *in = NULL;
  1243. int ret;
  1244. FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);
  1245. ret = ff_inlink_consume_samples(inlink, FRAME_SIZE, FRAME_SIZE, &in);
  1246. if (ret < 0)
  1247. return ret;
  1248. if (ret > 0)
  1249. return filter_frame(inlink, in);
  1250. FF_FILTER_FORWARD_STATUS(inlink, outlink);
  1251. FF_FILTER_FORWARD_WANTED(outlink, inlink);
  1252. return FFERROR_NOT_READY;
  1253. }
  1254. static int open_model(AVFilterContext *ctx, RNNModel **model)
  1255. {
  1256. AudioRNNContext *s = ctx->priv;
  1257. int ret;
  1258. FILE *f;
  1259. if (!s->model_name)
  1260. return AVERROR(EINVAL);
  1261. f = av_fopen_utf8(s->model_name, "r");
  1262. if (!f) {
  1263. av_log(ctx, AV_LOG_ERROR, "Failed to open model file: %s\n", s->model_name);
  1264. return AVERROR(EINVAL);
  1265. }
  1266. ret = rnnoise_model_from_file(f, model);
  1267. fclose(f);
  1268. if (!*model || ret < 0)
  1269. return ret;
  1270. return 0;
  1271. }
  1272. static av_cold int init(AVFilterContext *ctx)
  1273. {
  1274. AudioRNNContext *s = ctx->priv;
  1275. int ret;
  1276. s->fdsp = avpriv_float_dsp_alloc(0);
  1277. if (!s->fdsp)
  1278. return AVERROR(ENOMEM);
  1279. ret = open_model(ctx, &s->model[0]);
  1280. if (ret < 0)
  1281. return ret;
  1282. for (int i = 0; i < FRAME_SIZE; i++) {
  1283. s->window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE));
  1284. s->window[WINDOW_SIZE - 1 - i] = s->window[i];
  1285. }
  1286. for (int i = 0; i < NB_BANDS; i++) {
  1287. for (int j = 0; j < NB_BANDS; j++) {
  1288. s->dct_table[j][i] = cosf((i + .5f) * j * M_PI / NB_BANDS);
  1289. if (j == 0)
  1290. s->dct_table[j][i] *= sqrtf(.5);
  1291. }
  1292. }
  1293. return 0;
  1294. }
  1295. static void free_model(AVFilterContext *ctx, int n)
  1296. {
  1297. AudioRNNContext *s = ctx->priv;
  1298. rnnoise_model_free(s->model[n]);
  1299. s->model[n] = NULL;
  1300. for (int ch = 0; ch < s->channels && s->st; ch++) {
  1301. av_freep(&s->st[ch].rnn[n].vad_gru_state);
  1302. av_freep(&s->st[ch].rnn[n].noise_gru_state);
  1303. av_freep(&s->st[ch].rnn[n].denoise_gru_state);
  1304. }
  1305. }
  1306. static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
  1307. char *res, int res_len, int flags)
  1308. {
  1309. AudioRNNContext *s = ctx->priv;
  1310. int ret;
  1311. ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags);
  1312. if (ret < 0)
  1313. return ret;
  1314. ret = open_model(ctx, &s->model[1]);
  1315. if (ret < 0)
  1316. return ret;
  1317. FFSWAP(RNNModel *, s->model[0], s->model[1]);
  1318. for (int ch = 0; ch < s->channels; ch++)
  1319. FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);
  1320. ret = config_input(ctx->inputs[0]);
  1321. if (ret < 0) {
  1322. for (int ch = 0; ch < s->channels; ch++)
  1323. FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);
  1324. FFSWAP(RNNModel *, s->model[0], s->model[1]);
  1325. return ret;
  1326. }
  1327. free_model(ctx, 1);
  1328. return 0;
  1329. }
  1330. static av_cold void uninit(AVFilterContext *ctx)
  1331. {
  1332. AudioRNNContext *s = ctx->priv;
  1333. av_freep(&s->fdsp);
  1334. free_model(ctx, 0);
  1335. for (int ch = 0; ch < s->channels && s->st; ch++) {
  1336. av_tx_uninit(&s->st[ch].tx);
  1337. av_tx_uninit(&s->st[ch].txi);
  1338. }
  1339. av_freep(&s->st);
  1340. }
/* Single audio input; config_input (defined earlier) allocates the
 * per-channel denoise state when the link is configured. */
static const AVFilterPad inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_AUDIO,
        .config_props = config_input,
    },
    { NULL }
};
/* Single audio output. */
static const AVFilterPad outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_AUDIO,
    },
    { NULL }
};
#define OFFSET(x) offsetof(AudioRNNContext, x)
/* All options are flagged runtime-changeable (handled in process_command). */
#define AF AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM

static const AVOption arnndn_options[] = {
    { "model", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
    { "m", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
    { "mix", "set output vs input mix", OFFSET(mix), AV_OPT_TYPE_FLOAT, {.dbl=1.0},-1, 1, AF },
    { NULL }
};
AVFILTER_DEFINE_CLASS(arnndn);

/* Filter registration: slice-threaded, with internal timeline support
 * (the per-frame bypass is handled via ctx->is_disabled in the workers). */
AVFilter ff_af_arnndn = {
    .name          = "arnndn",
    .description   = NULL_IF_CONFIG_SMALL("Reduce noise from speech using Recurrent Neural Networks."),
    .query_formats = query_formats,
    .priv_size     = sizeof(AudioRNNContext),
    .priv_class    = &arnndn_class,
    .activate      = activate,
    .init          = init,
    .uninit        = uninit,
    .inputs        = inputs,
    .outputs       = outputs,
    .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL |
                     AVFILTER_FLAG_SLICE_THREADS,
    .process_command = process_command,
};