You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

557 lines
17KB

  1. /*
  2. * Opus encoder
  3. * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "opusenc_psy.h"
  22. #include "opus_pvq.h"
  23. #include "opustab.h"
  24. #include "mdct15.h"
  25. #include "libavutil/qsort.h"
  26. /* Populate metrics without taking into consideration neighbouring steps */
  27. static void step_collect_psy_metrics(OpusPsyContext *s, int index)
  28. {
  29. int silence = 0, ch, i, j;
  30. OpusPsyStep *st = s->steps[index];
  31. st->index = index;
  32. for (ch = 0; ch < s->avctx->channels; ch++) {
  33. const int lap_size = (1 << s->bsize_analysis);
  34. for (i = 1; i <= FFMIN(lap_size, index); i++) {
  35. const int offset = i*120;
  36. AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index - i);
  37. memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float));
  38. }
  39. for (i = 0; i < lap_size; i++) {
  40. const int offset = i*120 + lap_size;
  41. AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index + i);
  42. memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float));
  43. }
  44. s->dsp->vector_fmul(s->scratch, s->scratch, s->window[s->bsize_analysis],
  45. (OPUS_BLOCK_SIZE(s->bsize_analysis) << 1));
  46. s->mdct[s->bsize_analysis]->mdct(s->mdct[s->bsize_analysis], st->coeffs[ch], s->scratch, 1);
  47. for (i = 0; i < CELT_MAX_BANDS; i++)
  48. st->bands[ch][i] = &st->coeffs[ch][ff_celt_freq_bands[i] << s->bsize_analysis];
  49. }
  50. for (ch = 0; ch < s->avctx->channels; ch++) {
  51. for (i = 0; i < CELT_MAX_BANDS; i++) {
  52. float avg_c_s, energy = 0.0f, dist_dev = 0.0f;
  53. const int range = ff_celt_freq_range[i] << s->bsize_analysis;
  54. const float *coeffs = st->bands[ch][i];
  55. for (j = 0; j < range; j++)
  56. energy += coeffs[j]*coeffs[j];
  57. st->energy[ch][i] += sqrtf(energy);
  58. silence |= !!st->energy[ch][i];
  59. avg_c_s = energy / range;
  60. for (j = 0; j < range; j++) {
  61. const float c_s = coeffs[j]*coeffs[j];
  62. dist_dev = (avg_c_s - c_s)*(avg_c_s - c_s);
  63. }
  64. st->tone[ch][i] += sqrtf(dist_dev);
  65. }
  66. }
  67. st->silence = !silence;
  68. if (s->avctx->channels > 1) {
  69. for (i = 0; i < CELT_MAX_BANDS; i++) {
  70. float incompat = 0.0f;
  71. const float *coeffs1 = st->bands[0][i];
  72. const float *coeffs2 = st->bands[1][i];
  73. const int range = ff_celt_freq_range[i] << s->bsize_analysis;
  74. for (j = 0; j < range; j++)
  75. incompat += (coeffs1[j] - coeffs2[j])*(coeffs1[j] - coeffs2[j]);
  76. st->stereo[i] = sqrtf(incompat);
  77. }
  78. }
  79. for (ch = 0; ch < s->avctx->channels; ch++) {
  80. for (i = 0; i < CELT_MAX_BANDS; i++) {
  81. OpusBandExcitation *ex = &s->ex[ch][i];
  82. float bp_e = bessel_filter(&s->bfilter_lo[ch][i], st->energy[ch][i]);
  83. bp_e = bessel_filter(&s->bfilter_hi[ch][i], bp_e);
  84. bp_e *= bp_e;
  85. if (bp_e > ex->excitation) {
  86. st->change_amp[ch][i] = bp_e - ex->excitation;
  87. st->total_change += st->change_amp[ch][i];
  88. ex->excitation = ex->excitation_init = bp_e;
  89. ex->excitation_dist = 0.0f;
  90. }
  91. if (ex->excitation > 0.0f) {
  92. ex->excitation -= av_clipf((1/expf(ex->excitation_dist)), ex->excitation_init/20, ex->excitation_init/1.09);
  93. ex->excitation = FFMAX(ex->excitation, 0.0f);
  94. ex->excitation_dist += 1.0f;
  95. }
  96. }
  97. }
  98. }
  99. static void search_for_change_points(OpusPsyContext *s, float tgt_change,
  100. int offset_s, int offset_e, int resolution,
  101. int level)
  102. {
  103. int i;
  104. float c_change = 0.0f;
  105. if ((offset_e - offset_s) <= resolution)
  106. return;
  107. for (i = offset_s; i < offset_e; i++) {
  108. c_change += s->steps[i]->total_change;
  109. if (c_change > tgt_change)
  110. break;
  111. }
  112. if (i == offset_e)
  113. return;
  114. search_for_change_points(s, tgt_change / 2.0f, offset_s, i + 0, resolution, level + 1);
  115. s->inflection_points[s->inflection_points_count++] = i;
  116. search_for_change_points(s, tgt_change / 2.0f, i + 1, offset_e, resolution, level + 1);
  117. }
  118. static int flush_silent_frames(OpusPsyContext *s)
  119. {
  120. int fsize, silent_frames;
  121. for (silent_frames = 0; silent_frames < s->buffered_steps; silent_frames++)
  122. if (!s->steps[silent_frames]->silence)
  123. break;
  124. if (--silent_frames < 0)
  125. return 0;
  126. for (fsize = CELT_BLOCK_960; fsize > CELT_BLOCK_120; fsize--) {
  127. if ((1 << fsize) > silent_frames)
  128. continue;
  129. s->p.frames = FFMIN(silent_frames / (1 << fsize), 48 >> fsize);
  130. s->p.framesize = fsize;
  131. return 1;
  132. }
  133. return 0;
  134. }
  135. /* Main function which decides frame size and frames per current packet */
  136. static void psy_output_groups(OpusPsyContext *s)
  137. {
  138. int max_delay_samples = (s->options->max_delay_ms*s->avctx->sample_rate)/1000;
  139. int max_bsize = FFMIN(OPUS_SAMPLES_TO_BLOCK_SIZE(max_delay_samples), CELT_BLOCK_960);
  140. /* These don't change for now */
  141. s->p.mode = OPUS_MODE_CELT;
  142. s->p.bandwidth = OPUS_BANDWIDTH_FULLBAND;
  143. /* Flush silent frames ASAP */
  144. if (s->steps[0]->silence && flush_silent_frames(s))
  145. return;
  146. s->p.framesize = FFMIN(max_bsize, CELT_BLOCK_960);
  147. s->p.frames = 1;
  148. }
  149. int ff_opus_psy_process(OpusPsyContext *s, OpusPacketInfo *p)
  150. {
  151. int i;
  152. float total_energy_change = 0.0f;
  153. if (s->buffered_steps < s->max_steps && !s->eof) {
  154. const int awin = (1 << s->bsize_analysis);
  155. if (++s->steps_to_process >= awin) {
  156. step_collect_psy_metrics(s, s->buffered_steps - awin + 1);
  157. s->steps_to_process = 0;
  158. }
  159. if ((++s->buffered_steps) < s->max_steps)
  160. return 1;
  161. }
  162. for (i = 0; i < s->buffered_steps; i++)
  163. total_energy_change += s->steps[i]->total_change;
  164. search_for_change_points(s, total_energy_change / 2.0f, 0,
  165. s->buffered_steps, 1, 0);
  166. psy_output_groups(s);
  167. p->frames = s->p.frames;
  168. p->framesize = s->p.framesize;
  169. p->mode = s->p.mode;
  170. p->bandwidth = s->p.bandwidth;
  171. return 0;
  172. }
  173. void ff_opus_psy_celt_frame_init(OpusPsyContext *s, CeltFrame *f, int index)
  174. {
  175. int i, neighbouring_points = 0, start_offset = 0;
  176. int radius = (1 << s->p.framesize), step_offset = radius*index;
  177. int silence = 1;
  178. f->start_band = (s->p.mode == OPUS_MODE_HYBRID) ? 17 : 0;
  179. f->end_band = ff_celt_band_end[s->p.bandwidth];
  180. f->channels = s->avctx->channels;
  181. f->size = s->p.framesize;
  182. for (i = 0; i < (1 << f->size); i++)
  183. silence &= s->steps[index*(1 << f->size) + i]->silence;
  184. f->silence = silence;
  185. if (f->silence) {
  186. f->framebits = 0; /* Otherwise the silence flag eats up 16(!) bits */
  187. return;
  188. }
  189. for (i = 0; i < s->inflection_points_count; i++) {
  190. if (s->inflection_points[i] >= step_offset) {
  191. start_offset = i;
  192. break;
  193. }
  194. }
  195. for (i = start_offset; i < FFMIN(radius, s->inflection_points_count - start_offset); i++) {
  196. if (s->inflection_points[i] < (step_offset + radius)) {
  197. neighbouring_points++;
  198. }
  199. }
  200. /* Transient flagging */
  201. f->transient = neighbouring_points > 0;
  202. f->blocks = f->transient ? OPUS_BLOCK_SIZE(s->p.framesize)/CELT_OVERLAP : 1;
  203. /* Some sane defaults */
  204. f->pfilter = 0;
  205. f->pf_gain = 0.5f;
  206. f->pf_octave = 2;
  207. f->pf_period = 1;
  208. f->pf_tapset = 2;
  209. /* More sane defaults */
  210. f->tf_select = 0;
  211. f->anticollapse = 1;
  212. f->alloc_trim = 5;
  213. f->skip_band_floor = f->end_band;
  214. f->intensity_stereo = f->end_band;
  215. f->dual_stereo = 0;
  216. f->spread = CELT_SPREAD_NORMAL;
  217. memset(f->tf_change, 0, sizeof(int)*CELT_MAX_BANDS);
  218. memset(f->alloc_boost, 0, sizeof(int)*CELT_MAX_BANDS);
  219. }
  220. static void celt_gauge_psy_weight(OpusPsyContext *s, OpusPsyStep **start,
  221. CeltFrame *f_out)
  222. {
  223. int i, f, ch;
  224. int frame_size = OPUS_BLOCK_SIZE(s->p.framesize);
  225. float rate, frame_bits = 0;
  226. /* Used for the global ROTATE flag */
  227. float tonal = 0.0f;
  228. /* Pseudo-weights */
  229. float band_score[CELT_MAX_BANDS] = { 0 };
  230. float max_score = 1.0f;
  231. /* Pass one - one loop around each band, computing unquant stuff */
  232. for (i = 0; i < CELT_MAX_BANDS; i++) {
  233. float weight = 0.0f;
  234. float tonal_contrib = 0.0f;
  235. for (f = 0; f < (1 << s->p.framesize); f++) {
  236. weight = start[f]->stereo[i];
  237. for (ch = 0; ch < s->avctx->channels; ch++) {
  238. weight += start[f]->change_amp[ch][i] + start[f]->tone[ch][i] + start[f]->energy[ch][i];
  239. tonal_contrib += start[f]->tone[ch][i];
  240. }
  241. }
  242. tonal += tonal_contrib;
  243. band_score[i] = weight;
  244. }
  245. tonal /= (float)CELT_MAX_BANDS;
  246. for (i = 0; i < CELT_MAX_BANDS; i++) {
  247. if (band_score[i] > max_score)
  248. max_score = band_score[i];
  249. }
  250. for (i = 0; i < CELT_MAX_BANDS; i++) {
  251. f_out->alloc_boost[i] = (int)((band_score[i]/max_score)*3.0f);
  252. frame_bits += band_score[i]*8.0f;
  253. }
  254. tonal /= 1333136.0f;
  255. f_out->spread = av_clip_uintp2(lrintf(tonal), 2);
  256. rate = ((float)s->avctx->bit_rate) + frame_bits*frame_size*16;
  257. rate *= s->lambda;
  258. rate /= s->avctx->sample_rate/frame_size;
  259. f_out->framebits = lrintf(rate);
  260. f_out->framebits = FFMIN(f_out->framebits, OPUS_MAX_PACKET_SIZE*8);
  261. f_out->framebits = FFALIGN(f_out->framebits, 8);
  262. }
  263. static int bands_dist(OpusPsyContext *s, CeltFrame *f, float *total_dist)
  264. {
  265. int i, tdist = 0.0f;
  266. OpusRangeCoder dump;
  267. ff_opus_rc_enc_init(&dump);
  268. ff_celt_enc_bitalloc(f, &dump);
  269. for (i = 0; i < CELT_MAX_BANDS; i++) {
  270. float bits = 0.0f;
  271. float dist = f->pvq->band_cost(f->pvq, f, &dump, i, &bits, s->lambda);
  272. tdist += dist;
  273. }
  274. *total_dist = tdist;
  275. return 0;
  276. }
  277. static void celt_search_for_dual_stereo(OpusPsyContext *s, CeltFrame *f)
  278. {
  279. float td1, td2;
  280. f->dual_stereo = 0;
  281. bands_dist(s, f, &td1);
  282. f->dual_stereo = 1;
  283. bands_dist(s, f, &td2);
  284. f->dual_stereo = td2 < td1;
  285. s->dual_stereo_used += td2 < td1;
  286. }
  287. static void celt_search_for_intensity(OpusPsyContext *s, CeltFrame *f)
  288. {
  289. int i, best_band = CELT_MAX_BANDS - 1;
  290. float dist, best_dist = FLT_MAX;
  291. /* TODO: fix, make some heuristic up here using the lambda value */
  292. float end_band = 0;
  293. for (i = f->end_band; i >= end_band; i--) {
  294. f->intensity_stereo = i;
  295. bands_dist(s, f, &dist);
  296. if (best_dist > dist) {
  297. best_dist = dist;
  298. best_band = i;
  299. }
  300. }
  301. f->intensity_stereo = best_band;
  302. s->avg_is_band = (s->avg_is_band + f->intensity_stereo)/2.0f;
  303. }
  304. static int celt_search_for_tf(OpusPsyContext *s, OpusPsyStep **start, CeltFrame *f)
  305. {
  306. int i, j, k, cway, config[2][CELT_MAX_BANDS] = { { 0 } };
  307. float score[2] = { 0 };
  308. for (cway = 0; cway < 2; cway++) {
  309. int mag[2];
  310. int base = f->transient ? 120 : 960;
  311. for (i = 0; i < 2; i++) {
  312. int c = ff_celt_tf_select[f->size][f->transient][cway][i];
  313. mag[i] = c < 0 ? base >> FFABS(c) : base << FFABS(c);
  314. }
  315. for (i = 0; i < CELT_MAX_BANDS; i++) {
  316. float iscore0 = 0.0f;
  317. float iscore1 = 0.0f;
  318. for (j = 0; j < (1 << f->size); j++) {
  319. for (k = 0; k < s->avctx->channels; k++) {
  320. iscore0 += start[j]->tone[k][i]*start[j]->change_amp[k][i]/mag[0];
  321. iscore1 += start[j]->tone[k][i]*start[j]->change_amp[k][i]/mag[1];
  322. }
  323. }
  324. config[cway][i] = FFABS(iscore0 - 1.0f) < FFABS(iscore1 - 1.0f);
  325. score[cway] += config[cway][i] ? iscore1 : iscore0;
  326. }
  327. }
  328. f->tf_select = score[0] < score[1];
  329. memcpy(f->tf_change, config[f->tf_select], sizeof(int)*CELT_MAX_BANDS);
  330. return 0;
  331. }
  332. int ff_opus_psy_celt_frame_process(OpusPsyContext *s, CeltFrame *f, int index)
  333. {
  334. int start_transient_flag = f->transient;
  335. OpusPsyStep **start = &s->steps[index * (1 << s->p.framesize)];
  336. if (f->silence)
  337. return 0;
  338. celt_gauge_psy_weight(s, start, f);
  339. celt_search_for_intensity(s, f);
  340. celt_search_for_dual_stereo(s, f);
  341. celt_search_for_tf(s, start, f);
  342. if (f->transient != start_transient_flag) {
  343. f->blocks = f->transient ? OPUS_BLOCK_SIZE(s->p.framesize)/CELT_OVERLAP : 1;
  344. s->redo_analysis = 1;
  345. return 1;
  346. }
  347. s->redo_analysis = 0;
  348. return 0;
  349. }
  350. void ff_opus_psy_postencode_update(OpusPsyContext *s, CeltFrame *f, OpusRangeCoder *rc)
  351. {
  352. int i, frame_size = OPUS_BLOCK_SIZE(s->p.framesize);
  353. int steps_out = s->p.frames*(frame_size/120);
  354. void *tmp[FF_BUFQUEUE_SIZE];
  355. float ideal_fbits;
  356. for (i = 0; i < steps_out; i++)
  357. memset(s->steps[i], 0, sizeof(OpusPsyStep));
  358. for (i = 0; i < s->max_steps; i++)
  359. tmp[i] = s->steps[i];
  360. for (i = 0; i < s->max_steps; i++) {
  361. const int i_new = i - steps_out;
  362. s->steps[i_new < 0 ? s->max_steps + i_new : i_new] = tmp[i];
  363. }
  364. for (i = steps_out; i < s->buffered_steps; i++)
  365. s->steps[i]->index -= steps_out;
  366. ideal_fbits = s->avctx->bit_rate/(s->avctx->sample_rate/frame_size);
  367. for (i = 0; i < s->p.frames; i++) {
  368. s->avg_is_band += f[i].intensity_stereo;
  369. s->lambda *= ideal_fbits / f[i].framebits;
  370. }
  371. s->avg_is_band /= (s->p.frames + 1);
  372. s->cs_num = 0;
  373. s->steps_to_process = 0;
  374. s->buffered_steps -= steps_out;
  375. s->total_packets_out += s->p.frames;
  376. s->inflection_points_count = 0;
  377. }
  378. av_cold int ff_opus_psy_init(OpusPsyContext *s, AVCodecContext *avctx,
  379. struct FFBufQueue *bufqueue, OpusEncOptions *options)
  380. {
  381. int i, ch, ret;
  382. s->redo_analysis = 0;
  383. s->lambda = 1.0f;
  384. s->options = options;
  385. s->avctx = avctx;
  386. s->bufqueue = bufqueue;
  387. s->max_steps = ceilf(s->options->max_delay_ms/2.5f);
  388. s->bsize_analysis = CELT_BLOCK_960;
  389. s->avg_is_band = CELT_MAX_BANDS - 1;
  390. s->inflection_points_count = 0;
  391. s->inflection_points = av_mallocz(sizeof(*s->inflection_points)*s->max_steps);
  392. if (!s->inflection_points) {
  393. ret = AVERROR(ENOMEM);
  394. goto fail;
  395. }
  396. s->dsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
  397. if (!s->dsp) {
  398. ret = AVERROR(ENOMEM);
  399. goto fail;
  400. }
  401. for (ch = 0; ch < s->avctx->channels; ch++) {
  402. for (i = 0; i < CELT_MAX_BANDS; i++) {
  403. bessel_init(&s->bfilter_hi[ch][i], 1.0f, 19.0f, 100.0f, 1);
  404. bessel_init(&s->bfilter_lo[ch][i], 1.0f, 20.0f, 100.0f, 0);
  405. }
  406. }
  407. for (i = 0; i < s->max_steps; i++) {
  408. s->steps[i] = av_mallocz(sizeof(OpusPsyStep));
  409. if (!s->steps[i]) {
  410. ret = AVERROR(ENOMEM);
  411. goto fail;
  412. }
  413. }
  414. for (i = 0; i < CELT_BLOCK_NB; i++) {
  415. float tmp;
  416. const int len = OPUS_BLOCK_SIZE(i);
  417. s->window[i] = av_malloc(2*len*sizeof(float));
  418. if (!s->window[i]) {
  419. ret = AVERROR(ENOMEM);
  420. goto fail;
  421. }
  422. generate_window_func(s->window[i], 2*len, WFUNC_SINE, &tmp);
  423. if ((ret = ff_mdct15_init(&s->mdct[i], 0, i + 3, 68 << (CELT_BLOCK_NB - 1 - i))))
  424. goto fail;
  425. }
  426. return 0;
  427. fail:
  428. av_freep(&s->inflection_points);
  429. av_freep(&s->dsp);
  430. for (i = 0; i < CELT_BLOCK_NB; i++) {
  431. ff_mdct15_uninit(&s->mdct[i]);
  432. av_freep(&s->window[i]);
  433. }
  434. for (i = 0; i < s->max_steps; i++)
  435. av_freep(&s->steps[i]);
  436. return ret;
  437. }
  438. void ff_opus_psy_signal_eof(OpusPsyContext *s)
  439. {
  440. s->eof = 1;
  441. }
  442. av_cold int ff_opus_psy_end(OpusPsyContext *s)
  443. {
  444. int i;
  445. av_freep(&s->inflection_points);
  446. av_freep(&s->dsp);
  447. for (i = 0; i < CELT_BLOCK_NB; i++) {
  448. ff_mdct15_uninit(&s->mdct[i]);
  449. av_freep(&s->window[i]);
  450. }
  451. for (i = 0; i < s->max_steps; i++)
  452. av_freep(&s->steps[i]);
  453. av_log(s->avctx, AV_LOG_INFO, "Average Intensity Stereo band: %0.1f\n", s->avg_is_band);
  454. av_log(s->avctx, AV_LOG_INFO, "Dual Stereo used: %0.2f%%\n", ((float)s->dual_stereo_used/s->total_packets_out)*100.0f);
  455. return 0;
  456. }