You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2087 lines
86KB

  1. /*
  2. * VP9 compatible video decoder
  3. *
  4. * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
  5. * Copyright (C) 2013 Clément Bœsch <u pkh me>
  6. *
  7. * This file is part of FFmpeg.
  8. *
  9. * FFmpeg is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU Lesser General Public
  11. * License as published by the Free Software Foundation; either
  12. * version 2.1 of the License, or (at your option) any later version.
  13. *
  14. * FFmpeg is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. * Lesser General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU Lesser General Public
  20. * License along with FFmpeg; if not, write to the Free Software
  21. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. */
  23. #include "libavutil/avassert.h"
  24. #include "avcodec.h"
  25. #include "internal.h"
  26. #include "videodsp.h"
  27. #include "vp56.h"
  28. #include "vp9.h"
  29. #include "vp9data.h"
  30. #include "vp9dec.h"
  31. static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
  32. {
  33. { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
  34. { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
  35. }, {
  36. { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
  37. { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
  38. }
  39. };
  40. static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
  41. ptrdiff_t stride, int v)
  42. {
  43. switch (w) {
  44. case 1:
  45. do {
  46. *ptr = v;
  47. ptr += stride;
  48. } while (--h);
  49. break;
  50. case 2: {
  51. int v16 = v * 0x0101;
  52. do {
  53. AV_WN16A(ptr, v16);
  54. ptr += stride;
  55. } while (--h);
  56. break;
  57. }
  58. case 4: {
  59. uint32_t v32 = v * 0x01010101;
  60. do {
  61. AV_WN32A(ptr, v32);
  62. ptr += stride;
  63. } while (--h);
  64. break;
  65. }
  66. case 8: {
  67. #if HAVE_FAST_64BIT
  68. uint64_t v64 = v * 0x0101010101010101ULL;
  69. do {
  70. AV_WN64A(ptr, v64);
  71. ptr += stride;
  72. } while (--h);
  73. #else
  74. uint32_t v32 = v * 0x01010101;
  75. do {
  76. AV_WN32A(ptr, v32);
  77. AV_WN32A(ptr + 4, v32);
  78. ptr += stride;
  79. } while (--h);
  80. #endif
  81. break;
  82. }
  83. }
  84. }
  85. static void decode_mode(AVCodecContext *avctx)
  86. {
  87. static const uint8_t left_ctx[N_BS_SIZES] = {
  88. 0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
  89. };
  90. static const uint8_t above_ctx[N_BS_SIZES] = {
  91. 0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
  92. };
  93. static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
  94. TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
  95. TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
  96. };
  97. VP9Context *s = avctx->priv_data;
  98. VP9Block *b = s->b;
  99. int row = s->row, col = s->col, row7 = s->row7;
  100. enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
  101. int bw4 = bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
  102. int bh4 = bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
  103. int have_a = row > 0, have_l = col > s->tile_col_start;
  104. int vref, filter_id;
  105. if (!s->s.h.segmentation.enabled) {
  106. b->seg_id = 0;
  107. } else if (s->s.h.keyframe || s->s.h.intraonly) {
  108. b->seg_id = !s->s.h.segmentation.update_map ? 0 :
  109. vp8_rac_get_tree(&s->c, ff_vp9_segmentation_tree, s->s.h.segmentation.prob);
  110. } else if (!s->s.h.segmentation.update_map ||
  111. (s->s.h.segmentation.temporal &&
  112. vp56_rac_get_prob_branchy(&s->c,
  113. s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
  114. s->left_segpred_ctx[row7]]))) {
  115. if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
  116. int pred = 8, x;
  117. uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;
  118. if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
  119. ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
  120. for (y = 0; y < h4; y++) {
  121. int idx_base = (y + row) * 8 * s->sb_cols + col;
  122. for (x = 0; x < w4; x++)
  123. pred = FFMIN(pred, refsegmap[idx_base + x]);
  124. }
  125. av_assert1(pred < 8);
  126. b->seg_id = pred;
  127. } else {
  128. b->seg_id = 0;
  129. }
  130. memset(&s->above_segpred_ctx[col], 1, w4);
  131. memset(&s->left_segpred_ctx[row7], 1, h4);
  132. } else {
  133. b->seg_id = vp8_rac_get_tree(&s->c, ff_vp9_segmentation_tree,
  134. s->s.h.segmentation.prob);
  135. memset(&s->above_segpred_ctx[col], 0, w4);
  136. memset(&s->left_segpred_ctx[row7], 0, h4);
  137. }
  138. if (s->s.h.segmentation.enabled &&
  139. (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
  140. setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
  141. bw4, bh4, 8 * s->sb_cols, b->seg_id);
  142. }
  143. b->skip = s->s.h.segmentation.enabled &&
  144. s->s.h.segmentation.feat[b->seg_id].skip_enabled;
  145. if (!b->skip) {
  146. int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
  147. b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
  148. s->counts.skip[c][b->skip]++;
  149. }
  150. if (s->s.h.keyframe || s->s.h.intraonly) {
  151. b->intra = 1;
  152. } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
  153. b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
  154. } else {
  155. int c, bit;
  156. if (have_a && have_l) {
  157. c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
  158. c += (c == 2);
  159. } else {
  160. c = have_a ? 2 * s->above_intra_ctx[col] :
  161. have_l ? 2 * s->left_intra_ctx[row7] : 0;
  162. }
  163. bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
  164. s->counts.intra[c][bit]++;
  165. b->intra = !bit;
  166. }
  167. if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
  168. int c;
  169. if (have_a) {
  170. if (have_l) {
  171. c = (s->above_skip_ctx[col] ? max_tx :
  172. s->above_txfm_ctx[col]) +
  173. (s->left_skip_ctx[row7] ? max_tx :
  174. s->left_txfm_ctx[row7]) > max_tx;
  175. } else {
  176. c = s->above_skip_ctx[col] ? 1 :
  177. (s->above_txfm_ctx[col] * 2 > max_tx);
  178. }
  179. } else if (have_l) {
  180. c = s->left_skip_ctx[row7] ? 1 :
  181. (s->left_txfm_ctx[row7] * 2 > max_tx);
  182. } else {
  183. c = 1;
  184. }
  185. switch (max_tx) {
  186. case TX_32X32:
  187. b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
  188. if (b->tx) {
  189. b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
  190. if (b->tx == 2)
  191. b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
  192. }
  193. s->counts.tx32p[c][b->tx]++;
  194. break;
  195. case TX_16X16:
  196. b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
  197. if (b->tx)
  198. b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
  199. s->counts.tx16p[c][b->tx]++;
  200. break;
  201. case TX_8X8:
  202. b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
  203. s->counts.tx8p[c][b->tx]++;
  204. break;
  205. case TX_4X4:
  206. b->tx = TX_4X4;
  207. break;
  208. }
  209. } else {
  210. b->tx = FFMIN(max_tx, s->s.h.txfmmode);
  211. }
  212. if (s->s.h.keyframe || s->s.h.intraonly) {
  213. uint8_t *a = &s->above_mode_ctx[col * 2];
  214. uint8_t *l = &s->left_mode_ctx[(row7) << 1];
  215. b->comp = 0;
  216. if (b->bs > BS_8x8) {
  217. // FIXME the memory storage intermediates here aren't really
  218. // necessary, they're just there to make the code slightly
  219. // simpler for now
  220. b->mode[0] =
  221. a[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  222. ff_vp9_default_kf_ymode_probs[a[0]][l[0]]);
  223. if (b->bs != BS_8x4) {
  224. b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  225. ff_vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
  226. l[0] =
  227. a[1] = b->mode[1];
  228. } else {
  229. l[0] =
  230. a[1] =
  231. b->mode[1] = b->mode[0];
  232. }
  233. if (b->bs != BS_4x8) {
  234. b->mode[2] =
  235. a[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  236. ff_vp9_default_kf_ymode_probs[a[0]][l[1]]);
  237. if (b->bs != BS_8x4) {
  238. b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  239. ff_vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
  240. l[1] =
  241. a[1] = b->mode[3];
  242. } else {
  243. l[1] =
  244. a[1] =
  245. b->mode[3] = b->mode[2];
  246. }
  247. } else {
  248. b->mode[2] = b->mode[0];
  249. l[1] =
  250. a[1] =
  251. b->mode[3] = b->mode[1];
  252. }
  253. } else {
  254. b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  255. ff_vp9_default_kf_ymode_probs[*a][*l]);
  256. b->mode[3] =
  257. b->mode[2] =
  258. b->mode[1] = b->mode[0];
  259. // FIXME this can probably be optimized
  260. memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
  261. memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
  262. }
  263. b->uvmode = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  264. ff_vp9_default_kf_uvmode_probs[b->mode[3]]);
  265. } else if (b->intra) {
  266. b->comp = 0;
  267. if (b->bs > BS_8x8) {
  268. b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  269. s->prob.p.y_mode[0]);
  270. s->counts.y_mode[0][b->mode[0]]++;
  271. if (b->bs != BS_8x4) {
  272. b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  273. s->prob.p.y_mode[0]);
  274. s->counts.y_mode[0][b->mode[1]]++;
  275. } else {
  276. b->mode[1] = b->mode[0];
  277. }
  278. if (b->bs != BS_4x8) {
  279. b->mode[2] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  280. s->prob.p.y_mode[0]);
  281. s->counts.y_mode[0][b->mode[2]]++;
  282. if (b->bs != BS_8x4) {
  283. b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  284. s->prob.p.y_mode[0]);
  285. s->counts.y_mode[0][b->mode[3]]++;
  286. } else {
  287. b->mode[3] = b->mode[2];
  288. }
  289. } else {
  290. b->mode[2] = b->mode[0];
  291. b->mode[3] = b->mode[1];
  292. }
  293. } else {
  294. static const uint8_t size_group[10] = {
  295. 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
  296. };
  297. int sz = size_group[b->bs];
  298. b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  299. s->prob.p.y_mode[sz]);
  300. b->mode[1] =
  301. b->mode[2] =
  302. b->mode[3] = b->mode[0];
  303. s->counts.y_mode[sz][b->mode[3]]++;
  304. }
  305. b->uvmode = vp8_rac_get_tree(&s->c, ff_vp9_intramode_tree,
  306. s->prob.p.uv_mode[b->mode[3]]);
  307. s->counts.uv_mode[b->mode[3]][b->uvmode]++;
  308. } else {
  309. static const uint8_t inter_mode_ctx_lut[14][14] = {
  310. { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
  311. { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
  312. { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
  313. { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
  314. { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
  315. { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
  316. { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
  317. { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
  318. { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
  319. { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
  320. { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
  321. { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
  322. { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
  323. { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
  324. };
  325. if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
  326. av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
  327. b->comp = 0;
  328. b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
  329. } else {
  330. // read comp_pred flag
  331. if (s->s.h.comppredmode != PRED_SWITCHABLE) {
  332. b->comp = s->s.h.comppredmode == PRED_COMPREF;
  333. } else {
  334. int c;
  335. // FIXME add intra as ref=0xff (or -1) to make these easier?
  336. if (have_a) {
  337. if (have_l) {
  338. if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
  339. c = 4;
  340. } else if (s->above_comp_ctx[col]) {
  341. c = 2 + (s->left_intra_ctx[row7] ||
  342. s->left_ref_ctx[row7] == s->s.h.fixcompref);
  343. } else if (s->left_comp_ctx[row7]) {
  344. c = 2 + (s->above_intra_ctx[col] ||
  345. s->above_ref_ctx[col] == s->s.h.fixcompref);
  346. } else {
  347. c = (!s->above_intra_ctx[col] &&
  348. s->above_ref_ctx[col] == s->s.h.fixcompref) ^
  349. (!s->left_intra_ctx[row7] &&
  350. s->left_ref_ctx[row & 7] == s->s.h.fixcompref);
  351. }
  352. } else {
  353. c = s->above_comp_ctx[col] ? 3 :
  354. (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
  355. }
  356. } else if (have_l) {
  357. c = s->left_comp_ctx[row7] ? 3 :
  358. (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->s.h.fixcompref);
  359. } else {
  360. c = 1;
  361. }
  362. b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
  363. s->counts.comp[c][b->comp]++;
  364. }
  365. // read actual references
  366. // FIXME probably cache a few variables here to prevent repetitive
  367. // memory accesses below
  368. if (b->comp) { /* two references */
  369. int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;
  370. b->ref[fix_idx] = s->s.h.fixcompref;
  371. // FIXME can this codeblob be replaced by some sort of LUT?
  372. if (have_a) {
  373. if (have_l) {
  374. if (s->above_intra_ctx[col]) {
  375. if (s->left_intra_ctx[row7]) {
  376. c = 2;
  377. } else {
  378. c = 1 + 2 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
  379. }
  380. } else if (s->left_intra_ctx[row7]) {
  381. c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
  382. } else {
  383. int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
  384. if (refl == refa && refa == s->s.h.varcompref[1]) {
  385. c = 0;
  386. } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
  387. if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
  388. (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
  389. c = 4;
  390. } else {
  391. c = (refa == refl) ? 3 : 1;
  392. }
  393. } else if (!s->left_comp_ctx[row7]) {
  394. if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
  395. c = 1;
  396. } else {
  397. c = (refl == s->s.h.varcompref[1] &&
  398. refa != s->s.h.varcompref[1]) ? 2 : 4;
  399. }
  400. } else if (!s->above_comp_ctx[col]) {
  401. if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
  402. c = 1;
  403. } else {
  404. c = (refa == s->s.h.varcompref[1] &&
  405. refl != s->s.h.varcompref[1]) ? 2 : 4;
  406. }
  407. } else {
  408. c = (refl == refa) ? 4 : 2;
  409. }
  410. }
  411. } else {
  412. if (s->above_intra_ctx[col]) {
  413. c = 2;
  414. } else if (s->above_comp_ctx[col]) {
  415. c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
  416. } else {
  417. c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
  418. }
  419. }
  420. } else if (have_l) {
  421. if (s->left_intra_ctx[row7]) {
  422. c = 2;
  423. } else if (s->left_comp_ctx[row7]) {
  424. c = 4 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
  425. } else {
  426. c = 3 * (s->left_ref_ctx[row7] != s->s.h.varcompref[1]);
  427. }
  428. } else {
  429. c = 2;
  430. }
  431. bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
  432. b->ref[var_idx] = s->s.h.varcompref[bit];
  433. s->counts.comp_ref[c][bit]++;
  434. } else /* single reference */ {
  435. int bit, c;
  436. if (have_a && !s->above_intra_ctx[col]) {
  437. if (have_l && !s->left_intra_ctx[row7]) {
  438. if (s->left_comp_ctx[row7]) {
  439. if (s->above_comp_ctx[col]) {
  440. c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7] ||
  441. !s->above_ref_ctx[col]);
  442. } else {
  443. c = (3 * !s->above_ref_ctx[col]) +
  444. (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
  445. }
  446. } else if (s->above_comp_ctx[col]) {
  447. c = (3 * !s->left_ref_ctx[row7]) +
  448. (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
  449. } else {
  450. c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
  451. }
  452. } else if (s->above_intra_ctx[col]) {
  453. c = 2;
  454. } else if (s->above_comp_ctx[col]) {
  455. c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
  456. } else {
  457. c = 4 * (!s->above_ref_ctx[col]);
  458. }
  459. } else if (have_l && !s->left_intra_ctx[row7]) {
  460. if (s->left_intra_ctx[row7]) {
  461. c = 2;
  462. } else if (s->left_comp_ctx[row7]) {
  463. c = 1 + (!s->s.h.fixcompref || !s->left_ref_ctx[row7]);
  464. } else {
  465. c = 4 * (!s->left_ref_ctx[row7]);
  466. }
  467. } else {
  468. c = 2;
  469. }
  470. bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
  471. s->counts.single_ref[c][0][bit]++;
  472. if (!bit) {
  473. b->ref[0] = 0;
  474. } else {
  475. // FIXME can this codeblob be replaced by some sort of LUT?
  476. if (have_a) {
  477. if (have_l) {
  478. if (s->left_intra_ctx[row7]) {
  479. if (s->above_intra_ctx[col]) {
  480. c = 2;
  481. } else if (s->above_comp_ctx[col]) {
  482. c = 1 + 2 * (s->s.h.fixcompref == 1 ||
  483. s->above_ref_ctx[col] == 1);
  484. } else if (!s->above_ref_ctx[col]) {
  485. c = 3;
  486. } else {
  487. c = 4 * (s->above_ref_ctx[col] == 1);
  488. }
  489. } else if (s->above_intra_ctx[col]) {
  490. if (s->left_intra_ctx[row7]) {
  491. c = 2;
  492. } else if (s->left_comp_ctx[row7]) {
  493. c = 1 + 2 * (s->s.h.fixcompref == 1 ||
  494. s->left_ref_ctx[row7] == 1);
  495. } else if (!s->left_ref_ctx[row7]) {
  496. c = 3;
  497. } else {
  498. c = 4 * (s->left_ref_ctx[row7] == 1);
  499. }
  500. } else if (s->above_comp_ctx[col]) {
  501. if (s->left_comp_ctx[row7]) {
  502. if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
  503. c = 3 * (s->s.h.fixcompref == 1 ||
  504. s->left_ref_ctx[row7] == 1);
  505. } else {
  506. c = 2;
  507. }
  508. } else if (!s->left_ref_ctx[row7]) {
  509. c = 1 + 2 * (s->s.h.fixcompref == 1 ||
  510. s->above_ref_ctx[col] == 1);
  511. } else {
  512. c = 3 * (s->left_ref_ctx[row7] == 1) +
  513. (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
  514. }
  515. } else if (s->left_comp_ctx[row7]) {
  516. if (!s->above_ref_ctx[col]) {
  517. c = 1 + 2 * (s->s.h.fixcompref == 1 ||
  518. s->left_ref_ctx[row7] == 1);
  519. } else {
  520. c = 3 * (s->above_ref_ctx[col] == 1) +
  521. (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
  522. }
  523. } else if (!s->above_ref_ctx[col]) {
  524. if (!s->left_ref_ctx[row7]) {
  525. c = 3;
  526. } else {
  527. c = 4 * (s->left_ref_ctx[row7] == 1);
  528. }
  529. } else if (!s->left_ref_ctx[row7]) {
  530. c = 4 * (s->above_ref_ctx[col] == 1);
  531. } else {
  532. c = 2 * (s->left_ref_ctx[row7] == 1) +
  533. 2 * (s->above_ref_ctx[col] == 1);
  534. }
  535. } else {
  536. if (s->above_intra_ctx[col] ||
  537. (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
  538. c = 2;
  539. } else if (s->above_comp_ctx[col]) {
  540. c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
  541. } else {
  542. c = 4 * (s->above_ref_ctx[col] == 1);
  543. }
  544. }
  545. } else if (have_l) {
  546. if (s->left_intra_ctx[row7] ||
  547. (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
  548. c = 2;
  549. } else if (s->left_comp_ctx[row7]) {
  550. c = 3 * (s->s.h.fixcompref == 1 || s->left_ref_ctx[row7] == 1);
  551. } else {
  552. c = 4 * (s->left_ref_ctx[row7] == 1);
  553. }
  554. } else {
  555. c = 2;
  556. }
  557. bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
  558. s->counts.single_ref[c][1][bit]++;
  559. b->ref[0] = 1 + bit;
  560. }
  561. }
  562. }
  563. if (b->bs <= BS_8x8) {
  564. if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
  565. b->mode[0] =
  566. b->mode[1] =
  567. b->mode[2] =
  568. b->mode[3] = ZEROMV;
  569. } else {
  570. static const uint8_t off[10] = {
  571. 3, 0, 0, 1, 0, 0, 0, 0, 0, 0
  572. };
  573. // FIXME this needs to use the LUT tables from find_ref_mvs
  574. // because not all are -1,0/0,-1
  575. int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
  576. [s->left_mode_ctx[row7 + off[b->bs]]];
  577. b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
  578. s->prob.p.mv_mode[c]);
  579. b->mode[1] =
  580. b->mode[2] =
  581. b->mode[3] = b->mode[0];
  582. s->counts.mv_mode[c][b->mode[0] - 10]++;
  583. }
  584. }
  585. if (s->s.h.filtermode == FILTER_SWITCHABLE) {
  586. int c;
  587. if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
  588. if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
  589. c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
  590. s->left_filter_ctx[row7] : 3;
  591. } else {
  592. c = s->above_filter_ctx[col];
  593. }
  594. } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
  595. c = s->left_filter_ctx[row7];
  596. } else {
  597. c = 3;
  598. }
  599. filter_id = vp8_rac_get_tree(&s->c, ff_vp9_filter_tree,
  600. s->prob.p.filter[c]);
  601. s->counts.filter[c][filter_id]++;
  602. b->filter = ff_vp9_filter_lut[filter_id];
  603. } else {
  604. b->filter = s->s.h.filtermode;
  605. }
  606. if (b->bs > BS_8x8) {
  607. int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
  608. b->mode[0] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
  609. s->prob.p.mv_mode[c]);
  610. s->counts.mv_mode[c][b->mode[0] - 10]++;
  611. ff_vp9_fill_mv(s, b->mv[0], b->mode[0], 0);
  612. if (b->bs != BS_8x4) {
  613. b->mode[1] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
  614. s->prob.p.mv_mode[c]);
  615. s->counts.mv_mode[c][b->mode[1] - 10]++;
  616. ff_vp9_fill_mv(s, b->mv[1], b->mode[1], 1);
  617. } else {
  618. b->mode[1] = b->mode[0];
  619. AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
  620. AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
  621. }
  622. if (b->bs != BS_4x8) {
  623. b->mode[2] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
  624. s->prob.p.mv_mode[c]);
  625. s->counts.mv_mode[c][b->mode[2] - 10]++;
  626. ff_vp9_fill_mv(s, b->mv[2], b->mode[2], 2);
  627. if (b->bs != BS_8x4) {
  628. b->mode[3] = vp8_rac_get_tree(&s->c, ff_vp9_inter_mode_tree,
  629. s->prob.p.mv_mode[c]);
  630. s->counts.mv_mode[c][b->mode[3] - 10]++;
  631. ff_vp9_fill_mv(s, b->mv[3], b->mode[3], 3);
  632. } else {
  633. b->mode[3] = b->mode[2];
  634. AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
  635. AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
  636. }
  637. } else {
  638. b->mode[2] = b->mode[0];
  639. AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
  640. AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
  641. b->mode[3] = b->mode[1];
  642. AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
  643. AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
  644. }
  645. } else {
  646. ff_vp9_fill_mv(s, b->mv[0], b->mode[0], -1);
  647. AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
  648. AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
  649. AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
  650. AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
  651. AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
  652. AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
  653. }
  654. vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
  655. }
  656. #if HAVE_FAST_64BIT
  657. #define SPLAT_CTX(var, val, n) \
  658. switch (n) { \
  659. case 1: var = val; break; \
  660. case 2: AV_WN16A(&var, val * 0x0101); break; \
  661. case 4: AV_WN32A(&var, val * 0x01010101); break; \
  662. case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
  663. case 16: { \
  664. uint64_t v64 = val * 0x0101010101010101ULL; \
  665. AV_WN64A( &var, v64); \
  666. AV_WN64A(&((uint8_t *) &var)[8], v64); \
  667. break; \
  668. } \
  669. }
  670. #else
  671. #define SPLAT_CTX(var, val, n) \
  672. switch (n) { \
  673. case 1: var = val; break; \
  674. case 2: AV_WN16A(&var, val * 0x0101); break; \
  675. case 4: AV_WN32A(&var, val * 0x01010101); break; \
  676. case 8: { \
  677. uint32_t v32 = val * 0x01010101; \
  678. AV_WN32A( &var, v32); \
  679. AV_WN32A(&((uint8_t *) &var)[4], v32); \
  680. break; \
  681. } \
  682. case 16: { \
  683. uint32_t v32 = val * 0x01010101; \
  684. AV_WN32A( &var, v32); \
  685. AV_WN32A(&((uint8_t *) &var)[4], v32); \
  686. AV_WN32A(&((uint8_t *) &var)[8], v32); \
  687. AV_WN32A(&((uint8_t *) &var)[12], v32); \
  688. break; \
  689. } \
  690. }
  691. #endif
  692. switch (bwh_tab[1][b->bs][0]) {
  693. #define SET_CTXS(dir, off, n) \
  694. do { \
  695. SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
  696. SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
  697. SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
  698. if (!s->s.h.keyframe && !s->s.h.intraonly) { \
  699. SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
  700. SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
  701. SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
  702. if (!b->intra) { \
  703. SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
  704. if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
  705. SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
  706. } \
  707. } \
  708. } \
  709. } while (0)
  710. case 1: SET_CTXS(above, col, 1); break;
  711. case 2: SET_CTXS(above, col, 2); break;
  712. case 4: SET_CTXS(above, col, 4); break;
  713. case 8: SET_CTXS(above, col, 8); break;
  714. }
  715. switch (bwh_tab[1][b->bs][1]) {
  716. case 1: SET_CTXS(left, row7, 1); break;
  717. case 2: SET_CTXS(left, row7, 2); break;
  718. case 4: SET_CTXS(left, row7, 4); break;
  719. case 8: SET_CTXS(left, row7, 8); break;
  720. }
  721. #undef SPLAT_CTX
  722. #undef SET_CTXS
  723. if (!s->s.h.keyframe && !s->s.h.intraonly) {
  724. if (b->bs > BS_8x8) {
  725. int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
  726. AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
  727. AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
  728. AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
  729. AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
  730. AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
  731. AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
  732. AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
  733. AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
  734. } else {
  735. int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
  736. for (n = 0; n < w4 * 2; n++) {
  737. AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
  738. AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
  739. }
  740. for (n = 0; n < h4 * 2; n++) {
  741. AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
  742. AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
  743. }
  744. }
  745. }
  746. // FIXME kinda ugly
  747. for (y = 0; y < h4; y++) {
  748. int x, o = (row + y) * s->sb_cols * 8 + col;
  749. VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];
  750. if (b->intra) {
  751. for (x = 0; x < w4; x++) {
  752. mv[x].ref[0] =
  753. mv[x].ref[1] = -1;
  754. }
  755. } else if (b->comp) {
  756. for (x = 0; x < w4; x++) {
  757. mv[x].ref[0] = b->ref[0];
  758. mv[x].ref[1] = b->ref[1];
  759. AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
  760. AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
  761. }
  762. } else {
  763. for (x = 0; x < w4; x++) {
  764. mv[x].ref[0] = b->ref[0];
  765. mv[x].ref[1] = -1;
  766. AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
  767. }
  768. }
  769. }
  770. }
  771. // FIXME merge cnt/eob arguments?
  772. static av_always_inline int
  773. decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
  774. int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
  775. unsigned (*eob)[6][2], uint8_t (*p)[6][11],
  776. int nnz, const int16_t *scan, const int16_t (*nb)[2],
  777. const int16_t *band_counts, const int16_t *qmul)
  778. {
  779. int i = 0, band = 0, band_left = band_counts[band];
  780. uint8_t *tp = p[0][nnz];
  781. uint8_t cache[1024];
  782. do {
  783. int val, rc;
  784. val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
  785. eob[band][nnz][val]++;
  786. if (!val)
  787. break;
  788. skip_eob:
  789. if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
  790. cnt[band][nnz][0]++;
  791. if (!--band_left)
  792. band_left = band_counts[++band];
  793. cache[scan[i]] = 0;
  794. nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
  795. tp = p[band][nnz];
  796. if (++i == n_coeffs)
  797. break; //invalid input; blocks should end with EOB
  798. goto skip_eob;
  799. }
  800. rc = scan[i];
  801. if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
  802. cnt[band][nnz][1]++;
  803. val = 1;
  804. cache[rc] = 1;
  805. } else {
  806. // fill in p[3-10] (model fill) - only once per frame for each pos
  807. if (!tp[3])
  808. memcpy(&tp[3], ff_vp9_model_pareto8[tp[2]], 8);
  809. cnt[band][nnz][2]++;
  810. if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
  811. if (!vp56_rac_get_prob_branchy(c, tp[4])) {
  812. cache[rc] = val = 2;
  813. } else {
  814. val = 3 + vp56_rac_get_prob(c, tp[5]);
  815. cache[rc] = 3;
  816. }
  817. } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
  818. cache[rc] = 4;
  819. if (!vp56_rac_get_prob_branchy(c, tp[7])) {
  820. val = vp56_rac_get_prob(c, 159) + 5;
  821. } else {
  822. val = (vp56_rac_get_prob(c, 165) << 1) + 7;
  823. val += vp56_rac_get_prob(c, 145);
  824. }
  825. } else { // cat 3-6
  826. cache[rc] = 5;
  827. if (!vp56_rac_get_prob_branchy(c, tp[8])) {
  828. if (!vp56_rac_get_prob_branchy(c, tp[9])) {
  829. val = 11 + (vp56_rac_get_prob(c, 173) << 2);
  830. val += (vp56_rac_get_prob(c, 148) << 1);
  831. val += vp56_rac_get_prob(c, 140);
  832. } else {
  833. val = 19 + (vp56_rac_get_prob(c, 176) << 3);
  834. val += (vp56_rac_get_prob(c, 155) << 2);
  835. val += (vp56_rac_get_prob(c, 140) << 1);
  836. val += vp56_rac_get_prob(c, 135);
  837. }
  838. } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
  839. val = (vp56_rac_get_prob(c, 180) << 4) + 35;
  840. val += (vp56_rac_get_prob(c, 157) << 3);
  841. val += (vp56_rac_get_prob(c, 141) << 2);
  842. val += (vp56_rac_get_prob(c, 134) << 1);
  843. val += vp56_rac_get_prob(c, 130);
  844. } else {
  845. val = 67;
  846. if (!is8bitsperpixel) {
  847. if (bpp == 12) {
  848. val += vp56_rac_get_prob(c, 255) << 17;
  849. val += vp56_rac_get_prob(c, 255) << 16;
  850. }
  851. val += (vp56_rac_get_prob(c, 255) << 15);
  852. val += (vp56_rac_get_prob(c, 255) << 14);
  853. }
  854. val += (vp56_rac_get_prob(c, 254) << 13);
  855. val += (vp56_rac_get_prob(c, 254) << 12);
  856. val += (vp56_rac_get_prob(c, 254) << 11);
  857. val += (vp56_rac_get_prob(c, 252) << 10);
  858. val += (vp56_rac_get_prob(c, 249) << 9);
  859. val += (vp56_rac_get_prob(c, 243) << 8);
  860. val += (vp56_rac_get_prob(c, 230) << 7);
  861. val += (vp56_rac_get_prob(c, 196) << 6);
  862. val += (vp56_rac_get_prob(c, 177) << 5);
  863. val += (vp56_rac_get_prob(c, 153) << 4);
  864. val += (vp56_rac_get_prob(c, 140) << 3);
  865. val += (vp56_rac_get_prob(c, 133) << 2);
  866. val += (vp56_rac_get_prob(c, 130) << 1);
  867. val += vp56_rac_get_prob(c, 129);
  868. }
  869. }
  870. }
  871. #define STORE_COEF(c, i, v) do { \
  872. if (is8bitsperpixel) { \
  873. c[i] = v; \
  874. } else { \
  875. AV_WN32A(&c[i * 2], v); \
  876. } \
  877. } while (0)
  878. if (!--band_left)
  879. band_left = band_counts[++band];
  880. if (is_tx32x32)
  881. STORE_COEF(coef, rc, ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2);
  882. else
  883. STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * qmul[!!i]);
  884. nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
  885. tp = p[band][nnz];
  886. } while (++i < n_coeffs);
  887. return i;
  888. }
  889. static int decode_coeffs_b_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
  890. unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
  891. uint8_t (*p)[6][11], int nnz, const int16_t *scan,
  892. const int16_t (*nb)[2], const int16_t *band_counts,
  893. const int16_t *qmul)
  894. {
  895. return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
  896. nnz, scan, nb, band_counts, qmul);
  897. }
  898. static int decode_coeffs_b32_8bpp(VP9Context *s, int16_t *coef, int n_coeffs,
  899. unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
  900. uint8_t (*p)[6][11], int nnz, const int16_t *scan,
  901. const int16_t (*nb)[2], const int16_t *band_counts,
  902. const int16_t *qmul)
  903. {
  904. return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
  905. nnz, scan, nb, band_counts, qmul);
  906. }
  907. static int decode_coeffs_b_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
  908. unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
  909. uint8_t (*p)[6][11], int nnz, const int16_t *scan,
  910. const int16_t (*nb)[2], const int16_t *band_counts,
  911. const int16_t *qmul)
  912. {
  913. return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 0, 0, s->s.h.bpp, cnt, eob, p,
  914. nnz, scan, nb, band_counts, qmul);
  915. }
  916. static int decode_coeffs_b32_16bpp(VP9Context *s, int16_t *coef, int n_coeffs,
  917. unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
  918. uint8_t (*p)[6][11], int nnz, const int16_t *scan,
  919. const int16_t (*nb)[2], const int16_t *band_counts,
  920. const int16_t *qmul)
  921. {
  922. return decode_coeffs_b_generic(&s->c, coef, n_coeffs, 1, 0, s->s.h.bpp, cnt, eob, p,
  923. nnz, scan, nb, band_counts, qmul);
  924. }
  925. static av_always_inline int decode_coeffs(AVCodecContext *avctx, int is8bitsperpixel)
  926. {
  927. VP9Context *s = avctx->priv_data;
  928. VP9Block *b = s->b;
  929. int row = s->row, col = s->col;
  930. uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
  931. unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
  932. unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
  933. int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
  934. int end_x = FFMIN(2 * (s->cols - col), w4);
  935. int end_y = FFMIN(2 * (s->rows - row), h4);
  936. int n, pl, x, y, ret;
  937. int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
  938. int tx = 4 * s->s.h.lossless + b->tx;
  939. const int16_t * const *yscans = ff_vp9_scans[tx];
  940. const int16_t (* const *ynbs)[2] = ff_vp9_scans_nb[tx];
  941. const int16_t *uvscan = ff_vp9_scans[b->uvtx][DCT_DCT];
  942. const int16_t (*uvnb)[2] = ff_vp9_scans_nb[b->uvtx][DCT_DCT];
  943. uint8_t *a = &s->above_y_nnz_ctx[col * 2];
  944. uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
  945. static const int16_t band_counts[4][8] = {
  946. { 1, 2, 3, 4, 3, 16 - 13 },
  947. { 1, 2, 3, 4, 11, 64 - 21 },
  948. { 1, 2, 3, 4, 11, 256 - 21 },
  949. { 1, 2, 3, 4, 11, 1024 - 21 },
  950. };
  951. const int16_t *y_band_counts = band_counts[b->tx];
  952. const int16_t *uv_band_counts = band_counts[b->uvtx];
  953. int bytesperpixel = is8bitsperpixel ? 1 : 2;
  954. int total_coeff = 0;
  955. #define MERGE(la, end, step, rd) \
  956. for (n = 0; n < end; n += step) \
  957. la[n] = !!rd(&la[n])
  958. #define MERGE_CTX(step, rd) \
  959. do { \
  960. MERGE(l, end_y, step, rd); \
  961. MERGE(a, end_x, step, rd); \
  962. } while (0)
  963. #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
  964. for (n = 0, y = 0; y < end_y; y += step) { \
  965. for (x = 0; x < end_x; x += step, n += step * step) { \
  966. enum TxfmType txtp = ff_vp9_intra_txfm_type[b->mode[mode_index]]; \
  967. ret = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
  968. (s, s->block + 16 * n * bytesperpixel, 16 * step * step, \
  969. c, e, p, a[x] + l[y], yscans[txtp], \
  970. ynbs[txtp], y_band_counts, qmul[0]); \
  971. a[x] = l[y] = !!ret; \
  972. total_coeff |= !!ret; \
  973. if (step >= 4) { \
  974. AV_WN16A(&s->eob[n], ret); \
  975. } else { \
  976. s->eob[n] = ret; \
  977. } \
  978. } \
  979. }
  980. #define SPLAT(la, end, step, cond) \
  981. if (step == 2) { \
  982. for (n = 1; n < end; n += step) \
  983. la[n] = la[n - 1]; \
  984. } else if (step == 4) { \
  985. if (cond) { \
  986. for (n = 0; n < end; n += step) \
  987. AV_WN32A(&la[n], la[n] * 0x01010101); \
  988. } else { \
  989. for (n = 0; n < end; n += step) \
  990. memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
  991. } \
  992. } else /* step == 8 */ { \
  993. if (cond) { \
  994. if (HAVE_FAST_64BIT) { \
  995. for (n = 0; n < end; n += step) \
  996. AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
  997. } else { \
  998. for (n = 0; n < end; n += step) { \
  999. uint32_t v32 = la[n] * 0x01010101; \
  1000. AV_WN32A(&la[n], v32); \
  1001. AV_WN32A(&la[n + 4], v32); \
  1002. } \
  1003. } \
  1004. } else { \
  1005. for (n = 0; n < end; n += step) \
  1006. memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
  1007. } \
  1008. }
  1009. #define SPLAT_CTX(step) \
  1010. do { \
  1011. SPLAT(a, end_x, step, end_x == w4); \
  1012. SPLAT(l, end_y, step, end_y == h4); \
  1013. } while (0)
  1014. /* y tokens */
  1015. switch (b->tx) {
  1016. case TX_4X4:
  1017. DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
  1018. break;
  1019. case TX_8X8:
  1020. MERGE_CTX(2, AV_RN16A);
  1021. DECODE_Y_COEF_LOOP(2, 0,);
  1022. SPLAT_CTX(2);
  1023. break;
  1024. case TX_16X16:
  1025. MERGE_CTX(4, AV_RN32A);
  1026. DECODE_Y_COEF_LOOP(4, 0,);
  1027. SPLAT_CTX(4);
  1028. break;
  1029. case TX_32X32:
  1030. MERGE_CTX(8, AV_RN64A);
  1031. DECODE_Y_COEF_LOOP(8, 0, 32);
  1032. SPLAT_CTX(8);
  1033. break;
  1034. }
  1035. #define DECODE_UV_COEF_LOOP(step, v) \
  1036. for (n = 0, y = 0; y < end_y; y += step) { \
  1037. for (x = 0; x < end_x; x += step, n += step * step) { \
  1038. ret = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
  1039. (s, s->uvblock[pl] + 16 * n * bytesperpixel, \
  1040. 16 * step * step, c, e, p, a[x] + l[y], \
  1041. uvscan, uvnb, uv_band_counts, qmul[1]); \
  1042. a[x] = l[y] = !!ret; \
  1043. total_coeff |= !!ret; \
  1044. if (step >= 4) { \
  1045. AV_WN16A(&s->uveob[pl][n], ret); \
  1046. } else { \
  1047. s->uveob[pl][n] = ret; \
  1048. } \
  1049. } \
  1050. }
  1051. p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
  1052. c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
  1053. e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
  1054. w4 >>= s->ss_h;
  1055. end_x >>= s->ss_h;
  1056. h4 >>= s->ss_v;
  1057. end_y >>= s->ss_v;
  1058. for (pl = 0; pl < 2; pl++) {
  1059. a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
  1060. l = &s->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
  1061. switch (b->uvtx) {
  1062. case TX_4X4:
  1063. DECODE_UV_COEF_LOOP(1,);
  1064. break;
  1065. case TX_8X8:
  1066. MERGE_CTX(2, AV_RN16A);
  1067. DECODE_UV_COEF_LOOP(2,);
  1068. SPLAT_CTX(2);
  1069. break;
  1070. case TX_16X16:
  1071. MERGE_CTX(4, AV_RN32A);
  1072. DECODE_UV_COEF_LOOP(4,);
  1073. SPLAT_CTX(4);
  1074. break;
  1075. case TX_32X32:
  1076. MERGE_CTX(8, AV_RN64A);
  1077. DECODE_UV_COEF_LOOP(8, 32);
  1078. SPLAT_CTX(8);
  1079. break;
  1080. }
  1081. }
  1082. return total_coeff;
  1083. }
  1084. static int decode_coeffs_8bpp(AVCodecContext *avctx)
  1085. {
  1086. return decode_coeffs(avctx, 1);
  1087. }
  1088. static int decode_coeffs_16bpp(AVCodecContext *avctx)
  1089. {
  1090. return decode_coeffs(avctx, 0);
  1091. }
  1092. static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
  1093. uint8_t *dst_edge, ptrdiff_t stride_edge,
  1094. uint8_t *dst_inner, ptrdiff_t stride_inner,
  1095. uint8_t *l, int col, int x, int w,
  1096. int row, int y, enum TxfmMode tx,
  1097. int p, int ss_h, int ss_v, int bytesperpixel)
  1098. {
  1099. int have_top = row > 0 || y > 0;
  1100. int have_left = col > s->tile_col_start || x > 0;
  1101. int have_right = x < w - 1;
  1102. int bpp = s->s.h.bpp;
  1103. static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
  1104. [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
  1105. { DC_127_PRED, VERT_PRED } },
  1106. [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
  1107. { HOR_PRED, HOR_PRED } },
  1108. [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
  1109. { LEFT_DC_PRED, DC_PRED } },
  1110. [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
  1111. { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
  1112. [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
  1113. { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
  1114. [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
  1115. { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
  1116. [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
  1117. { HOR_DOWN_PRED, HOR_DOWN_PRED } },
  1118. [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
  1119. { DC_127_PRED, VERT_LEFT_PRED } },
  1120. [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
  1121. { HOR_UP_PRED, HOR_UP_PRED } },
  1122. [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
  1123. { HOR_PRED, TM_VP8_PRED } },
  1124. };
  1125. static const struct {
  1126. uint8_t needs_left:1;
  1127. uint8_t needs_top:1;
  1128. uint8_t needs_topleft:1;
  1129. uint8_t needs_topright:1;
  1130. uint8_t invert_left:1;
  1131. } edges[N_INTRA_PRED_MODES] = {
  1132. [VERT_PRED] = { .needs_top = 1 },
  1133. [HOR_PRED] = { .needs_left = 1 },
  1134. [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
  1135. [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
  1136. [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1,
  1137. .needs_topleft = 1 },
  1138. [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1,
  1139. .needs_topleft = 1 },
  1140. [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1,
  1141. .needs_topleft = 1 },
  1142. [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
  1143. [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
  1144. [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1,
  1145. .needs_topleft = 1 },
  1146. [LEFT_DC_PRED] = { .needs_left = 1 },
  1147. [TOP_DC_PRED] = { .needs_top = 1 },
  1148. [DC_128_PRED] = { 0 },
  1149. [DC_127_PRED] = { 0 },
  1150. [DC_129_PRED] = { 0 }
  1151. };
  1152. av_assert2(mode >= 0 && mode < 10);
  1153. mode = mode_conv[mode][have_left][have_top];
  1154. if (edges[mode].needs_top) {
  1155. uint8_t *top, *topleft;
  1156. int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !ss_h) - x) * 4;
  1157. int n_px_need_tr = 0;
  1158. if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
  1159. n_px_need_tr = 4;
  1160. // if top of sb64-row, use s->intra_pred_data[] instead of
  1161. // dst[-stride] for intra prediction (it contains pre- instead of
  1162. // post-loopfilter data)
  1163. if (have_top) {
  1164. top = !(row & 7) && !y ?
  1165. s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
  1166. y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
  1167. if (have_left)
  1168. topleft = !(row & 7) && !y ?
  1169. s->intra_pred_data[p] + (col * (8 >> ss_h) + x * 4) * bytesperpixel :
  1170. y == 0 || x == 0 ? &dst_edge[-stride_edge] :
  1171. &dst_inner[-stride_inner];
  1172. }
  1173. if (have_top &&
  1174. (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
  1175. (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
  1176. n_px_need + n_px_need_tr <= n_px_have) {
  1177. *a = top;
  1178. } else {
  1179. if (have_top) {
  1180. if (n_px_need <= n_px_have) {
  1181. memcpy(*a, top, n_px_need * bytesperpixel);
  1182. } else {
  1183. #define memset_bpp(c, i1, v, i2, num) do { \
  1184. if (bytesperpixel == 1) { \
  1185. memset(&(c)[(i1)], (v)[(i2)], (num)); \
  1186. } else { \
  1187. int n, val = AV_RN16A(&(v)[(i2) * 2]); \
  1188. for (n = 0; n < (num); n++) { \
  1189. AV_WN16A(&(c)[((i1) + n) * 2], val); \
  1190. } \
  1191. } \
  1192. } while (0)
  1193. memcpy(*a, top, n_px_have * bytesperpixel);
  1194. memset_bpp(*a, n_px_have, (*a), n_px_have - 1, n_px_need - n_px_have);
  1195. }
  1196. } else {
  1197. #define memset_val(c, val, num) do { \
  1198. if (bytesperpixel == 1) { \
  1199. memset((c), (val), (num)); \
  1200. } else { \
  1201. int n; \
  1202. for (n = 0; n < (num); n++) { \
  1203. AV_WN16A(&(c)[n * 2], (val)); \
  1204. } \
  1205. } \
  1206. } while (0)
  1207. memset_val(*a, (128 << (bpp - 8)) - 1, n_px_need);
  1208. }
  1209. if (edges[mode].needs_topleft) {
  1210. if (have_left && have_top) {
  1211. #define assign_bpp(c, i1, v, i2) do { \
  1212. if (bytesperpixel == 1) { \
  1213. (c)[(i1)] = (v)[(i2)]; \
  1214. } else { \
  1215. AV_COPY16(&(c)[(i1) * 2], &(v)[(i2) * 2]); \
  1216. } \
  1217. } while (0)
  1218. assign_bpp(*a, -1, topleft, -1);
  1219. } else {
  1220. #define assign_val(c, i, v) do { \
  1221. if (bytesperpixel == 1) { \
  1222. (c)[(i)] = (v); \
  1223. } else { \
  1224. AV_WN16A(&(c)[(i) * 2], (v)); \
  1225. } \
  1226. } while (0)
  1227. assign_val((*a), -1, (128 << (bpp - 8)) + (have_top ? +1 : -1));
  1228. }
  1229. }
  1230. if (tx == TX_4X4 && edges[mode].needs_topright) {
  1231. if (have_top && have_right &&
  1232. n_px_need + n_px_need_tr <= n_px_have) {
  1233. memcpy(&(*a)[4 * bytesperpixel], &top[4 * bytesperpixel], 4 * bytesperpixel);
  1234. } else {
  1235. memset_bpp(*a, 4, *a, 3, 4);
  1236. }
  1237. }
  1238. }
  1239. }
  1240. if (edges[mode].needs_left) {
  1241. if (have_left) {
  1242. int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !ss_v) - y) * 4;
  1243. uint8_t *dst = x == 0 ? dst_edge : dst_inner;
  1244. ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
  1245. if (edges[mode].invert_left) {
  1246. if (n_px_need <= n_px_have) {
  1247. for (i = 0; i < n_px_need; i++)
  1248. assign_bpp(l, i, &dst[i * stride], -1);
  1249. } else {
  1250. for (i = 0; i < n_px_have; i++)
  1251. assign_bpp(l, i, &dst[i * stride], -1);
  1252. memset_bpp(l, n_px_have, l, n_px_have - 1, n_px_need - n_px_have);
  1253. }
  1254. } else {
  1255. if (n_px_need <= n_px_have) {
  1256. for (i = 0; i < n_px_need; i++)
  1257. assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
  1258. } else {
  1259. for (i = 0; i < n_px_have; i++)
  1260. assign_bpp(l, n_px_need - 1 - i, &dst[i * stride], -1);
  1261. memset_bpp(l, 0, l, n_px_need - n_px_have, n_px_need - n_px_have);
  1262. }
  1263. }
  1264. } else {
  1265. memset_val(l, (128 << (bpp - 8)) + 1, 4 << tx);
  1266. }
  1267. }
  1268. return mode;
  1269. }
  1270. static av_always_inline void intra_recon(AVCodecContext *avctx, ptrdiff_t y_off,
  1271. ptrdiff_t uv_off, int bytesperpixel)
  1272. {
  1273. VP9Context *s = avctx->priv_data;
  1274. VP9Block *b = s->b;
  1275. int row = s->row, col = s->col;
  1276. int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
  1277. int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
  1278. int end_x = FFMIN(2 * (s->cols - col), w4);
  1279. int end_y = FFMIN(2 * (s->rows - row), h4);
  1280. int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
  1281. int uvstep1d = 1 << b->uvtx, p;
  1282. uint8_t *dst = s->dst[0], *dst_r = s->s.frames[CUR_FRAME].tf.f->data[0] + y_off;
  1283. LOCAL_ALIGNED_32(uint8_t, a_buf, [96]);
  1284. LOCAL_ALIGNED_32(uint8_t, l, [64]);
  1285. for (n = 0, y = 0; y < end_y; y += step1d) {
  1286. uint8_t *ptr = dst, *ptr_r = dst_r;
  1287. for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d * bytesperpixel,
  1288. ptr_r += 4 * step1d * bytesperpixel, n += step) {
  1289. int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
  1290. y * 2 + x : 0];
  1291. uint8_t *a = &a_buf[32];
  1292. enum TxfmType txtp = ff_vp9_intra_txfm_type[mode];
  1293. int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
  1294. mode = check_intra_mode(s, mode, &a, ptr_r,
  1295. s->s.frames[CUR_FRAME].tf.f->linesize[0],
  1296. ptr, s->y_stride, l,
  1297. col, x, w4, row, y, b->tx, 0, 0, 0, bytesperpixel);
  1298. s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
  1299. if (eob)
  1300. s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
  1301. s->block + 16 * n * bytesperpixel, eob);
  1302. }
  1303. dst_r += 4 * step1d * s->s.frames[CUR_FRAME].tf.f->linesize[0];
  1304. dst += 4 * step1d * s->y_stride;
  1305. }
  1306. // U/V
  1307. w4 >>= s->ss_h;
  1308. end_x >>= s->ss_h;
  1309. end_y >>= s->ss_v;
  1310. step = 1 << (b->uvtx * 2);
  1311. for (p = 0; p < 2; p++) {
  1312. dst = s->dst[1 + p];
  1313. dst_r = s->s.frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
  1314. for (n = 0, y = 0; y < end_y; y += uvstep1d) {
  1315. uint8_t *ptr = dst, *ptr_r = dst_r;
  1316. for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d * bytesperpixel,
  1317. ptr_r += 4 * uvstep1d * bytesperpixel, n += step) {
  1318. int mode = b->uvmode;
  1319. uint8_t *a = &a_buf[32];
  1320. int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
  1321. mode = check_intra_mode(s, mode, &a, ptr_r,
  1322. s->s.frames[CUR_FRAME].tf.f->linesize[1],
  1323. ptr, s->uv_stride, l, col, x, w4, row, y,
  1324. b->uvtx, p + 1, s->ss_h, s->ss_v, bytesperpixel);
  1325. s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
  1326. if (eob)
  1327. s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
  1328. s->uvblock[p] + 16 * n * bytesperpixel, eob);
  1329. }
  1330. dst_r += 4 * uvstep1d * s->s.frames[CUR_FRAME].tf.f->linesize[1];
  1331. dst += 4 * uvstep1d * s->uv_stride;
  1332. }
  1333. }
  1334. }
  1335. static void intra_recon_8bpp(AVCodecContext *avctx, ptrdiff_t y_off, ptrdiff_t uv_off)
  1336. {
  1337. intra_recon(avctx, y_off, uv_off, 1);
  1338. }
  1339. static void intra_recon_16bpp(AVCodecContext *avctx, ptrdiff_t y_off, ptrdiff_t uv_off)
  1340. {
  1341. intra_recon(avctx, y_off, uv_off, 2);
  1342. }
  1343. static av_always_inline void mc_luma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
  1344. uint8_t *dst, ptrdiff_t dst_stride,
  1345. const uint8_t *ref, ptrdiff_t ref_stride,
  1346. ThreadFrame *ref_frame,
  1347. ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
  1348. int bw, int bh, int w, int h, int bytesperpixel)
  1349. {
  1350. int mx = mv->x, my = mv->y, th;
  1351. y += my >> 3;
  1352. x += mx >> 3;
  1353. ref += y * ref_stride + x * bytesperpixel;
  1354. mx &= 7;
  1355. my &= 7;
  1356. // FIXME bilinear filter only needs 0/1 pixels, not 3/4
  1357. // we use +7 because the last 7 pixels of each sbrow can be changed in
  1358. // the longest loopfilter of the next sbrow
  1359. th = (y + bh + 4 * !!my + 7) >> 6;
  1360. ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
  1361. // The arm/aarch64 _hv filters read one more row than what actually is
  1362. // needed, so switch to emulated edge one pixel sooner vertically
  1363. // (!!my * 5) than horizontally (!!mx * 4).
  1364. if (x < !!mx * 3 || y < !!my * 3 ||
  1365. x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
  1366. s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
  1367. ref - !!my * 3 * ref_stride - !!mx * 3 * bytesperpixel,
  1368. 160, ref_stride,
  1369. bw + !!mx * 7, bh + !!my * 7,
  1370. x - !!mx * 3, y - !!my * 3, w, h);
  1371. ref = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
  1372. ref_stride = 160;
  1373. }
  1374. mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
  1375. }
  1376. static av_always_inline void mc_chroma_unscaled(VP9Context *s, vp9_mc_func (*mc)[2],
  1377. uint8_t *dst_u, uint8_t *dst_v,
  1378. ptrdiff_t dst_stride,
  1379. const uint8_t *ref_u, ptrdiff_t src_stride_u,
  1380. const uint8_t *ref_v, ptrdiff_t src_stride_v,
  1381. ThreadFrame *ref_frame,
  1382. ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
  1383. int bw, int bh, int w, int h, int bytesperpixel)
  1384. {
  1385. int mx = mv->x * (1 << !s->ss_h), my = mv->y * (1 << !s->ss_v), th;
  1386. y += my >> 4;
  1387. x += mx >> 4;
  1388. ref_u += y * src_stride_u + x * bytesperpixel;
  1389. ref_v += y * src_stride_v + x * bytesperpixel;
  1390. mx &= 15;
  1391. my &= 15;
  1392. // FIXME bilinear filter only needs 0/1 pixels, not 3/4
  1393. // we use +7 because the last 7 pixels of each sbrow can be changed in
  1394. // the longest loopfilter of the next sbrow
  1395. th = (y + bh + 4 * !!my + 7) >> (6 - s->ss_v);
  1396. ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
  1397. // The arm/aarch64 _hv filters read one more row than what actually is
  1398. // needed, so switch to emulated edge one pixel sooner vertically
  1399. // (!!my * 5) than horizontally (!!mx * 4).
  1400. if (x < !!mx * 3 || y < !!my * 3 ||
  1401. x + !!mx * 4 > w - bw || y + !!my * 5 > h - bh) {
  1402. s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
  1403. ref_u - !!my * 3 * src_stride_u - !!mx * 3 * bytesperpixel,
  1404. 160, src_stride_u,
  1405. bw + !!mx * 7, bh + !!my * 7,
  1406. x - !!mx * 3, y - !!my * 3, w, h);
  1407. ref_u = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
  1408. mc[!!mx][!!my](dst_u, dst_stride, ref_u, 160, bh, mx, my);
  1409. s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
  1410. ref_v - !!my * 3 * src_stride_v - !!mx * 3 * bytesperpixel,
  1411. 160, src_stride_v,
  1412. bw + !!mx * 7, bh + !!my * 7,
  1413. x - !!mx * 3, y - !!my * 3, w, h);
  1414. ref_v = s->edge_emu_buffer + !!my * 3 * 160 + !!mx * 3 * bytesperpixel;
  1415. mc[!!mx][!!my](dst_v, dst_stride, ref_v, 160, bh, mx, my);
  1416. } else {
  1417. mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
  1418. mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
  1419. }
  1420. }
  1421. #define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
  1422. px, py, pw, ph, bw, bh, w, h, i) \
  1423. mc_luma_unscaled(s, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
  1424. mv, bw, bh, w, h, bytesperpixel)
  1425. #define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
  1426. row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
  1427. mc_chroma_unscaled(s, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
  1428. row, col, mv, bw, bh, w, h, bytesperpixel)
  1429. #define SCALED 0
  1430. #define FN(x) x##_8bpp
  1431. #define BYTES_PER_PIXEL 1
  1432. #include "vp9_mc_template.c"
  1433. #undef FN
  1434. #undef BYTES_PER_PIXEL
  1435. #define FN(x) x##_16bpp
  1436. #define BYTES_PER_PIXEL 2
  1437. #include "vp9_mc_template.c"
  1438. #undef mc_luma_dir
  1439. #undef mc_chroma_dir
  1440. #undef FN
  1441. #undef BYTES_PER_PIXEL
  1442. #undef SCALED
  1443. static av_always_inline void mc_luma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
  1444. vp9_mc_func (*mc)[2],
  1445. uint8_t *dst, ptrdiff_t dst_stride,
  1446. const uint8_t *ref, ptrdiff_t ref_stride,
  1447. ThreadFrame *ref_frame,
  1448. ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
  1449. int px, int py, int pw, int ph,
  1450. int bw, int bh, int w, int h, int bytesperpixel,
  1451. const uint16_t *scale, const uint8_t *step)
  1452. {
  1453. if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
  1454. s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
  1455. mc_luma_unscaled(s, mc, dst, dst_stride, ref, ref_stride, ref_frame,
  1456. y, x, in_mv, bw, bh, w, h, bytesperpixel);
  1457. } else {
  1458. #define scale_mv(n, dim) (((int64_t)(n) * scale[dim]) >> 14)
  1459. int mx, my;
  1460. int refbw_m1, refbh_m1;
  1461. int th;
  1462. VP56mv mv;
  1463. mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
  1464. mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
  1465. // BUG libvpx seems to scale the two components separately. This introduces
  1466. // rounding errors but we have to reproduce them to be exactly compatible
  1467. // with the output from libvpx...
  1468. mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
  1469. my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
  1470. y = my >> 4;
  1471. x = mx >> 4;
  1472. ref += y * ref_stride + x * bytesperpixel;
  1473. mx &= 15;
  1474. my &= 15;
  1475. refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
  1476. refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
  1477. // FIXME bilinear filter only needs 0/1 pixels, not 3/4
  1478. // we use +7 because the last 7 pixels of each sbrow can be changed in
  1479. // the longest loopfilter of the next sbrow
        th = (y + refbh_m1 + 4 + 7) >> 6;
        ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
        // The arm/aarch64 _hv filters read one more row than what actually is
        // needed, so switch to emulated edge one pixel sooner vertically
        // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
        if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
            s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                     ref - 3 * ref_stride - 3 * bytesperpixel,
                                     288, ref_stride,
                                     refbw_m1 + 8, refbh_m1 + 8,
                                     x - 3, y - 3, w, h);
            ref = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
            ref_stride = 288;
        }
        smc(dst, dst_stride, ref, ref_stride, bh, mx, my, step[0], step[1]);
    }
}

static av_always_inline void mc_chroma_scaled(VP9Context *s, vp9_scaled_mc_func smc,
                                              vp9_mc_func (*mc)[2],
                                              uint8_t *dst_u, uint8_t *dst_v,
                                              ptrdiff_t dst_stride,
                                              const uint8_t *ref_u, ptrdiff_t src_stride_u,
                                              const uint8_t *ref_v, ptrdiff_t src_stride_v,
                                              ThreadFrame *ref_frame,
                                              ptrdiff_t y, ptrdiff_t x, const VP56mv *in_mv,
                                              int px, int py, int pw, int ph,
                                              int bw, int bh, int w, int h, int bytesperpixel,
                                              const uint16_t *scale, const uint8_t *step)
{
    if (s->s.frames[CUR_FRAME].tf.f->width == ref_frame->f->width &&
        s->s.frames[CUR_FRAME].tf.f->height == ref_frame->f->height) {
        mc_chroma_unscaled(s, mc, dst_u, dst_v, dst_stride, ref_u, src_stride_u,
                           ref_v, src_stride_v, ref_frame,
                           y, x, in_mv, bw, bh, w, h, bytesperpixel);
    } else {
        int mx, my;
        int refbw_m1, refbh_m1;
        int th;
        VP56mv mv;

        if (s->ss_h) {
            // BUG https://code.google.com/p/webm/issues/detail?id=820
            mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 16, (s->cols * 4 - x + px + 3) * 16);
            mx = scale_mv(mv.x, 0) + (scale_mv(x * 16, 0) & ~15) + (scale_mv(x * 32, 0) & 15);
        } else {
            mv.x = av_clip(in_mv->x, -(x + pw - px + 4) * 8, (s->cols * 8 - x + px + 3) * 8);
            mx = scale_mv(mv.x * 2, 0) + scale_mv(x * 16, 0);
        }
        if (s->ss_v) {
            // BUG https://code.google.com/p/webm/issues/detail?id=820
            mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 16, (s->rows * 4 - y + py + 3) * 16);
            my = scale_mv(mv.y, 1) + (scale_mv(y * 16, 1) & ~15) + (scale_mv(y * 32, 1) & 15);
        } else {
            mv.y = av_clip(in_mv->y, -(y + ph - py + 4) * 8, (s->rows * 8 - y + py + 3) * 8);
            my = scale_mv(mv.y * 2, 1) + scale_mv(y * 16, 1);
        }
#undef scale_mv
        y = my >> 4;
        x = mx >> 4;
        ref_u += y * src_stride_u + x * bytesperpixel;
        ref_v += y * src_stride_v + x * bytesperpixel;
        mx &= 15;
        my &= 15;
        refbw_m1 = ((bw - 1) * step[0] + mx) >> 4;
        refbh_m1 = ((bh - 1) * step[1] + my) >> 4;
        // FIXME bilinear filter only needs 0/1 pixels, not 3/4
        // we use +7 because the last 7 pixels of each sbrow can be changed in
        // the longest loopfilter of the next sbrow
        th = (y + refbh_m1 + 4 + 7) >> (6 - s->ss_v);
        ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
        // The arm/aarch64 _hv filters read one more row than what actually is
        // needed, so switch to emulated edge one pixel sooner vertically
        // (y + 5 >= h - refbh_m1) than horizontally (x + 4 >= w - refbw_m1).
        if (x < 3 || y < 3 || x + 4 >= w - refbw_m1 || y + 5 >= h - refbh_m1) {
            s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                     ref_u - 3 * src_stride_u - 3 * bytesperpixel,
                                     288, src_stride_u,
                                     refbw_m1 + 8, refbh_m1 + 8,
                                     x - 3, y - 3, w, h);
            ref_u = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
            smc(dst_u, dst_stride, ref_u, 288, bh, mx, my, step[0], step[1]);
            s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
                                     ref_v - 3 * src_stride_v - 3 * bytesperpixel,
                                     288, src_stride_v,
                                     refbw_m1 + 8, refbh_m1 + 8,
                                     x - 3, y - 3, w, h);
            ref_v = s->edge_emu_buffer + 3 * 288 + 3 * bytesperpixel;
            smc(dst_v, dst_stride, ref_v, 288, bh, mx, my, step[0], step[1]);
        } else {
            smc(dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my, step[0], step[1]);
            smc(dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my, step[0], step[1]);
        }
    }
}
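
/*
 * Same wrapper-macro pattern as the unscaled case above, but these versions
 * also forward the prediction block position and size (px/py/pw/ph) and the
 * per-reference mvscale/mvstep tables, which mc_luma_scaled() and
 * mc_chroma_scaled() use to clip and rescale motion vectors when the
 * reference frame and the current frame differ in resolution.
 */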
#define mc_luma_dir(s, mc, dst, dst_ls, src, src_ls, tref, row, col, mv, \
                    px, py, pw, ph, bw, bh, w, h, i) \
    mc_luma_scaled(s, s->dsp.s##mc, s->dsp.mc, dst, dst_ls, src, src_ls, tref, row, col, \
                   mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
                   s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])
#define mc_chroma_dir(s, mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                      row, col, mv, px, py, pw, ph, bw, bh, w, h, i) \
    mc_chroma_scaled(s, s->dsp.s##mc, s->dsp.mc, dstu, dstv, dst_ls, srcu, srcu_ls, srcv, srcv_ls, tref, \
                     row, col, mv, px, py, pw, ph, bw, bh, w, h, bytesperpixel, \
                     s->mvscale[b->ref[i]], s->mvstep[b->ref[i]])

#define SCALED 1
#define FN(x) x##_scaled_8bpp
#define BYTES_PER_PIXEL 1
#include "vp9_mc_template.c"
#undef FN
#undef BYTES_PER_PIXEL
#define FN(x) x##_scaled_16bpp
#define BYTES_PER_PIXEL 2
#include "vp9_mc_template.c"
#undef mc_luma_dir
#undef mc_chroma_dir
#undef FN
#undef BYTES_PER_PIXEL
#undef SCALED

static av_always_inline void inter_recon(AVCodecContext *avctx, int bytesperpixel)
{
    VP9Context *s = avctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col;

    if (s->mvscale[b->ref[0]][0] || (b->comp && s->mvscale[b->ref[1]][0])) {
        if (bytesperpixel == 1) {
            inter_pred_scaled_8bpp(avctx);
        } else {
            inter_pred_scaled_16bpp(avctx);
        }
    } else {
        if (bytesperpixel == 1) {
            inter_pred_8bpp(avctx);
        } else {
            inter_pred_16bpp(avctx);
        }
    }

    if (!b->skip) {
        /* mostly copied intra_recon() */
        int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
        int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
        int end_x = FFMIN(2 * (s->cols - col), w4);
        int end_y = FFMIN(2 * (s->rows - row), h4);
        int tx = 4 * s->s.h.lossless + b->tx, uvtx = b->uvtx + 4 * s->s.h.lossless;
        int uvstep1d = 1 << b->uvtx, p;
        uint8_t *dst = s->dst[0];

        // y itxfm add
        for (n = 0, y = 0; y < end_y; y += step1d) {
            uint8_t *ptr = dst;
            for (x = 0; x < end_x; x += step1d,
                 ptr += 4 * step1d * bytesperpixel, n += step) {
                int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];

                if (eob)
                    s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
                                                  s->block + 16 * n * bytesperpixel, eob);
            }
            dst += 4 * s->y_stride * step1d;
        }

        // uv itxfm add
        end_x >>= s->ss_h;
        end_y >>= s->ss_v;
        step = 1 << (b->uvtx * 2);
        for (p = 0; p < 2; p++) {
            dst = s->dst[p + 1];
            for (n = 0, y = 0; y < end_y; y += uvstep1d) {
                uint8_t *ptr = dst;
                for (x = 0; x < end_x; x += uvstep1d,
                     ptr += 4 * uvstep1d * bytesperpixel, n += step) {
                    int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];

                    if (eob)
                        s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
                                                        s->uvblock[p] + 16 * n * bytesperpixel, eob);
                }
                dst += 4 * uvstep1d * s->uv_stride;
            }
        }
    }
}

static void inter_recon_8bpp(AVCodecContext *avctx)
{
    inter_recon(avctx, 1);
}

static void inter_recon_16bpp(AVCodecContext *avctx)
{
    inter_recon(avctx, 2);
}

static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
                                        int row_and_7, int col_and_7,
                                        int w, int h, int col_end, int row_end,
                                        enum TxfmMode tx, int skip_inter)
{
    static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
    static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };

    // FIXME I'm pretty sure all loops can be replaced by a single LUT if
    // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
    // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
    // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)

    // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
    // edges. This means that for UV, we work on two subsampled blocks at
    // a time, and we only use the topleft block's mode information to set
    // things like block strength. Thus, for any block size smaller than
    // 16x16, ignore the odd portion of the block.
    if (tx == TX_4X4 && (ss_v | ss_h)) {
        if (h == ss_v) {
            if (row_and_7 & 1)
                return;
            if (!row_end)
                h += 1;
        }
        if (w == ss_h) {
            if (col_and_7 & 1)
                return;
            if (!col_end)
                w += 1;
        }
    }
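
    // Bitmask bookkeeping used below: each bit of a mask entry corresponds to
    // one 8x8 unit column inside the 64x64 superblock. With
    // t = 1 << col_and_7 and m_col = (t << w) - t, m_col has w consecutive
    // bits set starting at bit col_and_7; e.g. col_and_7 == 2 and w == 3 give
    // t == 0x04 and m_col == (0x04 << 3) - 0x04 == 0x1c.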

    if (tx == TX_4X4 && !skip_inter) {
        int t = 1 << col_and_7, m_col = (t << w) - t, y;
        // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
        int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;

        for (y = row_and_7; y < h + row_and_7; y++) {
            int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);

            mask[0][y][1] |= m_row_8;
            mask[0][y][2] |= m_row_4;
            // for odd lines, if the odd col is not being filtered,
            // skip odd row also:
            // .---. <-- a
            // |   |
            // |___| <-- b
            // ^   ^
            // c   d
            //
            // if a/c are even row/col and b/d are odd, and d is skipped,
            // e.g. right edge of size-66x66.webm, then skip b also (bug)
            if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
                mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
            } else {
                mask[1][y][col_mask_id] |= m_col;
            }
            if (!ss_h)
                mask[0][y][3] |= m_col;
            if (!ss_v) {
                if (ss_h && (col_end & 1))
                    mask[1][y][3] |= (t << (w - 1)) - t;
                else
                    mask[1][y][3] |= m_col;
            }
        }
    } else {
        int y, t = 1 << col_and_7, m_col = (t << w) - t;

        if (!skip_inter) {
            int mask_id = (tx == TX_8X8);
            int l2 = tx + ss_h - 1, step1d;
            static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
            int m_row = m_col & masks[l2];

            // at odd UV col/row edges tx16/tx32 loopfilter edges, force
            // 8wd loopfilter to prevent going off the visible edge.
            if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
                int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
                int m_row_8 = m_row - m_row_16;

                for (y = row_and_7; y < h + row_and_7; y++) {
                    mask[0][y][0] |= m_row_16;
                    mask[0][y][1] |= m_row_8;
                }
            } else {
                for (y = row_and_7; y < h + row_and_7; y++)
                    mask[0][y][mask_id] |= m_row;
            }

            l2 = tx + ss_v - 1;
            step1d = 1 << l2;
            if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
                for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
                    mask[1][y][0] |= m_col;
                if (y - row_and_7 == h - 1)
                    mask[1][y][1] |= m_col;
            } else {
                for (y = row_and_7; y < h + row_and_7; y += step1d)
                    mask[1][y][mask_id] |= m_col;
            }
        } else if (tx != TX_4X4) {
            int mask_id;

            mask_id = (tx == TX_8X8) || (h == ss_v);
            mask[1][row_and_7][mask_id] |= m_col;
            mask_id = (tx == TX_8X8) || (w == ss_h);
            for (y = row_and_7; y < h + row_and_7; y++)
                mask[0][y][mask_id] |= t;
        } else {
            int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;

            for (y = row_and_7; y < h + row_and_7; y++) {
                mask[0][y][2] |= t4;
                mask[0][y][1] |= t8;
            }
            mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
        }
    }
}
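
/*
 * Decode one block: while s->pass < 2 the block's mode and coefficients are
 * read from the bitstream (pass 1 stops there and only advances the per-block
 * buffers); reconstruction (intra or inter), plus recording of the loopfilter
 * level and edge masks for the enclosing superblock, happens in pass 0 or 2.
 */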
void ff_vp9_decode_block(AVCodecContext *avctx, int row, int col,
                         VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
                         enum BlockLevel bl, enum BlockPartition bp)
{
    VP9Context *s = avctx->priv_data;
    VP9Block *b = s->b;
    enum BlockSize bs = bl * 3 + bp;
    int bytesperpixel = s->bytesperpixel;
    int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
    int emu[2];
    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;

    s->row = row;
    s->row7 = row & 7;
    s->col = col;
    s->col7 = col & 7;

    s->min_mv.x = -(128 + col * 64);
    s->min_mv.y = -(128 + row * 64);
    s->max_mv.x = 128 + (s->cols - col - w4) * 64;
    s->max_mv.y = 128 + (s->rows - row - h4) * 64;

    if (s->pass < 2) {
        b->bs = bs;
        b->bl = bl;
        b->bp = bp;
        decode_mode(avctx);
        b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
                           (s->ss_v && h4 * 2 == (1 << b->tx)));

        if (!b->skip) {
            int has_coeffs;

            if (bytesperpixel == 1) {
                has_coeffs = decode_coeffs_8bpp(avctx);
            } else {
                has_coeffs = decode_coeffs_16bpp(avctx);
            }
            if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
                b->skip = 1;
                memset(&s->above_skip_ctx[col], 1, w4);
                memset(&s->left_skip_ctx[s->row7], 1, h4);
            }
        } else {
            int row7 = s->row7;

#define SPLAT_ZERO_CTX(v, n) \
    switch (n) { \
    case 1: v = 0; break; \
    case 2: AV_ZERO16(&v); break; \
    case 4: AV_ZERO32(&v); break; \
    case 8: AV_ZERO64(&v); break; \
    case 16: AV_ZERO128(&v); break; \
    }
#define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
    do { \
        SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
        if (s->ss_##dir2) { \
            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
        } else { \
            SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off * 2], n * 2); \
            SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off * 2], n * 2); \
        } \
    } while (0)
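
            // Since the block is skipped there are no coefficients; clear the
            // above/left non-zero-coefficient contexts over the block's width
            // and height (the chroma contexts cover half as many units when
            // the corresponding direction is subsampled).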
            switch (w4) {
            case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1, h); break;
            case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2, h); break;
            case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4, h); break;
            case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8, h); break;
            }
            switch (h4) {
            case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1, v); break;
            case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2, v); break;
            case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4, v); break;
            case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8, v); break;
            }
        }

        if (s->pass == 1) {
            s->b++;
            s->block += w4 * h4 * 64 * bytesperpixel;
            s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
            s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
            s->eob += 4 * w4 * h4;
            s->uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
            s->uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
            return;
        }
    }

    // emulated overhangs if the stride of the target buffer can't hold. This
    // makes it possible to support emu-edge and so on even if we have large block
    // overhangs
    emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
             (row + h4) > s->rows;
    emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
             (row + h4) > s->rows;
    if (emu[0]) {
        s->dst[0] = s->tmp_y;
        s->y_stride = 128;
    } else {
        s->dst[0] = f->data[0] + yoff;
        s->y_stride = f->linesize[0];
    }
    if (emu[1]) {
        s->dst[1] = s->tmp_uv[0];
        s->dst[2] = s->tmp_uv[1];
        s->uv_stride = 128;
    } else {
        s->dst[1] = f->data[1] + uvoff;
        s->dst[2] = f->data[2] + uvoff;
        s->uv_stride = f->linesize[1];
    }
    if (b->intra) {
        if (s->s.h.bpp > 8) {
            intra_recon_16bpp(avctx, yoff, uvoff);
        } else {
            intra_recon_8bpp(avctx, yoff, uvoff);
        }
    } else {
        if (s->s.h.bpp > 8) {
            inter_recon_16bpp(avctx);
        } else {
            inter_recon_8bpp(avctx);
        }
    }
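
    // If reconstruction went to the temporary overhang buffers, copy the
    // visible part back into the frame in power-of-two-wide chunks, using the
    // MC functions with a zero subpel offset (i.e. plain copies).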
    if (emu[0]) {
        int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;

        for (n = 0; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
                                         s->tmp_y + o * bytesperpixel, 128, h, 0, 0);
                o += bw;
            }
        }
    }
    if (emu[1]) {
        int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
        int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;

        for (n = s->ss_h; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
                                         s->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
                s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
                                         s->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
                o += bw;
            }
        }
    }

    // pick filter level and find edges to apply filter to
    if (s->s.h.filter.level &&
        (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
                                                        [b->mode[3] != ZEROMV]) > 0) {
        int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
        int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;

        setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
        mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
        if (s->ss_h || s->ss_v)
            mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
                       s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
                       s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
                       b->uvtx, skip_inter);

        if (!s->filter_lut.lim_lut[lvl]) {
            int sharp = s->s.h.filter.sharpness;
            int limit = lvl;

            if (sharp > 0) {
                limit >>= (sharp + 3) >> 2;
                limit = FFMIN(limit, 9 - sharp);
            }
            limit = FFMAX(limit, 1);

            s->filter_lut.lim_lut[lvl] = limit;
            s->filter_lut.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
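            // e.g. with lvl == 32 and sharpness == 4: limit = min(32 >> 1, 9 - 4) = 5,
            // so lim_lut[32] = 5 and mblim_lut[32] = 2 * (32 + 2) + 5 = 73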
        }
    }

    if (s->pass == 2) {
        s->b++;
        s->block += w4 * h4 * 64 * bytesperpixel;
        s->uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
        s->uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
        s->eob += 4 * w4 * h4;
        s->uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
        s->uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
    }
}