on the Huffman tree, instead of traversing the tree in a while loop. Based on the similar optimization in libvpx's detokenize.c. 10% faster at normal bitrates, and 30% faster for high-bitrate intra-only. Originally committed as revision 24468 to svn://svn.ffmpeg.org/ffmpeg/trunk tags/n0.8
| @@ -226,6 +226,24 @@ static inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob) | |||||
| return bit; | return bit; | ||||
| } | } | ||||
| // branchy variant, to be used where there's a branch based on the bit decoded | |||||
| static av_always_inline int vp56_rac_get_prob_branchy(VP56RangeCoder *c, int prob) | |||||
| { | |||||
| unsigned long code_word = vp56_rac_renorm(c); | |||||
| unsigned low = 1 + (((c->high - 1) * prob) >> 8); | |||||
| unsigned low_shift = low << 8; | |||||
| if (code_word >= low_shift) { | |||||
| c->high -= low; | |||||
| c->code_word = code_word - low_shift; | |||||
| return 1; | |||||
| } | |||||
| c->high = low; | |||||
| c->code_word = code_word; | |||||
| return 0; | |||||
| } | |||||
| static inline int vp56_rac_get(VP56RangeCoder *c) | static inline int vp56_rac_get(VP56RangeCoder *c) | ||||
| { | { | ||||
| unsigned int code_word = vp56_rac_renorm(c); | unsigned int code_word = vp56_rac_renorm(c); | ||||
| @@ -800,36 +800,61 @@ static int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16], | |||||
| uint8_t probs[8][3][NUM_DCT_TOKENS-1], | uint8_t probs[8][3][NUM_DCT_TOKENS-1], | ||||
| int i, int zero_nhood, int16_t qmul[2]) | int i, int zero_nhood, int16_t qmul[2]) | ||||
| { | { | ||||
| int token, nonzero = 0; | |||||
| int offset = 0; | |||||
| uint8_t *token_prob; | |||||
| int nonzero = 0; | |||||
| int coeff; | |||||
| for (; i < 16; i++) { | |||||
| token = vp8_rac_get_tree_with_offset(c, vp8_coeff_tree, probs[vp8_coeff_band[i]][zero_nhood], offset); | |||||
| do { | |||||
| token_prob = probs[vp8_coeff_band[i]][zero_nhood]; | |||||
| if (token == DCT_EOB) | |||||
| break; | |||||
| else if (token >= DCT_CAT1) { | |||||
| int cat = token-DCT_CAT1; | |||||
| token = vp8_rac_get_coeff(c, vp8_dct_cat_prob[cat]); | |||||
| token += 3 + (2<<cat); | |||||
| } | |||||
| if (!vp56_rac_get_prob_branchy(c, token_prob[0])) // DCT_EOB | |||||
| return nonzero; | |||||
| // after the first token, the non-zero prediction context becomes | |||||
| // based on the last decoded coeff | |||||
| if (!token) { | |||||
| skip_eob: | |||||
| if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0 | |||||
| zero_nhood = 0; | zero_nhood = 0; | ||||
| offset = 1; | |||||
| continue; | |||||
| } else if (token == 1) | |||||
| token_prob = probs[vp8_coeff_band[++i]][0]; | |||||
| if (i < 16) | |||||
| goto skip_eob; | |||||
| return nonzero; // invalid input; blocks should end with EOB | |||||
| } | |||||
| if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1 | |||||
| coeff = 1; | |||||
| zero_nhood = 1; | zero_nhood = 1; | ||||
| else | |||||
| } else { | |||||
| zero_nhood = 2; | zero_nhood = 2; | ||||
| if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4 | |||||
| coeff = vp56_rac_get_prob(c, token_prob[4]); | |||||
| if (coeff) | |||||
| coeff += vp56_rac_get_prob(c, token_prob[5]); | |||||
| coeff += 2; | |||||
| } else { | |||||
| // DCT_CAT* | |||||
| if (!vp56_rac_get_prob_branchy(c, token_prob[6])) { | |||||
| if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1 | |||||
| coeff = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]); | |||||
| } else { // DCT_CAT2 | |||||
| coeff = 7; | |||||
| coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1; | |||||
| coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]); | |||||
| } | |||||
| } else { // DCT_CAT3 and up | |||||
| int a = vp56_rac_get_prob(c, token_prob[8]); | |||||
| int b = vp56_rac_get_prob(c, token_prob[9+a]); | |||||
| int cat = (a<<1) + b; | |||||
| coeff = 3 + (8<<cat); | |||||
| coeff += vp8_rac_get_coeff(c, vp8_dct_cat_prob[cat]); | |||||
| } | |||||
| } | |||||
| } | |||||
| // todo: full [16] qmat? load into register? | // todo: full [16] qmat? load into register? | ||||
| block[zigzag_scan[i]] = (vp8_rac_get(c) ? -token : token) * qmul[!!i]; | |||||
| nonzero = i+1; | |||||
| offset = 0; | |||||
| } | |||||
| block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i]; | |||||
| nonzero = ++i; | |||||
| } while (i < 16); | |||||
| return nonzero; | return nonzero; | ||||
| } | } | ||||
| @@ -329,21 +329,6 @@ static const uint8_t vp8_coeff_band[16] = | |||||
| 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7 | 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7 | ||||
| }; | }; | ||||
| static const int8_t vp8_coeff_tree[NUM_DCT_TOKENS-1][2] = | |||||
| { | |||||
| { -DCT_EOB, 1 }, // '0' | |||||
| { -DCT_0, 2 }, // '10' | |||||
| { -DCT_1, 3 }, // '110' | |||||
| { 4, 6 }, | |||||
| { -DCT_2, 5 }, // '11100' | |||||
| { -DCT_3, -DCT_4 }, // '111010', '111011' | |||||
| { 7, 8 }, | |||||
| { -DCT_CAT1, -DCT_CAT2 }, // '111100', '111101' | |||||
| { 9, 10 }, | |||||
| { -DCT_CAT3, -DCT_CAT4 }, // '1111100', '1111101' | |||||
| { -DCT_CAT5, -DCT_CAT6 }, // '1111110', '1111111' | |||||
| }; | |||||
| static const uint8_t vp8_dct_cat1_prob[] = { 159, 0 }; | static const uint8_t vp8_dct_cat1_prob[] = { 159, 0 }; | ||||
| static const uint8_t vp8_dct_cat2_prob[] = { 165, 145, 0 }; | static const uint8_t vp8_dct_cat2_prob[] = { 165, 145, 0 }; | ||||
| static const uint8_t vp8_dct_cat3_prob[] = { 173, 148, 140, 0 }; | static const uint8_t vp8_dct_cat3_prob[] = { 173, 148, 140, 0 }; | ||||
| @@ -351,10 +336,9 @@ static const uint8_t vp8_dct_cat4_prob[] = { 176, 155, 140, 135, 0 }; | |||||
| static const uint8_t vp8_dct_cat5_prob[] = { 180, 157, 141, 134, 130, 0 }; | static const uint8_t vp8_dct_cat5_prob[] = { 180, 157, 141, 134, 130, 0 }; | ||||
| static const uint8_t vp8_dct_cat6_prob[] = { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 }; | static const uint8_t vp8_dct_cat6_prob[] = { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 }; | ||||
| static const uint8_t * const vp8_dct_cat_prob[6] = | |||||
| // only used for cat3 and above; cat 1 and 2 are referenced directly | |||||
| static const uint8_t * const vp8_dct_cat_prob[] = | |||||
| { | { | ||||
| vp8_dct_cat1_prob, | |||||
| vp8_dct_cat2_prob, | |||||
| vp8_dct_cat3_prob, | vp8_dct_cat3_prob, | ||||
| vp8_dct_cat4_prob, | vp8_dct_cat4_prob, | ||||
| vp8_dct_cat5_prob, | vp8_dct_cat5_prob, | ||||