Merge remote-tracking branch 'qatar/master'

* qatar/master: dnxhddec: optimise dnxhd_decode_dct_block() rtp: remove disabled code eac3enc: use different numbers of blocks per frame to allow higher bitrates dnxhd: add regression test for 10-bit dnxhd: 10-bit support dsputil: update per-arch init funcs for non-h264 high bit depth dsputil: template get_pixels() for different bit depths dsputil: create 16/32-bit dctcoef versions of some functions jfdctint: add 10-bit version mov: add clcp type track as Subtitle stream. mpeg4: add Mpeg4 Profiles names. mpeg4: decode Level Profile for MPEG4 Part 2. ffprobe: display bitstream level. imgconvert: remove unused glue and xglue macros Conflicts: libavcodec/dsputil_template.c Merged-by: Michael Niedermayer <michaelni@gmx.at>
14 years ago · 4095fa9038
--- a/ffprobe.c
+++ b/ffprobe.c
@@ -202,6 +202,7 @@ static void show_stream(AVFormatContext *fmt_ctx, int stream_idx)
            }
            printf("pix_fmt=%s\n",                 dec_ctx->pix_fmt != PIX_FMT_NONE ?
                   av_pix_fmt_descriptors[dec_ctx->pix_fmt].name : "unknown");
            printf("level=%d\n",                   dec_ctx->level);
            break;

        case AVMEDIA_TYPE_AUDIO:
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -186,7 +186,7 @@ void ff_ac3_adjust_frame_size(AC3EncodeContext *s)
    s->frame_size = s->frame_size_min +
                    2 * (s->bits_written * s->sample_rate < s->samples_written * s->bit_rate);
    s->bits_written    += s->frame_size * 8;
    s->samples_written += AC3_FRAME_SIZE;
    s->samples_written += AC3_BLOCK_SIZE * s->num_blocks;
 }


@@ -198,7 +198,7 @@ void ff_ac3_compute_coupling_strategy(AC3EncodeContext *s)

    /* set coupling use flags for each block/channel */
    /* TODO: turn coupling on/off and adjust start band based on bit usage */
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        for (ch = 1; ch <= s->fbw_channels; ch++)
            block->channel_in_cpl[ch] = s->cpl_on;
@@ -208,7 +208,7 @@ void ff_ac3_compute_coupling_strategy(AC3EncodeContext *s)
       enabled for that block */
    got_cpl_snr = 0;
    num_cpl_blocks = 0;
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        block->num_cpl_channels = 0;
        for (ch = 1; ch <= s->fbw_channels; ch++)
@@ -244,7 +244,7 @@ void ff_ac3_compute_coupling_strategy(AC3EncodeContext *s)
        s->cpl_on = 0;

    /* set bandwidth for each channel */
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        for (ch = 1; ch <= s->fbw_channels; ch++) {
            if (block->channel_in_cpl[ch])
@@ -269,7 +269,7 @@ void ff_ac3_apply_rematrixing(AC3EncodeContext *s)
    if (!s->rematrixing_enabled)
        return;

    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        if (block->new_rematrixing_strategy)
            flags = block->rematrixing_flags;
@@ -318,7 +318,7 @@ static av_cold void exponent_init(AC3EncodeContext *s)
 static void extract_exponents(AC3EncodeContext *s)
 {
    int ch        = !s->cpl_on;
    int chan_size = AC3_MAX_COEFS * AC3_MAX_BLOCKS * (s->channels - ch + 1);
    int chan_size = AC3_MAX_COEFS * s->num_blocks * (s->channels - ch + 1);
    AC3Block *block = &s->blocks[0];

    s->ac3dsp.extract_exponents(block->exp[ch], block->fixed_coef[ch], chan_size);
@@ -331,6 +331,15 @@ static void extract_exponents(AC3EncodeContext *s)
 */
 #define EXP_DIFF_THRESHOLD 500

 /**
 * Table used to select exponent strategy based on exponent reuse block interval.
 */
 static const uint8_t exp_strategy_reuse_tab[4][6] = {
    { EXP_D15, EXP_D15, EXP_D15, EXP_D15, EXP_D15, EXP_D15 },
    { EXP_D15, EXP_D15, EXP_D15, EXP_D15, EXP_D15, EXP_D15 },
    { EXP_D25, EXP_D25, EXP_D15, EXP_D15, EXP_D15, EXP_D15 },
    { EXP_D45, EXP_D25, EXP_D25, EXP_D15, EXP_D15, EXP_D15 }
 };

 /**
 * Calculate exponent strategies for all channels.
@@ -349,7 +358,7 @@ static void compute_exp_strategy(AC3EncodeContext *s)
           reused in the next frame */
        exp_strategy[0] = EXP_NEW;
        exp += AC3_MAX_COEFS;
        for (blk = 1; blk < AC3_MAX_BLOCKS; blk++, exp += AC3_MAX_COEFS) {
        for (blk = 1; blk < s->num_blocks; blk++, exp += AC3_MAX_COEFS) {
            if (ch == CPL_CH) {
                if (!s->blocks[blk-1].cpl_in_use) {
                    exp_strategy[blk] = EXP_NEW;
@@ -373,23 +382,18 @@ static void compute_exp_strategy(AC3EncodeContext *s)
        /* now select the encoding strategy type : if exponents are often
           recoded, we use a coarse encoding */
        blk = 0;
        while (blk < AC3_MAX_BLOCKS) {
        while (blk < s->num_blocks) {
            blk1 = blk + 1;
            while (blk1 < AC3_MAX_BLOCKS && exp_strategy[blk1] == EXP_REUSE)
            while (blk1 < s->num_blocks && exp_strategy[blk1] == EXP_REUSE)
                blk1++;
            switch (blk1 - blk) {
            case 1:  exp_strategy[blk] = EXP_D45; break;
            case 2:
            case 3:  exp_strategy[blk] = EXP_D25; break;
            default: exp_strategy[blk] = EXP_D15; break;
            }
            exp_strategy[blk] = exp_strategy_reuse_tab[s->num_blks_code][blk1-blk-1];
            blk = blk1;
        }
    }
    if (s->lfe_on) {
        ch = s->lfe_channel;
        s->exp_strategy[ch][0] = EXP_D15;
        for (blk = 1; blk < AC3_MAX_BLOCKS; blk++)
        for (blk = 1; blk < s->num_blocks; blk++)
            s->exp_strategy[ch][blk] = EXP_REUSE;
    }

@@ -487,7 +491,7 @@ static void encode_exponents(AC3EncodeContext *s)

        cpl = (ch == CPL_CH);
        blk = 0;
        while (blk < AC3_MAX_BLOCKS) {
        while (blk < s->num_blocks) {
            AC3Block *block = &s->blocks[blk];
            if (cpl && !block->cpl_in_use) {
                exp += AC3_MAX_COEFS;
@@ -500,7 +504,7 @@ static void encode_exponents(AC3EncodeContext *s)
            /* count the number of EXP_REUSE blocks after the current block
               and set exponent reference block numbers */
            s->exp_ref_block[ch][blk] = blk;
            while (blk1 < AC3_MAX_BLOCKS && exp_strategy[blk1] == EXP_REUSE) {
            while (blk1 < s->num_blocks && exp_strategy[blk1] == EXP_REUSE) {
                s->exp_ref_block[ch][blk1] = blk;
                blk1++;
            }
@@ -536,7 +540,7 @@ static void group_exponents(AC3EncodeContext *s)
    int exp0, exp1;

    bit_count = 0;
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        for (ch = !block->cpl_in_use; ch <= s->channels; ch++) {
            int exp_strategy = s->exp_strategy[ch][blk];
@@ -625,30 +629,38 @@ static void count_frame_bits_fixed(AC3EncodeContext *s)
    if (s->eac3) {
        /* bitstream info header */
        frame_bits += 35;
        frame_bits += 1 + 1 + 1;
        frame_bits += 1 + 1;
        if (s->num_blocks != 0x6)
            frame_bits++;
        frame_bits++;
        /* audio frame header */
        frame_bits += 2;
        if (s->num_blocks == 6)
            frame_bits += 2;
        frame_bits += 10;
        /* exponent strategy */
        if (s->use_frame_exp_strategy)
            frame_bits += 5 * s->fbw_channels;
        else
            frame_bits += AC3_MAX_BLOCKS * 2 * s->fbw_channels;
            frame_bits += s->num_blocks * 2 * s->fbw_channels;
        if (s->lfe_on)
            frame_bits += AC3_MAX_BLOCKS;
            frame_bits += s->num_blocks;
        /* converter exponent strategy */
        frame_bits += s->fbw_channels * 5;
        if (s->num_blks_code != 0x3)
            frame_bits++;
        else
            frame_bits += s->fbw_channels * 5;
        /* snr offsets */
        frame_bits += 10;
        /* block start info */
        frame_bits++;
        if (s->num_blocks != 1)
            frame_bits++;
    } else {
        frame_bits += 49;
        frame_bits += frame_bits_inc[s->channel_mode];
    }

    /* audio blocks */
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        if (!s->eac3) {
            /* block switch flags */
            frame_bits += s->fbw_channels;
@@ -750,7 +762,7 @@ static void count_frame_bits(AC3EncodeContext *s)
        /* coupling */
        if (s->channel_mode > AC3_CHMODE_MONO) {
            frame_bits++;
            for (blk = 1; blk < AC3_MAX_BLOCKS; blk++) {
            for (blk = 1; blk < s->num_blocks; blk++) {
                AC3Block *block = &s->blocks[blk];
                frame_bits++;
                if (block->new_cpl_strategy)
@@ -762,7 +774,7 @@ static void count_frame_bits(AC3EncodeContext *s)
            if (s->use_frame_exp_strategy) {
                frame_bits += 5 * s->cpl_on;
            } else {
                for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
                for (blk = 0; blk < s->num_blocks; blk++)
                    frame_bits += 2 * s->blocks[blk].cpl_in_use;
            }
        }
@@ -778,7 +790,7 @@ static void count_frame_bits(AC3EncodeContext *s)
    }

    /* audio blocks */
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];

        /* coupling strategy */
@@ -865,7 +877,7 @@ static void bit_alloc_masking(AC3EncodeContext *s)
 {
    int blk, ch;

    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        for (ch = !block->cpl_in_use; ch <= s->channels; ch++) {
            /* We only need psd and mask for calculating bap.
@@ -901,9 +913,9 @@ static void reset_block_bap(AC3EncodeContext *s)

    ref_bap = s->bap_buffer;
    for (ch = 0; ch <= s->channels; ch++) {
        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
        for (blk = 0; blk < s->num_blocks; blk++)
            s->ref_bap[ch][blk] = ref_bap + AC3_MAX_COEFS * s->exp_ref_block[ch][blk];
        ref_bap += AC3_MAX_COEFS * AC3_MAX_BLOCKS;
        ref_bap += AC3_MAX_COEFS * s->num_blocks;
    }
    s->ref_bap_set = 1;
 }
@@ -936,7 +948,7 @@ static void count_mantissa_bits_update_ch(AC3EncodeContext *s, int ch,
 {
    int blk;

    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        if (ch == CPL_CH && !block->cpl_in_use)
            continue;
@@ -980,7 +992,7 @@ static int bit_alloc(AC3EncodeContext *s, int snr_offset)
    snr_offset = (snr_offset - 240) << 2;

    reset_block_bap(s);
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];

        for (ch = !block->cpl_in_use; ch <= s->channels; ch++) {
@@ -1194,7 +1206,7 @@ void ff_ac3_quantize_mantissas(AC3EncodeContext *s)
 {
    int blk, ch, ch0=0, got_cpl;

    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        AC3Mant m = { 0 };

@@ -1557,7 +1569,7 @@ void ff_ac3_output_frame(AC3EncodeContext *s, unsigned char *frame)

    s->output_frame_header(s);

    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
    for (blk = 0; blk < s->num_blocks; blk++)
        output_audio_block(s, blk);

    output_frame_end(s);
@@ -1585,6 +1597,7 @@ static void dprint_options(AC3EncodeContext *s)
    av_dlog(avctx, "channel_layout: %s\n", strbuf);
    av_dlog(avctx, "sample_rate: %d\n", s->sample_rate);
    av_dlog(avctx, "bit_rate: %d\n", s->bit_rate);
    av_dlog(avctx, "blocks/frame: %d (code=%d)\n", s->num_blocks, s->num_blks_code);
    if (s->cutoff)
        av_dlog(avctx, "cutoff: %d\n", s->cutoff);

@@ -1851,7 +1864,7 @@ av_cold int ff_ac3_encode_close(AVCodecContext *avctx)
    av_freep(&s->qmant_buffer);
    av_freep(&s->cpl_coord_exp_buffer);
    av_freep(&s->cpl_coord_mant_buffer);
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        av_freep(&block->mdct_coef);
        av_freep(&block->fixed_coef);
@@ -1958,18 +1971,30 @@ static av_cold int validate_options(AC3EncodeContext *s)
    /* validate bit rate */
    if (s->eac3) {
        int max_br, min_br, wpf, min_br_dist, min_br_code;
        int num_blks_code, num_blocks, frame_samples;

        /* calculate min/max bitrate */
        max_br = 2048 * s->sample_rate / AC3_FRAME_SIZE * 16;
        min_br = ((s->sample_rate + (AC3_FRAME_SIZE-1)) / AC3_FRAME_SIZE) * 16;
        /* TODO: More testing with 3 and 2 blocks. All E-AC-3 samples I've
                 found use either 6 blocks or 1 block, even though 2 or 3 blocks
                 would work as far as the bit rate is concerned. */
        for (num_blks_code = 3; num_blks_code >= 0; num_blks_code--) {
            num_blocks = ((int[]){ 1, 2, 3, 6 })[num_blks_code];
            frame_samples  = AC3_BLOCK_SIZE * num_blocks;
            max_br = 2048 * s->sample_rate / frame_samples * 16;
            min_br = ((s->sample_rate + (frame_samples-1)) / frame_samples) * 16;
            if (avctx->bit_rate <= max_br)
                break;
        }
        if (avctx->bit_rate < min_br || avctx->bit_rate > max_br) {
            av_log(avctx, AV_LOG_ERROR, "invalid bit rate. must be %d to %d "
                   "for this sample rate\n", min_br, max_br);
            return AVERROR(EINVAL);
        }
        s->num_blks_code = num_blks_code;
        s->num_blocks    = num_blocks;

        /* calculate words-per-frame for the selected bitrate */
        wpf = (avctx->bit_rate / 16) * AC3_FRAME_SIZE / s->sample_rate;
        wpf = (avctx->bit_rate / 16) * frame_samples / s->sample_rate;
        av_assert1(wpf > 0 && wpf <= 2048);

        /* find the closest AC-3 bitrate code to the selected bitrate.
@@ -2001,6 +2026,8 @@ static av_cold int validate_options(AC3EncodeContext *s)
        }
        s->frame_size_code = i << 1;
        s->frame_size_min  = 2 * ff_ac3_frame_size_tab[s->frame_size_code][s->bit_alloc.sr_code];
        s->num_blks_code   = 0x3;
        s->num_blocks      = 6;
    }
    s->bit_rate   = avctx->bit_rate;
    s->frame_size = s->frame_size_min;
@@ -2065,13 +2092,13 @@ static av_cold void set_bandwidth(AC3EncodeContext *s)
    /* set number of coefficients for each channel */
    for (ch = 1; ch <= s->fbw_channels; ch++) {
        s->start_freq[ch] = 0;
        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
        for (blk = 0; blk < s->num_blocks; blk++)
            s->blocks[blk].end_freq[ch] = s->bandwidth_code * 3 + 73;
    }
    /* LFE channel always has 7 coefs */
    if (s->lfe_on) {
        s->start_freq[s->lfe_channel] = 0;
        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
        for (blk = 0; blk < s->num_blocks; blk++)
            s->blocks[blk].end_freq[ch] = 7;
    }

@@ -2108,7 +2135,7 @@ static av_cold void set_bandwidth(AC3EncodeContext *s)

        s->start_freq[CPL_CH] = cpl_start_band * 12 + 37;
        s->cpl_end_freq       = cpl_end_band   * 12 + 37;
        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
        for (blk = 0; blk < s->num_blocks; blk++)
            s->blocks[blk].end_freq[CPL_CH] = s->cpl_end_freq;
    }
 }
@@ -2119,35 +2146,37 @@ static av_cold int allocate_buffers(AC3EncodeContext *s)
    AVCodecContext *avctx = s->avctx;
    int blk, ch;
    int channels = s->channels + 1; /* includes coupling channel */
    int channel_blocks = channels * s->num_blocks;
    int total_coefs    = AC3_MAX_COEFS * channel_blocks;

    if (s->allocate_sample_buffers(s))
        goto alloc_fail;

    FF_ALLOC_OR_GOTO(avctx, s->bap_buffer,  AC3_MAX_BLOCKS * channels *
                     AC3_MAX_COEFS * sizeof(*s->bap_buffer),  alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->bap1_buffer, AC3_MAX_BLOCKS * channels *
                     AC3_MAX_COEFS * sizeof(*s->bap1_buffer), alloc_fail);
    FF_ALLOCZ_OR_GOTO(avctx, s->mdct_coef_buffer, AC3_MAX_BLOCKS * channels *
                      AC3_MAX_COEFS * sizeof(*s->mdct_coef_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->exp_buffer, AC3_MAX_BLOCKS * channels *
                     AC3_MAX_COEFS * sizeof(*s->exp_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->grouped_exp_buffer, AC3_MAX_BLOCKS * channels *
                     128 * sizeof(*s->grouped_exp_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->psd_buffer, AC3_MAX_BLOCKS * channels *
                     AC3_MAX_COEFS * sizeof(*s->psd_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->band_psd_buffer, AC3_MAX_BLOCKS * channels *
                     64 * sizeof(*s->band_psd_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->mask_buffer, AC3_MAX_BLOCKS * channels *
                     64 * sizeof(*s->mask_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->qmant_buffer, AC3_MAX_BLOCKS * channels *
                     AC3_MAX_COEFS * sizeof(*s->qmant_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->bap_buffer, total_coefs *
                     sizeof(*s->bap_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->bap1_buffer, total_coefs *
                     sizeof(*s->bap1_buffer), alloc_fail);
    FF_ALLOCZ_OR_GOTO(avctx, s->mdct_coef_buffer, total_coefs *
                      sizeof(*s->mdct_coef_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->exp_buffer, total_coefs *
                     sizeof(*s->exp_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->grouped_exp_buffer, channel_blocks * 128 *
                     sizeof(*s->grouped_exp_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->psd_buffer, total_coefs *
                     sizeof(*s->psd_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->band_psd_buffer, channel_blocks * 64 *
                     sizeof(*s->band_psd_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->mask_buffer, channel_blocks * 64 *
                     sizeof(*s->mask_buffer), alloc_fail);
    FF_ALLOC_OR_GOTO(avctx, s->qmant_buffer, total_coefs *
                     sizeof(*s->qmant_buffer), alloc_fail);
    if (s->cpl_enabled) {
        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_exp_buffer, AC3_MAX_BLOCKS * channels *
                         16 * sizeof(*s->cpl_coord_exp_buffer), alloc_fail);
        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_mant_buffer, AC3_MAX_BLOCKS * channels *
                         16 * sizeof(*s->cpl_coord_mant_buffer), alloc_fail);
        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_exp_buffer, channel_blocks * 16 *
                         sizeof(*s->cpl_coord_exp_buffer), alloc_fail);
        FF_ALLOC_OR_GOTO(avctx, s->cpl_coord_mant_buffer, channel_blocks * 16 *
                         sizeof(*s->cpl_coord_mant_buffer), alloc_fail);
    }
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        FF_ALLOCZ_OR_GOTO(avctx, block->mdct_coef, channels * sizeof(*block->mdct_coef),
                          alloc_fail);
@@ -2183,23 +2212,23 @@ static av_cold int allocate_buffers(AC3EncodeContext *s)
            }

            /* arrangement: channel, block, coeff */
            block->exp[ch]         = &s->exp_buffer        [AC3_MAX_COEFS * (AC3_MAX_BLOCKS * ch + blk)];
            block->mdct_coef[ch]   = &s->mdct_coef_buffer  [AC3_MAX_COEFS * (AC3_MAX_BLOCKS * ch + blk)];
            block->exp[ch]         = &s->exp_buffer        [AC3_MAX_COEFS * (s->num_blocks * ch + blk)];
            block->mdct_coef[ch]   = &s->mdct_coef_buffer  [AC3_MAX_COEFS * (s->num_blocks * ch + blk)];
        }
    }

    if (!s->fixed_point) {
        FF_ALLOCZ_OR_GOTO(avctx, s->fixed_coef_buffer, AC3_MAX_BLOCKS * channels *
                          AC3_MAX_COEFS * sizeof(*s->fixed_coef_buffer), alloc_fail);
        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
        FF_ALLOCZ_OR_GOTO(avctx, s->fixed_coef_buffer, total_coefs *
                          sizeof(*s->fixed_coef_buffer), alloc_fail);
        for (blk = 0; blk < s->num_blocks; blk++) {
            AC3Block *block = &s->blocks[blk];
            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
                              sizeof(*block->fixed_coef), alloc_fail);
            for (ch = 0; ch < channels; ch++)
                block->fixed_coef[ch] = &s->fixed_coef_buffer[AC3_MAX_COEFS * (AC3_MAX_BLOCKS * ch + blk)];
                block->fixed_coef[ch] = &s->fixed_coef_buffer[AC3_MAX_COEFS * (s->num_blocks * ch + blk)];
        }
    } else {
        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
        for (blk = 0; blk < s->num_blocks; blk++) {
            AC3Block *block = &s->blocks[blk];
            FF_ALLOCZ_OR_GOTO(avctx, block->fixed_coef, channels *
                              sizeof(*block->fixed_coef), alloc_fail);
@@ -2226,14 +2255,14 @@ av_cold int ff_ac3_encode_init(AVCodecContext *avctx)

    s->eac3 = avctx->codec_id == CODEC_ID_EAC3;

    avctx->frame_size = AC3_FRAME_SIZE;

    ff_ac3_common_init();

    ret = validate_options(s);
    if (ret)
        return ret;

    avctx->frame_size = AC3_BLOCK_SIZE * s->num_blocks;

    s->bitstream_mode = avctx->audio_service_type;
    if (s->bitstream_mode == AV_AUDIO_SERVICE_TYPE_KARAOKE)
        s->bitstream_mode = 0x7;
--- a/libavcodec/ac3enc.h
+++ b/libavcodec/ac3enc.h
@@ -152,6 +152,8 @@ typedef struct AC3EncodeContext {
    int bit_rate;                           ///< target bit rate, in bits-per-second
    int sample_rate;                        ///< sampling frequency, in Hz

    int num_blks_code;                      ///< number of blocks code                  (numblkscod)
    int num_blocks;                         ///< number of blocks per frame
    int frame_size_min;                     ///< minimum frame size in case rounding is necessary
    int frame_size;                         ///< current frame size in bytes
    int frame_size_code;                    ///< frame size code                        (frmsizecod)
--- a/libavcodec/ac3enc_fixed.c
+++ b/libavcodec/ac3enc_fixed.c
@@ -93,7 +93,7 @@ static void scale_coefficients(AC3EncodeContext *s)
 {
    int blk, ch;

    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        for (ch = 1; ch <= s->channels; ch++) {
            s->ac3dsp.ac3_rshift_int32(block->mdct_coef[ch], AC3_MAX_COEFS,
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -103,7 +103,7 @@ static int normalize_samples(AC3EncodeContext *s)
 */
 static void scale_coefficients(AC3EncodeContext *s)
 {
    int chan_size = AC3_MAX_COEFS * AC3_MAX_BLOCKS;
    int chan_size = AC3_MAX_COEFS * s->num_blocks;
    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer + chan_size,
                               s->mdct_coef_buffer  + chan_size,
                               chan_size * s->channels);
--- a/libavcodec/ac3enc_template.c
+++ b/libavcodec/ac3enc_template.c
@@ -79,13 +79,13 @@ static void deinterleave_input_samples(AC3EncodeContext *s,
        int sinc;

        /* copy last 256 samples of previous frame to the start of the current frame */
        memcpy(&s->planar_samples[ch][0], &s->planar_samples[ch][AC3_FRAME_SIZE],
        memcpy(&s->planar_samples[ch][0], &s->planar_samples[ch][AC3_BLOCK_SIZE * s->num_blocks],
               AC3_BLOCK_SIZE * sizeof(s->planar_samples[0][0]));

        /* deinterleave */
        sinc = s->channels;
        sptr = samples + s->channel_map[ch];
        for (i = AC3_BLOCK_SIZE; i < AC3_FRAME_SIZE+AC3_BLOCK_SIZE; i++) {
        for (i = AC3_BLOCK_SIZE; i < AC3_BLOCK_SIZE * (s->num_blocks + 1); i++) {
            s->planar_samples[ch][i] = *sptr;
            sptr += sinc;
        }
@@ -103,7 +103,7 @@ static void apply_mdct(AC3EncodeContext *s)
    int blk, ch;

    for (ch = 0; ch < s->channels; ch++) {
        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
        for (blk = 0; blk < s->num_blocks; blk++) {
            AC3Block *block = &s->blocks[blk];
            const SampleType *input_samples = &s->planar_samples[ch][blk * AC3_BLOCK_SIZE];

@@ -159,7 +159,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
    cpl_start     = FFMIN(256, cpl_start + num_cpl_coefs) - num_cpl_coefs;

    /* calculate coupling channel from fbw channels */
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        CoefType *cpl_coef = &block->mdct_coef[CPL_CH][cpl_start];
        if (!block->cpl_in_use)
@@ -188,7 +188,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
    while (i < s->cpl_end_freq) {
        int band_size = s->cpl_band_sizes[bnd];
        for (ch = CPL_CH; ch <= s->fbw_channels; ch++) {
            for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
            for (blk = 0; blk < s->num_blocks; blk++) {
                AC3Block *block = &s->blocks[blk];
                if (!block->cpl_in_use || (ch > CPL_CH && !block->channel_in_cpl[ch]))
                    continue;
@@ -203,7 +203,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
    }

    /* determine which blocks to send new coupling coordinates for */
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block  = &s->blocks[blk];
        AC3Block *block0 = blk ? &s->blocks[blk-1] : NULL;
        int new_coords = 0;
@@ -261,7 +261,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
       coordinates in successive blocks */
    for (bnd = 0; bnd < s->num_cpl_bands; bnd++) {
        blk = 0;
        while (blk < AC3_MAX_BLOCKS) {
        while (blk < s->num_blocks) {
            int blk1;
            CoefSumType energy_cpl;
            AC3Block *block  = &s->blocks[blk];
@@ -273,7 +273,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)

            energy_cpl = energy[blk][CPL_CH][bnd];
            blk1 = blk+1;
            while (!s->blocks[blk1].new_cpl_coords && blk1 < AC3_MAX_BLOCKS) {
            while (!s->blocks[blk1].new_cpl_coords && blk1 < s->num_blocks) {
                if (s->blocks[blk1].cpl_in_use)
                    energy_cpl += energy[blk1][CPL_CH][bnd];
                blk1++;
@@ -285,7 +285,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
                    continue;
                energy_ch = energy[blk][ch][bnd];
                blk1 = blk+1;
                while (!s->blocks[blk1].new_cpl_coords && blk1 < AC3_MAX_BLOCKS) {
                while (!s->blocks[blk1].new_cpl_coords && blk1 < s->num_blocks) {
                    if (s->blocks[blk1].cpl_in_use)
                        energy_ch += energy[blk1][ch][bnd];
                    blk1++;
@@ -297,7 +297,7 @@ static void apply_channel_coupling(AC3EncodeContext *s)
    }

    /* calculate exponents/mantissas for coupling coordinates */
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        if (!block->cpl_in_use || !block->new_cpl_coords)
            continue;
@@ -362,7 +362,7 @@ static void compute_rematrixing_strategy(AC3EncodeContext *s)
    if (s->channel_mode != AC3_CHMODE_STEREO)
        return;

    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        block = &s->blocks[blk];
        block->new_rematrixing_strategy = !blk;

@@ -440,7 +440,7 @@ int AC3_NAME(encode_frame)(AVCodecContext *avctx, unsigned char *frame,
        scale_coefficients(s);

    clip_coefficients(&s->dsp, s->blocks[0].mdct_coef[1],
                      AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
                      AC3_MAX_COEFS * s->num_blocks * s->channels);

    s->cpl_on = s->cpl_enabled;
    ff_ac3_compute_coupling_strategy(s);
--- a/libavcodec/alpha/dsputil_alpha.c
+++ b/libavcodec/alpha/dsputil_alpha.c
@@ -270,7 +270,7 @@ static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,

 void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx)
 {
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!high_bit_depth) {
    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
@@ -321,7 +321,8 @@ void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx)
        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;

        c->get_pixels       = get_pixels_mvi;
        if (!high_bit_depth)
            c->get_pixels   = get_pixels_mvi;
        c->diff_pixels      = diff_pixels_mvi;
        c->sad[0]           = pix_abs16x16_mvi_asm;
        c->sad[1]           = pix_abs8x8_mvi;
--- a/libavcodec/arm/dsputil_init_arm.c
+++ b/libavcodec/arm/dsputil_init_arm.c
@@ -75,7 +75,7 @@ static void simple_idct_arm_add(uint8_t *dest, int line_size, DCTELEM *block)

 void dsputil_init_arm(DSPContext* c, AVCodecContext *avctx)
 {
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    ff_put_pixels_clamped = c->put_pixels_clamped;
    ff_add_pixels_clamped = c->add_pixels_clamped;
--- a/libavcodec/arm/dsputil_init_armv6.c
+++ b/libavcodec/arm/dsputil_init_armv6.c
@@ -72,7 +72,7 @@ int ff_pix_sum_armv6(uint8_t *pix, int line_size);

 void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx)
 {
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!avctx->lowres && avctx->bits_per_raw_sample <= 8 &&
        (avctx->idct_algo == FF_IDCT_AUTO ||
@@ -106,8 +106,9 @@ void av_cold ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx)
    c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
    }

    if (!high_bit_depth)
        c->get_pixels = ff_get_pixels_armv6;
    c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
    c->get_pixels = ff_get_pixels_armv6;
    c->diff_pixels = ff_diff_pixels_armv6;

    c->pix_abs[0][0] = ff_pix_abs16_armv6;
--- a/libavcodec/arm/dsputil_init_neon.c
+++ b/libavcodec/arm/dsputil_init_neon.c
@@ -175,7 +175,7 @@ void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,

 void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
 {
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!avctx->lowres && avctx->bits_per_raw_sample <= 8) {
        if (avctx->idct_algo == FF_IDCT_AUTO ||
--- a/libavcodec/arm/dsputil_iwmmxt.c
+++ b/libavcodec/arm/dsputil_iwmmxt.c
@@ -155,7 +155,7 @@ static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 void ff_dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
 {
    int mm_flags = AV_CPU_FLAG_IWMMXT; /* multimedia extension flags */
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -2278,6 +2278,23 @@ typedef struct AVCodecContext {
 #define FF_PROFILE_VC1_COMPLEX  2
 #define FF_PROFILE_VC1_ADVANCED 3

 #define FF_PROFILE_MPEG4_SIMPLE                     0
 #define FF_PROFILE_MPEG4_SIMPLE_SCALABLE            1
 #define FF_PROFILE_MPEG4_CORE                       2
 #define FF_PROFILE_MPEG4_MAIN                       3
 #define FF_PROFILE_MPEG4_N_BIT                      4
 #define FF_PROFILE_MPEG4_SCALABLE_TEXTURE           5
 #define FF_PROFILE_MPEG4_SIMPLE_FACE_ANIMATION      6
 #define FF_PROFILE_MPEG4_BASIC_ANIMATED_TEXTURE     7
 #define FF_PROFILE_MPEG4_HYBRID                     8
 #define FF_PROFILE_MPEG4_ADVANCED_REAL_TIME         9
 #define FF_PROFILE_MPEG4_CORE_SCALABLE             10
 #define FF_PROFILE_MPEG4_ADVANCED_CODING           11
 #define FF_PROFILE_MPEG4_ADVANCED_CORE             12
 #define FF_PROFILE_MPEG4_ADVANCED_SCALABLE_TEXTURE 13
 #define FF_PROFILE_MPEG4_SIMPLE_STUDIO             14
 #define FF_PROFILE_MPEG4_ADVANCED_SIMPLE           15

    /**
     * level
     * - encoding: Set by user.
--- a/libavcodec/bfin/dsputil_bfin.c
+++ b/libavcodec/bfin/dsputil_bfin.c
@@ -197,14 +197,14 @@ static int bfin_pix_abs8_xy2 (void *c, uint8_t *blk1, uint8_t *blk2, int line_si

 void dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx )
 {
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    c->get_pixels         = ff_bfin_get_pixels;
    c->diff_pixels        = ff_bfin_diff_pixels;
    c->put_pixels_clamped = ff_bfin_put_pixels_clamped;
    c->add_pixels_clamped = ff_bfin_add_pixels_clamped;

    if (!high_bit_depth)
    c->get_pixels         = ff_bfin_get_pixels;
    c->clear_blocks       = bfin_clear_blocks;
    c->pix_sum            = ff_bfin_pix_sum;
    c->pix_norm1          = ff_bfin_pix_norm1;
@@ -253,10 +253,10 @@ void dsputil_init_bfin( DSPContext* c, AVCodecContext *avctx )
 /*     c->put_no_rnd_pixels_tab[0][3] = ff_bfin_put_pixels16_xy2_nornd; */
    }

    if (avctx->dct_algo == FF_DCT_AUTO)
        c->fdct               = ff_bfin_fdct;

    if (avctx->bits_per_raw_sample <= 8) {
        if (avctx->dct_algo == FF_DCT_AUTO)
            c->fdct                  = ff_bfin_fdct;

        if (avctx->idct_algo == FF_IDCT_VP3) {
            c->idct_permutation_type = FF_NO_IDCT_PERM;
            c->idct                  = ff_bfin_vp3_idct;
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -88,7 +88,7 @@ static const struct algo fdct_tab[] = {
    { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
    { "FAAN",           ff_faandct,            FAAN_SCALE },
    { "IJG-AAN-INT",    fdct_ifast,            SCALE_PERM },
    { "IJG-LLM-INT",    ff_jpeg_fdct_islow,    NO_PERM    },
    { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },

 #if HAVE_MMX
    { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
--- a/libavcodec/dnxhddata.c
+++ b/libavcodec/dnxhddata.c
@@ -22,6 +22,28 @@
 #include "avcodec.h"
 #include "dnxhddata.h"

 static const uint8_t dnxhd_1235_luma_weight[] = {
     0, 32, 32, 32, 33, 35, 38, 39,
    32, 33, 32, 33, 36, 36, 39, 42,
    32, 32, 33, 36, 35, 37, 41, 43,
    31, 33, 34, 36, 36, 40, 42, 48,
    32, 34, 36, 37, 39, 42, 46, 51,
    36, 37, 37, 39, 41, 46, 51, 55,
    37, 39, 41, 41, 47, 50, 55, 56,
    41, 42, 41, 44, 50, 53, 60, 60
 };

 static const uint8_t dnxhd_1235_chroma_weight[] = {
     0, 32, 33, 34, 39, 41, 54, 59,
    33, 34, 35, 38, 43, 49, 58, 84,
    34, 37, 39, 44, 46, 55, 74, 87,
    40, 42, 47, 48, 58, 70, 87, 86,
    43, 50, 56, 63, 72, 94, 91, 82,
    55, 63, 65, 75, 93, 89, 85, 73,
    61, 67, 82, 81, 83, 90, 79, 73,
    74, 84, 75, 78, 90, 85, 73, 73
 };

 static const uint8_t dnxhd_1237_luma_weight[] = {
     0,  32,  33,  34, 34, 36, 37, 36,
    36,  37,  38,  38, 38, 39, 41, 44,
@@ -132,6 +154,28 @@ static const uint8_t dnxhd_1243_chroma_weight[] = {
    46, 45, 46, 47, 47, 48, 47, 47,
 };

 static const uint8_t dnxhd_1250_luma_weight[] = {
     0, 32, 35, 35, 36, 36, 41, 43,
    32, 34, 35, 36, 37, 39, 43, 47,
    33, 34, 36, 38, 38, 42, 42, 50,
    34, 36, 38, 38, 41, 40, 47, 54,
    35, 38, 39, 40, 39, 45, 49, 58,
    38, 39, 40, 39, 46, 47, 54, 60,
    38, 39, 41, 46, 46, 48, 57, 62,
    40, 41, 44, 45, 49, 54, 63, 63
 };

 static const uint8_t dnxhd_1250_chroma_weight[] = {
     0, 32, 35, 36, 40, 42, 51, 51,
    35, 36, 39, 39, 43, 51, 52, 55,
    36, 41, 41, 43, 51, 53, 54, 56,
    43, 44, 45, 50, 54, 54, 55, 57,
    45, 48, 50, 51, 55, 58, 59, 58,
    49, 52, 49, 57, 58, 62, 58, 60,
    51, 51, 56, 58, 62, 61, 59, 62,
    52, 52, 60, 61, 59, 59, 63, 63
 };

 static const uint8_t dnxhd_1251_luma_weight[] = {
     0, 32, 32, 34, 34, 34, 34, 35,
    35, 35, 36, 37, 36, 36, 35, 36,
@@ -604,6 +648,146 @@ static const uint8_t dnxhd_1235_1241_run[62] = {
    49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
 };

 static const uint8_t dnxhd_1250_dc_codes[14] = {
    10, 62, 11, 12, 13, 0, 1, 2, 3, 4, 14, 30, 126, 127
 };
 static const uint8_t dnxhd_1250_dc_bits[14] = {
    4, 6, 4, 4, 4, 3, 3, 3, 3, 3, 4, 5, 7, 7
 };
 static const uint16_t dnxhd_1250_ac_codes[257] = {
        0,     1,     4,    10,    11,    24,    25,    26,
       54,    55,    56,    57,   116,   117,   118,   119,
      240,   241,   242,   243,   244,   245,   492,   493,
      494,   495,   496,   497,   498,   998,   999,  1000,
     1001,  1002,  1003,  1004,  1005,  1006,  2014,  2015,
     2016,  2017,  2018,  2019,  2020,  2021,  2022,  2023,
     2024,  2025,  4052,  4053,  4054,  4055,  4056,  4057,
     4058,  4059,  4060,  4061,  4062,  4063,  4064,  4065,
     4066,  4067,  8136,  8137,  8138,  8139,  8140,  8141,
     8142,  8143,  8144,  8145,  8146,  8147,  8148,  8149,
     8150,  8151,  8152,  8153,  8154,  8155,  8156, 16314,
    16315, 16316, 16317, 16318, 16319, 16320, 16321, 16322,
    16323, 16324, 16325, 16326, 16327, 16328, 16329, 16330,
    16331, 16332, 16333, 16334, 16335, 16336, 16337, 16338,
    32678, 32679, 32680, 32681, 32682, 32683, 32684, 32685,
    32686, 32687, 32688, 32689, 32690, 32691, 32692, 32693,
    32694, 32695, 32696, 32697, 32698, 32699, 32700, 32701,
    32702, 32703, 32704, 32705, 32706, 32707, 32708, 32709,
    32710, 32711, 32712, 65426, 65427, 65428, 65429, 65430,
    65431, 65432, 65433, 65434, 65435, 65436, 65437, 65438,
    65439, 65440, 65441, 65442, 65443, 65444, 65445, 65446,
    65447, 65448, 65449, 65450, 65451, 65452, 65453, 65454,
    65455, 65456, 65457, 65458, 65459, 65460, 65461, 65462,
    65463, 65464, 65465, 65466, 65467, 65468, 65469, 65470,
    65471, 65472, 65473, 65474, 65475, 65476, 65477, 65478,
    65479, 65480, 65481, 65482, 65483, 65484, 65485, 65486,
    65487, 65488, 65489, 65490, 65491, 65492, 65493, 65494,
    65495, 65496, 65497, 65498, 65499, 65500, 65501, 65502,
    65503, 65504, 65505, 65506, 65507, 65508, 65509, 65510,
    65511, 65512, 65513, 65514, 65515, 65516, 65517, 65518,
    65519, 65520, 65521, 65522, 65523, 65524, 65525, 65526,
    65527, 65528, 65529, 65530, 65531, 65532, 65533, 65534,
    65535
 };
 static const uint8_t dnxhd_1250_ac_bits[257] = {
     2,  2,  3,  4,  4,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
     8,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
    11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
    12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
    13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14,
    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
    15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
    16
 };
 static const uint8_t dnxhd_1250_ac_level[257] = {
     1,  1,  2,  3,  0,  4,  5,  2,  6,  7,  8,  3,  9, 10, 11,  4,
    12, 13, 14, 15, 16,  5, 17, 18, 19, 20, 21, 22,  6, 23, 24, 25,
    26, 27, 28, 29,  7,  8, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
     9, 10, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 11,
    12, 13, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,  1,  2,
     3,  4,  5, 14, 15, 16, 17,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 18, 19, 20, 21,
    27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
    43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 22, 23, 24,
    25, 26, 27, 54, 57, 58, 59, 60, 61, 62, 63, 64, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    64,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
    64
 };
 static const uint8_t dnxhd_1250_ac_run_flag[257] = {
    0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
    0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
    1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
    1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1
 };
 static const uint8_t dnxhd_1250_ac_index_flag[257] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
    1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
    0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1
 };
 static const uint16_t dnxhd_1250_run_codes[62] = {
       0,    4,    5,   12,   26,   27,   28,   58,
     118,  119,  120,  242,  486,  487,  976,  977,
     978,  979,  980,  981,  982,  983,  984,  985,
     986,  987,  988,  989,  990,  991,  992,  993,
     994,  995,  996,  997,  998,  999, 1000, 1001,
    1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009,
    1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017,
    1018, 1019, 1020, 1021, 1022, 1023
 };
 static const uint8_t dnxhd_1250_run_bits[62] = {
     1,  3,  3,  4,  5,  5,  5,  6,  7,  7,  7,  8,  9,  9, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
 };
 static const uint8_t dnxhd_1250_run[62] = {
     1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
    17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
    33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
    49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62
 };

 static const uint8_t dnxhd_1251_dc_codes[12] = {
    0, 12, 13, 1, 2, 3, 4, 5, 14, 30, 62, 63,
 };
@@ -878,6 +1062,13 @@ static const uint8_t dnxhd_1252_ac_index_flag[257] = {
 };

 const CIDEntry ff_dnxhd_cid_table[] = {
    { 1235, 1920, 1080, 0, 917504, 917504, 6, 10,
      dnxhd_1235_luma_weight, dnxhd_1235_chroma_weight,
      dnxhd_1235_1241_dc_codes, dnxhd_1235_1241_dc_bits,
      dnxhd_1235_1241_ac_codes, dnxhd_1235_1241_ac_bits, dnxhd_1235_1241_ac_level,
      dnxhd_1235_1241_ac_run_flag, dnxhd_1235_1241_ac_index_flag,
      dnxhd_1235_1238_1241_run_codes, dnxhd_1235_1238_1241_run_bits, dnxhd_1235_1241_run,
      { 175, 185, 365, 440 } },
    { 1237, 1920, 1080, 0, 606208, 606208, 4, 8,
      dnxhd_1237_luma_weight, dnxhd_1237_chroma_weight,
      dnxhd_1237_dc_codes, dnxhd_1237_dc_bits,
@@ -913,6 +1104,13 @@ const CIDEntry ff_dnxhd_cid_table[] = {
      dnxhd_1238_ac_run_flag, dnxhd_1238_ac_index_flag,
      dnxhd_1235_1238_1241_run_codes, dnxhd_1235_1238_1241_run_bits, dnxhd_1238_run,
      { 185, 220 } },
    { 1250, 1280,  720, 0, 458752, 458752, 6, 10,
      dnxhd_1250_luma_weight, dnxhd_1250_chroma_weight,
      dnxhd_1250_dc_codes, dnxhd_1250_dc_bits,
      dnxhd_1250_ac_codes, dnxhd_1250_ac_bits, dnxhd_1250_ac_level,
      dnxhd_1250_ac_run_flag, dnxhd_1250_ac_index_flag,
      dnxhd_1250_run_codes, dnxhd_1250_run_bits, dnxhd_1250_run,
      { 90, 180, 220 } },
    { 1251, 1280,  720, 0, 458752, 458752, 4, 8,
      dnxhd_1251_luma_weight, dnxhd_1251_chroma_weight,
      dnxhd_1251_dc_codes, dnxhd_1251_dc_bits,
@@ -945,7 +1143,7 @@ int ff_dnxhd_get_cid_table(int cid)
    return -1;
 }

 int ff_dnxhd_find_cid(AVCodecContext *avctx)
 int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth)
 {
    int i, j;
    int mbs = avctx->bit_rate/1000000;
@@ -955,7 +1153,7 @@ int ff_dnxhd_find_cid(AVCodecContext *avctx)
        const CIDEntry *cid = &ff_dnxhd_cid_table[i];
        if (cid->width == avctx->width && cid->height == avctx->height &&
            cid->interlaced == !!(avctx->flags & CODEC_FLAG_INTERLACED_DCT) &&
            cid->bit_depth == 8) { // until 10 bit is supported
            cid->bit_depth == bit_depth) {
            for (j = 0; j < sizeof(cid->bit_rates); j++) {
                if (cid->bit_rates[j] == mbs)
                    return cid->cid;
--- a/libavcodec/dnxhddata.h
+++ b/libavcodec/dnxhddata.h
@@ -46,6 +46,6 @@ typedef struct {
 extern const CIDEntry ff_dnxhd_cid_table[];

 int ff_dnxhd_get_cid_table(int cid);
 int ff_dnxhd_find_cid(AVCodecContext *avctx);
 int ff_dnxhd_find_cid(AVCodecContext *avctx, int bit_depth);

 #endif /* AVCODEC_DNXHDDATA_H */
--- a/libavcodec/dnxhddec.c
+++ b/libavcodec/dnxhddec.c
@@ -1,6 +1,9 @@
 /*
 * VC3/DNxHD decoder.
 * Copyright (c) 2007 SmartJog S.A., Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
 * Copyright (c) 2011 MirriAd Ltd
 *
 * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
 *
 * This file is part of FFmpeg.
 *
@@ -28,7 +31,7 @@
 #include "dnxhddata.h"
 #include "dsputil.h"

 typedef struct {
 typedef struct DNXHDContext {
    AVCodecContext *avctx;
    AVFrame picture;
    GetBitContext gb;
@@ -43,17 +46,22 @@ typedef struct {
    DECLARE_ALIGNED(16, DCTELEM, blocks)[8][64];
    ScanTable scantable;
    const CIDEntry *cid_table;
    int bit_depth; // 8, 10 or 0 if not initialized at all.
    void (*decode_dct_block)(struct DNXHDContext *ctx, DCTELEM *block,
                             int n, int qscale);
 } DNXHDContext;

 #define DNXHD_VLC_BITS 9
 #define DNXHD_DC_VLC_BITS 7

 static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, DCTELEM *block, int n, int qscale);
 static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, DCTELEM *block, int n, int qscale);

 static av_cold int dnxhd_decode_init(AVCodecContext *avctx)
 {
    DNXHDContext *ctx = avctx->priv_data;

    ctx->avctx = avctx;
    dsputil_init(&ctx->dsp, avctx);
    avctx->coded_frame = &ctx->picture;
    avcodec_get_frame_defaults(&ctx->picture);
    ctx->picture.type = AV_PICTURE_TYPE_I;
@@ -79,7 +87,7 @@ static int dnxhd_init_vlc(DNXHDContext *ctx, int cid)
        init_vlc(&ctx->ac_vlc, DNXHD_VLC_BITS, 257,
                 ctx->cid_table->ac_bits, 1, 1,
                 ctx->cid_table->ac_codes, 2, 2, 0);
        init_vlc(&ctx->dc_vlc, DNXHD_DC_VLC_BITS, ctx->cid_table->bit_depth+4,
        init_vlc(&ctx->dc_vlc, DNXHD_DC_VLC_BITS, ctx->bit_depth + 4,
                 ctx->cid_table->dc_bits, 1, 1,
                 ctx->cid_table->dc_codes, 1, 1, 0);
        init_vlc(&ctx->run_vlc, DNXHD_VLC_BITS, 62,
@@ -117,8 +125,21 @@ static int dnxhd_decode_header(DNXHDContext *ctx, const uint8_t *buf, int buf_si
    av_dlog(ctx->avctx, "width %d, heigth %d\n", ctx->width, ctx->height);

    if (buf[0x21] & 0x40) {
        av_log(ctx->avctx, AV_LOG_ERROR, "10 bit per component\n");
        return -1;
        ctx->avctx->pix_fmt = PIX_FMT_YUV422P10;
        ctx->avctx->bits_per_raw_sample = 10;
        if (ctx->bit_depth != 10) {
            dsputil_init(&ctx->dsp, ctx->avctx);
            ctx->bit_depth = 10;
            ctx->decode_dct_block = dnxhd_decode_dct_block_10;
        }
    } else {
        ctx->avctx->pix_fmt = PIX_FMT_YUV422P;
        ctx->avctx->bits_per_raw_sample = 8;
        if (ctx->bit_depth != 8) {
            dsputil_init(&ctx->dsp, ctx->avctx);
            ctx->bit_depth = 8;
            ctx->decode_dct_block = dnxhd_decode_dct_block_8;
        }
    }

    cid = AV_RB32(buf + 0x28);
@@ -158,79 +179,103 @@ static int dnxhd_decode_header(DNXHDContext *ctx, const uint8_t *buf, int buf_si
    return 0;
 }

 static int dnxhd_decode_dc(DNXHDContext *ctx)
 static av_always_inline void dnxhd_decode_dct_block(DNXHDContext *ctx,
                                                    DCTELEM *block, int n,
                                                    int qscale,
                                                    int index_bits,
                                                    int level_bias,
                                                    int level_shift)
 {
    int len;

    len = get_vlc2(&ctx->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1);
    return len ? get_xbits(&ctx->gb, len) : 0;
 }

 static void dnxhd_decode_dct_block(DNXHDContext *ctx, DCTELEM *block, int n, int qscale)
 {
    int i, j, index, index2;
    int i, j, index1, index2, len;
    int level, component, sign;
    const uint8_t *weigth_matrix;
    const uint8_t *weight_matrix;
    OPEN_READER(bs, &ctx->gb);

    if (n&2) {
        component = 1 + (n&1);
        weigth_matrix = ctx->cid_table->chroma_weight;
        weight_matrix = ctx->cid_table->chroma_weight;
    } else {
        component = 0;
        weigth_matrix = ctx->cid_table->luma_weight;
        weight_matrix = ctx->cid_table->luma_weight;
    }

    ctx->last_dc[component] += dnxhd_decode_dc(ctx);
    UPDATE_CACHE(bs, &ctx->gb);
    GET_VLC(len, bs, &ctx->gb, ctx->dc_vlc.table, DNXHD_DC_VLC_BITS, 1);
    if (len) {
        level = GET_CACHE(bs, &ctx->gb);
        LAST_SKIP_BITS(bs, &ctx->gb, len);
        sign  = ~level >> 31;
        level = (NEG_USR32(sign ^ level, len) ^ sign) - sign;
        ctx->last_dc[component] += level;
    }
    block[0] = ctx->last_dc[component];
    //av_log(ctx->avctx, AV_LOG_DEBUG, "dc %d\n", block[0]);

    for (i = 1; ; i++) {
        index = get_vlc2(&ctx->gb, ctx->ac_vlc.table, DNXHD_VLC_BITS, 2);
        //av_log(ctx->avctx, AV_LOG_DEBUG, "index %d\n", index);
        level = ctx->cid_table->ac_level[index];
        UPDATE_CACHE(bs, &ctx->gb);
        GET_VLC(index1, bs, &ctx->gb, ctx->ac_vlc.table,
                DNXHD_VLC_BITS, 2);
        //av_log(ctx->avctx, AV_LOG_DEBUG, "index %d\n", index1);
        level = ctx->cid_table->ac_level[index1];
        if (!level) { /* EOB */
            //av_log(ctx->avctx, AV_LOG_DEBUG, "EOB\n");
            return;
            break;
        }
        sign = get_sbits(&ctx->gb, 1);

        if (ctx->cid_table->ac_index_flag[index]) {
            level += get_bits(&ctx->gb, ctx->cid_table->index_bits)<<6;
        sign = SHOW_SBITS(bs, &ctx->gb, 1);
        SKIP_BITS(bs, &ctx->gb, 1);

        if (ctx->cid_table->ac_index_flag[index1]) {
            level += SHOW_UBITS(bs, &ctx->gb, index_bits) << 6;
            SKIP_BITS(bs, &ctx->gb, index_bits);
        }

        if (ctx->cid_table->ac_run_flag[index]) {
            index2 = get_vlc2(&ctx->gb, ctx->run_vlc.table, DNXHD_VLC_BITS, 2);
        if (ctx->cid_table->ac_run_flag[index1]) {
            UPDATE_CACHE(bs, &ctx->gb);
            GET_VLC(index2, bs, &ctx->gb, ctx->run_vlc.table,
                    DNXHD_VLC_BITS, 2);
            i += ctx->cid_table->run[index2];
        }

        if (i > 63) {
            av_log(ctx->avctx, AV_LOG_ERROR, "ac tex damaged %d, %d\n", n, i);
            return;
            break;
        }

        j = ctx->scantable.permutated[i];
        //av_log(ctx->avctx, AV_LOG_DEBUG, "j %d\n", j);
        //av_log(ctx->avctx, AV_LOG_DEBUG, "level %d, weigth %d\n", level, weigth_matrix[i]);
        level = (2*level+1) * qscale * weigth_matrix[i];
        if (ctx->cid_table->bit_depth == 10) {
            if (weigth_matrix[i] != 8)
                level += 8;
            level >>= 4;
        } else {
            if (weigth_matrix[i] != 32)
                level += 32;
            level >>= 6;
        }
        //av_log(ctx->avctx, AV_LOG_DEBUG, "level %d, weight %d\n", level, weight_matrix[i]);
        level = (2*level+1) * qscale * weight_matrix[i];
        if (weight_matrix[i] != level_bias)
            level += level_bias;
        level >>= level_shift;

        //av_log(NULL, AV_LOG_DEBUG, "i %d, j %d, end level %d\n", i, j, level);
        block[j] = (level^sign) - sign;
    }

    CLOSE_READER(bs, &ctx->gb);
 }

 static void dnxhd_decode_dct_block_8(DNXHDContext *ctx, DCTELEM *block,
                                     int n, int qscale)
 {
    dnxhd_decode_dct_block(ctx, block, n, qscale, 4, 32, 6);
 }

 static void dnxhd_decode_dct_block_10(DNXHDContext *ctx, DCTELEM *block,
                                      int n, int qscale)
 {
    dnxhd_decode_dct_block(ctx, block, n, qscale, 6, 8, 4);
 }

 static int dnxhd_decode_macroblock(DNXHDContext *ctx, int x, int y)
 {
    int shift1 = ctx->bit_depth == 10;
    int dct_linesize_luma   = ctx->picture.linesize[0];
    int dct_linesize_chroma = ctx->picture.linesize[1];
    uint8_t *dest_y, *dest_u, *dest_v;
    int dct_offset;
    int dct_y_offset, dct_x_offset;
    int qscale, i;

    qscale = get_bits(&ctx->gb, 11);
@@ -239,7 +284,7 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, int x, int y)

    for (i = 0; i < 8; i++) {
        ctx->dsp.clear_block(ctx->blocks[i]);
        dnxhd_decode_dct_block(ctx, ctx->blocks[i], i, qscale);
        ctx->decode_dct_block(ctx, ctx->blocks[i], i, qscale);
    }

    if (ctx->picture.interlaced_frame) {
@@ -247,9 +292,9 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, int x, int y)
        dct_linesize_chroma <<= 1;
    }

    dest_y = ctx->picture.data[0] + ((y * dct_linesize_luma)   << 4) + (x << 4);
    dest_u = ctx->picture.data[1] + ((y * dct_linesize_chroma) << 4) + (x << 3);
    dest_v = ctx->picture.data[2] + ((y * dct_linesize_chroma) << 4) + (x << 3);
    dest_y = ctx->picture.data[0] + ((y * dct_linesize_luma)   << 4) + (x << (4 + shift1));
    dest_u = ctx->picture.data[1] + ((y * dct_linesize_chroma) << 4) + (x << (3 + shift1));
    dest_v = ctx->picture.data[2] + ((y * dct_linesize_chroma) << 4) + (x << (3 + shift1));

    if (ctx->cur_field) {
        dest_y += ctx->picture.linesize[0];
@@ -257,18 +302,19 @@ static int dnxhd_decode_macroblock(DNXHDContext *ctx, int x, int y)
        dest_v += ctx->picture.linesize[2];
    }

    dct_offset = dct_linesize_luma << 3;
    ctx->dsp.idct_put(dest_y,                  dct_linesize_luma, ctx->blocks[0]);
    ctx->dsp.idct_put(dest_y + 8,              dct_linesize_luma, ctx->blocks[1]);
    ctx->dsp.idct_put(dest_y + dct_offset,     dct_linesize_luma, ctx->blocks[4]);
    ctx->dsp.idct_put(dest_y + dct_offset + 8, dct_linesize_luma, ctx->blocks[5]);
    dct_y_offset = dct_linesize_luma << 3;
    dct_x_offset = 8 << shift1;
    ctx->dsp.idct_put(dest_y,                               dct_linesize_luma, ctx->blocks[0]);
    ctx->dsp.idct_put(dest_y + dct_x_offset,                dct_linesize_luma, ctx->blocks[1]);
    ctx->dsp.idct_put(dest_y + dct_y_offset,                dct_linesize_luma, ctx->blocks[4]);
    ctx->dsp.idct_put(dest_y + dct_y_offset + dct_x_offset, dct_linesize_luma, ctx->blocks[5]);

    if (!(ctx->avctx->flags & CODEC_FLAG_GRAY)) {
        dct_offset = dct_linesize_chroma << 3;
        ctx->dsp.idct_put(dest_u,              dct_linesize_chroma, ctx->blocks[2]);
        ctx->dsp.idct_put(dest_v,              dct_linesize_chroma, ctx->blocks[3]);
        ctx->dsp.idct_put(dest_u + dct_offset, dct_linesize_chroma, ctx->blocks[6]);
        ctx->dsp.idct_put(dest_v + dct_offset, dct_linesize_chroma, ctx->blocks[7]);
        dct_y_offset = dct_linesize_chroma << 3;
        ctx->dsp.idct_put(dest_u,                dct_linesize_chroma, ctx->blocks[2]);
        ctx->dsp.idct_put(dest_v,                dct_linesize_chroma, ctx->blocks[3]);
        ctx->dsp.idct_put(dest_u + dct_y_offset, dct_linesize_chroma, ctx->blocks[6]);
        ctx->dsp.idct_put(dest_v + dct_y_offset, dct_linesize_chroma, ctx->blocks[7]);
    }

    return 0;
@@ -280,7 +326,7 @@ static int dnxhd_decode_macroblocks(DNXHDContext *ctx, const uint8_t *buf, int b
    for (y = 0; y < ctx->mb_height; y++) {
        ctx->last_dc[0] =
        ctx->last_dc[1] =
        ctx->last_dc[2] = 1<<(ctx->cid_table->bit_depth+2); // for levels +2^(bitdepth-1)
        ctx->last_dc[2] = 1 << (ctx->bit_depth + 2); // for levels +2^(bitdepth-1)
        init_get_bits(&ctx->gb, buf + ctx->mb_scan_index[y], (buf_size - ctx->mb_scan_index[y]) << 3);
        for (x = 0; x < ctx->mb_width; x++) {
            //START_TIMER;
@@ -313,7 +359,6 @@ static int dnxhd_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
        first_field = 1;
    }

    avctx->pix_fmt = PIX_FMT_YUV422P;
    if (av_image_check_size(ctx->width, ctx->height, 0, avctx))
        return -1;
    avcodec_set_dimensions(avctx, ctx->width, ctx->height);
--- a/libavcodec/dnxhdenc.c
+++ b/libavcodec/dnxhdenc.c
@@ -1,8 +1,10 @@
 /*
 * VC3/DNxHD encoder
 * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
 * Copyright (c) 2011 MirriAd Ltd
 *
 * VC-3 encoder funded by the British Broadcasting Corporation
 * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
 *
 * This file is part of FFmpeg.
 *
@@ -32,6 +34,7 @@
 #include "dnxhdenc.h"

 #define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 #define DNX10BIT_QMAT_SHIFT 18 // The largest value that will not lead to overflow for 10bit samples.

 static const AVOption options[]={
    {"nitris_compat", "encode with Avid Nitris compatibility", offsetof(DNXHDEncContext, nitris_compat), FF_OPT_TYPE_INT, {.dbl = 0}, 0, 1, VE},
@@ -41,7 +44,7 @@ static const AVClass class = { "dnxhd", av_default_item_name, options, LIBAVUTIL

 #define LAMBDA_FRAC_BITS 10

 static void dnxhd_get_pixels_8x4(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 static void dnxhd_8bit_get_pixels_8x4_sym(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 {
    int i;
    for (i = 0; i < 4; i++) {
@@ -58,6 +61,43 @@ static void dnxhd_get_pixels_8x4(DCTELEM *restrict block, const uint8_t *pixels,
    memcpy(block + 24, block - 32, sizeof(*block) * 8);
 }

 static av_always_inline void dnxhd_10bit_get_pixels_8x4_sym(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 {
    int i;

    block += 32;

    for (i = 0; i < 4; i++) {
        memcpy(block + i     * 8, pixels + i * line_size, 8 * sizeof(*block));
        memcpy(block - (i+1) * 8, pixels + i * line_size, 8 * sizeof(*block));
    }
 }

 static int dnxhd_10bit_dct_quantize(MpegEncContext *ctx, DCTELEM *block,
                                    int n, int qscale, int *overflow)
 {
    const uint8_t *scantable= ctx->intra_scantable.scantable;
    const int *qmat = ctx->q_intra_matrix[qscale];
    int last_non_zero = 0;

    ctx->dsp.fdct(block);

    // Divide by 4 with rounding, to compensate scaling of DCT coefficients
    block[0] = (block[0] + 2) >> 2;

    for (int i = 1; i < 64; ++i) {
        int j = scantable[i];
        int sign = block[j] >> 31;
        int level = (block[j] ^ sign) - sign;
        level = level * qmat[j] >> DNX10BIT_QMAT_SHIFT;
        block[j] = (level ^ sign) - sign;
        if (level)
            last_non_zero = i;
    }

    return last_non_zero;
 }

 static int dnxhd_init_vlc(DNXHDEncContext *ctx)
 {
    int i, j, level, run;
@@ -118,31 +158,55 @@ static int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
    // init first elem to 1 to avoid div by 0 in convert_matrix
    uint16_t weight_matrix[64] = {1,}; // convert_matrix needs uint16_t*
    int qscale, i;
    const uint8_t *luma_weight_table   = ctx->cid_table->luma_weight;
    const uint8_t *chroma_weight_table = ctx->cid_table->chroma_weight;

    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,   (ctx->m.avctx->qmax+1) * 64 *     sizeof(int),      fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,   (ctx->m.avctx->qmax+1) * 64 *     sizeof(int),      fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail);

    for (i = 1; i < 64; i++) {
        int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
        weight_matrix[j] = ctx->cid_table->luma_weight[i];
    }
    ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_l, ctx->qmatrix_l16, weight_matrix,
                      ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);
    for (i = 1; i < 64; i++) {
        int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
        weight_matrix[j] = ctx->cid_table->chroma_weight[i];
    }
    ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_c, ctx->qmatrix_c16, weight_matrix,
                      ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);
    for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
        for (i = 0; i < 64; i++) {
            ctx->qmatrix_l  [qscale]   [i] <<= 2; ctx->qmatrix_c  [qscale]   [i] <<= 2;
            ctx->qmatrix_l16[qscale][0][i] <<= 2; ctx->qmatrix_l16[qscale][1][i] <<= 2;
            ctx->qmatrix_c16[qscale][0][i] <<= 2; ctx->qmatrix_c16[qscale][1][i] <<= 2;
    if (ctx->cid_table->bit_depth == 8) {
        for (i = 1; i < 64; i++) {
            int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
            weight_matrix[j] = ctx->cid_table->luma_weight[i];
        }
        ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_l, ctx->qmatrix_l16, weight_matrix,
                          ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);
        for (i = 1; i < 64; i++) {
            int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
            weight_matrix[j] = ctx->cid_table->chroma_weight[i];
        }
        ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_c, ctx->qmatrix_c16, weight_matrix,
                          ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);

        for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
            for (i = 0; i < 64; i++) {
                ctx->qmatrix_l  [qscale]   [i] <<= 2; ctx->qmatrix_c  [qscale]   [i] <<= 2;
                ctx->qmatrix_l16[qscale][0][i] <<= 2; ctx->qmatrix_l16[qscale][1][i] <<= 2;
                ctx->qmatrix_c16[qscale][0][i] <<= 2; ctx->qmatrix_c16[qscale][1][i] <<= 2;
            }
        }
    } else {
        // 10-bit
        for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
            for (i = 1; i < 64; i++) {
                int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];

                // The quantization formula from the VC-3 standard is:
                // quantized = sign(block[i]) * floor(abs(block[i]/s) * p / (qscale * weight_table[i]))
                // Where p is 32 for 8-bit samples and 8 for 10-bit ones.
                // The s factor compensates scaling of DCT coefficients done by the DCT routines,
                // and therefore is not present in standard.  It's 8 for 8-bit samples and 4 for 10-bit ones.
                // We want values of ctx->qtmatrix_l and ctx->qtmatrix_r to be:
                // ((1 << DNX10BIT_QMAT_SHIFT) * (p / s)) / (qscale * weight_table[i])
                // For 10-bit samples, p / s == 2
                ctx->qmatrix_l[qscale][j] = (1 << (DNX10BIT_QMAT_SHIFT + 1)) / (qscale * luma_weight_table[i]);
                ctx->qmatrix_c[qscale][j] = (1 << (DNX10BIT_QMAT_SHIFT + 1)) / (qscale * chroma_weight_table[i]);
            }
        }
    }

    return 0;
 fail:
    return -1;
@@ -165,10 +229,22 @@ static int dnxhd_init_rc(DNXHDEncContext *ctx)
 static int dnxhd_encode_init(AVCodecContext *avctx)
 {
    DNXHDEncContext *ctx = avctx->priv_data;
    int i, index;
    int i, index, bit_depth;

    switch (avctx->pix_fmt) {
    case PIX_FMT_YUV422P:
        bit_depth = 8;
        break;
    case PIX_FMT_YUV422P10:
        bit_depth = 10;
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "pixel format is incompatible with DNxHD\n");
        return -1;
    }

    ctx->cid = ff_dnxhd_find_cid(avctx);
    if (!ctx->cid || avctx->pix_fmt != PIX_FMT_YUV422P) {
    ctx->cid = ff_dnxhd_find_cid(avctx, bit_depth);
    if (!ctx->cid) {
        av_log(avctx, AV_LOG_ERROR, "video parameters incompatible with DNxHD\n");
        return -1;
    }
@@ -181,15 +257,25 @@ static int dnxhd_encode_init(AVCodecContext *avctx)
    ctx->m.mb_intra = 1;
    ctx->m.h263_aic = 1;

    ctx->get_pixels_8x4_sym = dnxhd_get_pixels_8x4;
    avctx->bits_per_raw_sample = ctx->cid_table->bit_depth;

    dsputil_init(&ctx->m.dsp, avctx);
    ff_dct_common_init(&ctx->m);
    if (!ctx->m.dct_quantize)
        ctx->m.dct_quantize = dct_quantize_c;

    if (ctx->cid_table->bit_depth == 10) {
       ctx->m.dct_quantize = dnxhd_10bit_dct_quantize;
       ctx->get_pixels_8x4_sym = dnxhd_10bit_get_pixels_8x4_sym;
       ctx->block_width_l2 = 4;
    } else {
       ctx->get_pixels_8x4_sym = dnxhd_8bit_get_pixels_8x4_sym;
       ctx->block_width_l2 = 3;
    }

 #if HAVE_MMX
    ff_dnxhd_init_mmx(ctx);
 #endif
    if (!ctx->m.dct_quantize)
        ctx->m.dct_quantize = dct_quantize_c;

    ctx->m.mb_height = (avctx->height + 15) / 16;
    ctx->m.mb_width  = (avctx->width  + 15) / 16;
@@ -255,7 +341,7 @@ static int dnxhd_write_header(AVCodecContext *avctx, uint8_t *buf)
    AV_WB16(buf + 0x1a, avctx->width);  // SPL
    AV_WB16(buf + 0x1d, avctx->height>>ctx->interlaced); // NAL

    buf[0x21] = 0x38; // FIXME 8 bit per comp
    buf[0x21] = ctx->cid_table->bit_depth == 10 ? 0x58 : 0x38;
    buf[0x22] = 0x88 + (ctx->interlaced<<2);
    AV_WB32(buf + 0x28, ctx->cid); // CID
    buf[0x2c] = ctx->interlaced ? 0 : 0x80;
@@ -321,15 +407,27 @@ static av_always_inline void dnxhd_unquantize_c(DNXHDEncContext *ctx, DCTELEM *b
        if (level) {
            if (level < 0) {
                level = (1-2*level) * qscale * weight_matrix[i];
                if (weight_matrix[i] != 32)
                    level += 32;
                level >>= 6;
                if (ctx->cid_table->bit_depth == 10) {
                    if (weight_matrix[i] != 8)
                        level += 8;
                    level >>= 4;
                } else {
                    if (weight_matrix[i] != 32)
                        level += 32;
                    level >>= 6;
                }
                level = -level;
            } else {
                level = (2*level+1) * qscale * weight_matrix[i];
                if (weight_matrix[i] != 32)
                    level += 32;
                level >>= 6;
                if (ctx->cid_table->bit_depth == 10) {
                    if (weight_matrix[i] != 8)
                        level += 8;
                    level >>= 4;
                } else {
                    if (weight_matrix[i] != 32)
                        level += 32;
                    level >>= 6;
                }
            }
            block[j] = level;
        }
@@ -364,22 +462,24 @@ static av_always_inline int dnxhd_calc_ac_bits(DNXHDEncContext *ctx, DCTELEM *bl

 static av_always_inline void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
 {
    const uint8_t *ptr_y = ctx->thread[0]->src[0] + ((mb_y << 4) * ctx->m.linesize)   + (mb_x << 4);
    const uint8_t *ptr_u = ctx->thread[0]->src[1] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << 3);
    const uint8_t *ptr_v = ctx->thread[0]->src[2] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << 3);
    const int bs = ctx->block_width_l2;
    const int bw = 1 << bs;
    const uint8_t *ptr_y = ctx->thread[0]->src[0] + ((mb_y << 4) * ctx->m.linesize)   + (mb_x << bs+1);
    const uint8_t *ptr_u = ctx->thread[0]->src[1] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
    const uint8_t *ptr_v = ctx->thread[0]->src[2] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
    DSPContext *dsp = &ctx->m.dsp;

    dsp->get_pixels(ctx->blocks[0], ptr_y,     ctx->m.linesize);
    dsp->get_pixels(ctx->blocks[1], ptr_y + 8, ctx->m.linesize);
    dsp->get_pixels(ctx->blocks[2], ptr_u,     ctx->m.uvlinesize);
    dsp->get_pixels(ctx->blocks[3], ptr_v,     ctx->m.uvlinesize);
    dsp->get_pixels(ctx->blocks[0], ptr_y,      ctx->m.linesize);
    dsp->get_pixels(ctx->blocks[1], ptr_y + bw, ctx->m.linesize);
    dsp->get_pixels(ctx->blocks[2], ptr_u,      ctx->m.uvlinesize);
    dsp->get_pixels(ctx->blocks[3], ptr_v,      ctx->m.uvlinesize);

    if (mb_y+1 == ctx->m.mb_height && ctx->m.avctx->height == 1080) {
        if (ctx->interlaced) {
            ctx->get_pixels_8x4_sym(ctx->blocks[4], ptr_y + ctx->dct_y_offset,     ctx->m.linesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[5], ptr_y + ctx->dct_y_offset + 8, ctx->m.linesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,    ctx->m.uvlinesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,    ctx->m.uvlinesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[4], ptr_y + ctx->dct_y_offset,      ctx->m.linesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[5], ptr_y + ctx->dct_y_offset + bw, ctx->m.linesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,     ctx->m.uvlinesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,     ctx->m.uvlinesize);
        } else {
            dsp->clear_block(ctx->blocks[4]);
            dsp->clear_block(ctx->blocks[5]);
@@ -387,10 +487,10 @@ static av_always_inline void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, in
            dsp->clear_block(ctx->blocks[7]);
        }
    } else {
        dsp->get_pixels(ctx->blocks[4], ptr_y + ctx->dct_y_offset,     ctx->m.linesize);
        dsp->get_pixels(ctx->blocks[5], ptr_y + ctx->dct_y_offset + 8, ctx->m.linesize);
        dsp->get_pixels(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,    ctx->m.uvlinesize);
        dsp->get_pixels(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,    ctx->m.uvlinesize);
        dsp->get_pixels(ctx->blocks[4], ptr_y + ctx->dct_y_offset,      ctx->m.linesize);
        dsp->get_pixels(ctx->blocks[5], ptr_y + ctx->dct_y_offset + bw, ctx->m.linesize);
        dsp->get_pixels(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,     ctx->m.uvlinesize);
        dsp->get_pixels(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,     ctx->m.uvlinesize);
    }
 }

@@ -417,7 +517,7 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, i

    ctx->m.last_dc[0] =
    ctx->m.last_dc[1] =
    ctx->m.last_dc[2] = 1024;
    ctx->m.last_dc[2] = 1 << (ctx->cid_table->bit_depth + 2);

    for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
        unsigned mb = mb_y * ctx->m.mb_width + mb_x;
@@ -440,6 +540,8 @@ static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, i
            diff = block[0] - ctx->m.last_dc[n];
            if (diff < 0) nbits = av_log2_16bit(-2*diff);
            else          nbits = av_log2_16bit( 2*diff);

            assert(nbits < ctx->cid_table->bit_depth + 4);
            dc_bits += ctx->cid_table->dc_bits[nbits] + nbits;

            ctx->m.last_dc[n] = block[0];
@@ -465,7 +567,7 @@ static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg, int jobnr, int

    ctx->m.last_dc[0] =
    ctx->m.last_dc[1] =
    ctx->m.last_dc[2] = 1024;
    ctx->m.last_dc[2] = 1 << (ctx->cid_table->bit_depth + 2);
    for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
        unsigned mb = mb_y * ctx->m.mb_width + mb_x;
        int qscale = ctx->mb_qscale[mb];
@@ -515,13 +617,39 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int jobnr, int
    DNXHDEncContext *ctx = avctx->priv_data;
    int mb_y = jobnr, mb_x;
    ctx = ctx->thread[threadnr];
    for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
        unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
        uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize) + (mb_x<<4);
        int sum      = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
        int varc     = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)(sum*sum))>>8)+128)>>8;
        ctx->mb_cmp[mb].value = varc;
        ctx->mb_cmp[mb].mb = mb;
    if (ctx->cid_table->bit_depth == 8) {
        uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize);
        for (mb_x = 0; mb_x < ctx->m.mb_width; ++mb_x, pix += 16) {
            unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
            int sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
            int varc = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)(sum*sum))>>8)+128)>>8;
            ctx->mb_cmp[mb].value = varc;
            ctx->mb_cmp[mb].mb = mb;
        }
    } else { // 10-bit
        int const linesize = ctx->m.linesize >> 1;
        for (mb_x = 0; mb_x < ctx->m.mb_width; ++mb_x) {
            uint16_t *pix = (uint16_t*)ctx->thread[0]->src[0] + ((mb_y << 4) * linesize) + (mb_x << 4);
            unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
            int sum = 0;
            int sqsum = 0;
            int mean, sqmean;
            // Macroblocks are 16x16 pixels, unlike DCT blocks which are 8x8.
            for (int i = 0; i < 16; ++i) {
                for (int j = 0; j < 16; ++j) {
                    // Turn 16-bit pixels into 10-bit ones.
                    int const sample = (unsigned)pix[j] >> 6;
                    sum += sample;
                    sqsum += sample * sample;
                    // 2^10 * 2^10 * 16 * 16 = 2^28, which is less than INT_MAX
                }
                pix += linesize;
            }
            mean = sum >> 8; // 16*16 == 2^8
            sqmean = sqsum >> 8;
            ctx->mb_cmp[mb].value = sqmean - mean * mean;
            ctx->mb_cmp[mb].mb = mb;
        }
    }
    return 0;
 }
@@ -871,7 +999,7 @@ AVCodec ff_dnxhd_encoder = {
    dnxhd_encode_picture,
    dnxhd_encode_end,
    .capabilities = CODEC_CAP_SLICE_THREADS,
    .pix_fmts = (const enum PixelFormat[]){PIX_FMT_YUV422P, PIX_FMT_NONE},
    .pix_fmts = (const enum PixelFormat[]){PIX_FMT_YUV422P, PIX_FMT_YUV422P10, PIX_FMT_NONE},
    .long_name = NULL_IF_CONFIG_SMALL("VC3/DNxHD"),
    .priv_class = &class,
 };
--- a/libavcodec/dnxhdenc.h
+++ b/libavcodec/dnxhdenc.h
@@ -52,8 +52,12 @@ typedef struct DNXHDEncContext {

    struct DNXHDEncContext *thread[MAX_THREADS];

    // Because our samples are either 8 or 16 bits for 8-bit and 10-bit
    // encoding respectively, these refer either to bytes or to two-byte words.
    unsigned dct_y_offset;
    unsigned dct_uv_offset;
    unsigned block_width_l2;

    int interlaced;
    int cur_field;

--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -306,25 +306,6 @@ static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    return s;
 }

 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 {
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
 }

 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;
@@ -2836,17 +2817,22 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    ff_check_alignment();

 #if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    if (avctx->bits_per_raw_sample == 10) {
        c->fdct    = ff_jpeg_fdct_islow_10;
        c->fdct248 = ff_fdct248_islow_10;
    } else {
        if(avctx->dct_algo==FF_DCT_FASTINT) {
            c->fdct    = fdct_ifast;
            c->fdct248 = fdct_ifast248;
        }
        else if(avctx->dct_algo==FF_DCT_FAAN) {
            c->fdct    = ff_faandct;
            c->fdct248 = ff_faandct248;
        }
        else {
            c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
            c->fdct248 = ff_fdct248_islow_8;
        }
    }
 #endif //CONFIG_ENCODERS

@@ -2910,7 +2896,6 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
        }
    }

    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
@@ -3138,13 +3123,14 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)


 #define BIT_DEPTH_FUNCS(depth)\
 #define BIT_DEPTH_FUNCS(depth, dct)\
    c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
    c->draw_edges                    = FUNCC(draw_edges            , depth);\
    c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
    c->clear_block                   = FUNCC(clear_block           , depth);\
    c->clear_blocks                  = FUNCC(clear_blocks          , depth);\
    c->add_pixels8                   = FUNCC(add_pixels8           , depth);\
    c->add_pixels4                   = FUNCC(add_pixels4           , depth);\
    c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
    c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
    c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
    c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
    c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
    c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
 \
@@ -3178,15 +3164,23 @@ av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)

    switch (avctx->bits_per_raw_sample) {
    case 9:
        BIT_DEPTH_FUNCS(9);
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(9, _32);
        } else {
            BIT_DEPTH_FUNCS(9, _16);
        }
        break;
    case 10:
        BIT_DEPTH_FUNCS(10);
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(10, _32);
        } else {
            BIT_DEPTH_FUNCS(10, _16);
        }
        break;
    default:
        av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
    case 8:
        BIT_DEPTH_FUNCS(8);
        BIT_DEPTH_FUNCS(8, _16);
        break;
    }

--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -40,8 +40,10 @@ typedef short DCTELEM;

 void fdct_ifast (DCTELEM *data);
 void fdct_ifast248 (DCTELEM *data);
 void ff_jpeg_fdct_islow (DCTELEM *data);
 void ff_fdct248_islow (DCTELEM *data);
 void ff_jpeg_fdct_islow_8(DCTELEM *data);
 void ff_jpeg_fdct_islow_10(DCTELEM *data);
 void ff_fdct248_islow_8(DCTELEM *data);
 void ff_fdct248_islow_10(DCTELEM *data);

 void j_rev_dct (DCTELEM *data);
 void j_rev_dct4 (DCTELEM *data);
@@ -217,6 +219,11 @@ void ff_put_signed_pixels_clamped_c(const DCTELEM *block, uint8_t *dest, int lin
 * DSPContext.
 */
 typedef struct DSPContext {
    /**
     * Size of DCT coefficients.
     */
    int dct_bits;

    /* pixel ops : interface with DCT */
    void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
    void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
--- a/libavcodec/dsputil_template.c
+++ b/libavcodec/dsputil_template.c
@@ -192,43 +192,89 @@ void FUNC(ff_emulated_edge_mc)(uint8_t *buf, const uint8_t *src, int linesize, i
    }
 }

 static void FUNCC(add_pixels8)(uint8_t *restrict p_pixels, DCTELEM *p_block, int line_size)
 {
    int i;
    pixel *restrict pixels = (pixel *restrict)p_pixels;
    dctcoef *block = (dctcoef*)p_block;
    line_size >>= sizeof(pixel)-1;

    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
 #define DCTELEM_FUNCS(dctcoef, suffix)                                  \
 static void FUNCC(get_pixels ## suffix)(DCTELEM *restrict _block,       \
                                        const uint8_t *_pixels,         \
                                        int line_size)                  \
 {                                                                       \
    const pixel *pixels = (const pixel *) _pixels;                      \
    dctcoef *restrict block = (dctcoef *) _block;                       \
    int i;                                                              \
                                                                        \
    /* read the pixels */                                               \
    for(i=0;i<8;i++) {                                                  \
        block[0] = pixels[0];                                           \
        block[1] = pixels[1];                                           \
        block[2] = pixels[2];                                           \
        block[3] = pixels[3];                                           \
        block[4] = pixels[4];                                           \
        block[5] = pixels[5];                                           \
        block[6] = pixels[6];                                           \
        block[7] = pixels[7];                                           \
        pixels += line_size / sizeof(pixel);                            \
        block += 8;                                                     \
    }                                                                   \
 }                                                                       \
                                                                        \
 static void FUNCC(add_pixels8 ## suffix)(uint8_t *restrict _pixels,     \
                                         DCTELEM *_block,               \
                                         int line_size)                 \
 {                                                                       \
    int i;                                                              \
    pixel *restrict pixels = (pixel *restrict)_pixels;                  \
    dctcoef *block = (dctcoef*)_block;                                  \
    line_size /= sizeof(pixel);                                         \
                                                                        \
    for(i=0;i<8;i++) {                                                  \
        pixels[0] += block[0];                                          \
        pixels[1] += block[1];                                          \
        pixels[2] += block[2];                                          \
        pixels[3] += block[3];                                          \
        pixels[4] += block[4];                                          \
        pixels[5] += block[5];                                          \
        pixels[6] += block[6];                                          \
        pixels[7] += block[7];                                          \
        pixels += line_size;                                            \
        block += 8;                                                     \
    }                                                                   \
 }                                                                       \
                                                                        \
 static void FUNCC(add_pixels4 ## suffix)(uint8_t *restrict _pixels,     \
                                         DCTELEM *_block,               \
                                         int line_size)                 \
 {                                                                       \
    int i;                                                              \
    pixel *restrict pixels = (pixel *restrict)_pixels;                  \
    dctcoef *block = (dctcoef*)_block;                                  \
    line_size /= sizeof(pixel);                                         \
                                                                        \
    for(i=0;i<4;i++) {                                                  \
        pixels[0] += block[0];                                          \
        pixels[1] += block[1];                                          \
        pixels[2] += block[2];                                          \
        pixels[3] += block[3];                                          \
        pixels += line_size;                                            \
        block += 4;                                                     \
    }                                                                   \
 }                                                                       \
                                                                        \
 static void FUNCC(clear_block ## suffix)(DCTELEM *block)                \
 {                                                                       \
    memset(block, 0, sizeof(dctcoef)*64);                               \
 }                                                                       \
                                                                        \
 /**                                                                     \
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)                              \
 */                                                                     \
 static void FUNCC(clear_blocks ## suffix)(DCTELEM *blocks)              \
 {                                                                       \
    memset(blocks, 0, sizeof(dctcoef)*6*64);                            \
 }

 static void FUNCC(add_pixels4)(uint8_t *restrict p_pixels, DCTELEM *p_block, int line_size)
 {
    int i;
    pixel *restrict pixels = (pixel *restrict)p_pixels;
    dctcoef *block = (dctcoef*)p_block;
    line_size >>= sizeof(pixel)-1;

    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
 }
 DCTELEM_FUNCS(DCTELEM, _16)
 #if BIT_DEPTH > 8
 DCTELEM_FUNCS(dctcoef, _32)
 #endif

 #define PIXOP2(OPNAME, OP) \
 static void FUNCC(OPNAME ## _pixels2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
@@ -1232,16 +1278,3 @@ void FUNCC(ff_avg_pixels16x16)(uint8_t *dst, uint8_t *src, int stride) {
    FUNCC(avg_pixels16)(dst, src, stride, 16);
 }

 static void FUNCC(clear_block)(DCTELEM *block)
 {
    memset(block, 0, sizeof(dctcoef)*64);
 }

 /**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
 static void FUNCC(clear_blocks)(DCTELEM *blocks)
 {
    memset(blocks, 0, sizeof(dctcoef)*6*64);
 }

--- a/libavcodec/eac3enc.c
+++ b/libavcodec/eac3enc.c
@@ -63,6 +63,11 @@ void ff_eac3_get_frame_exp_strategy(AC3EncodeContext *s)
 {
    int ch;

    if (s->num_blocks < 6) {
        s->use_frame_exp_strategy = 0;
        return;
    }

    s->use_frame_exp_strategy = 1;
    for (ch = !s->cpl_on; ch <= s->fbw_channels; ch++) {
        int expstr = eac3_frame_expstr_index_tab[s->exp_strategy[ch][0]-1]
@@ -89,7 +94,7 @@ void ff_eac3_set_cpl_states(AC3EncodeContext *s)
    /* set first cpl coords */
    for (ch = 1; ch <= s->fbw_channels; ch++)
        first_cpl_coords[ch] = 1;
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        for (ch = 1; ch <= s->fbw_channels; ch++) {
            if (block->channel_in_cpl[ch]) {
@@ -104,7 +109,7 @@ void ff_eac3_set_cpl_states(AC3EncodeContext *s)
    }

    /* set first cpl leak */
    for (blk = 0; blk < AC3_MAX_BLOCKS; blk++) {
    for (blk = 0; blk < s->num_blocks; blk++) {
        AC3Block *block = &s->blocks[blk];
        if (block->cpl_in_use) {
            block->new_cpl_leak = 2;
@@ -130,7 +135,7 @@ void ff_eac3_output_frame_header(AC3EncodeContext *s)
        put_bits(&s->pb, 2, s->bit_alloc.sr_code);  /* sample rate code */
    } else {
        put_bits(&s->pb, 2, s->bit_alloc.sr_code);  /* sample rate code */
        put_bits(&s->pb, 2, 0x3);                   /* number of blocks = 6 */
        put_bits(&s->pb, 2, s->num_blks_code);      /* number of blocks */
    }
    put_bits(&s->pb, 3, s->channel_mode);           /* audio coding mode */
    put_bits(&s->pb, 1, s->lfe_on);                 /* LFE channel indicator */
@@ -141,11 +146,15 @@ void ff_eac3_output_frame_header(AC3EncodeContext *s)
    /* TODO: mixing metadata */
    put_bits(&s->pb, 1, 0);                         /* no info metadata */
    /* TODO: info metadata */
    if (s->num_blocks != 6)
        put_bits(&s->pb, 1, !(s->avctx->frame_number % 6)); /* converter sync flag */
    put_bits(&s->pb, 1, 0);                         /* no additional bit stream info */

    /* frame header */
    if (s->num_blocks == 6) {
    put_bits(&s->pb, 1, !s->use_frame_exp_strategy);/* exponent strategy syntax */
    put_bits(&s->pb, 1, 0);                         /* aht enabled = no */
    }
    put_bits(&s->pb, 2, 0);                         /* snr offset strategy = 1 */
    put_bits(&s->pb, 1, 0);                         /* transient pre-noise processing enabled = no */
    put_bits(&s->pb, 1, 0);                         /* block switch syntax enabled = no */
@@ -158,7 +167,7 @@ void ff_eac3_output_frame_header(AC3EncodeContext *s)
    /* coupling strategy use flags */
    if (s->channel_mode > AC3_CHMODE_MONO) {
        put_bits(&s->pb, 1, s->blocks[0].cpl_in_use);
        for (blk = 1; blk < AC3_MAX_BLOCKS; blk++) {
        for (blk = 1; blk < s->num_blocks; blk++) {
            AC3Block *block = &s->blocks[blk];
            put_bits(&s->pb, 1, block->new_cpl_strategy);
            if (block->new_cpl_strategy)
@@ -170,26 +179,31 @@ void ff_eac3_output_frame_header(AC3EncodeContext *s)
        for (ch = !s->cpl_on; ch <= s->fbw_channels; ch++)
            put_bits(&s->pb, 5, s->frame_exp_strategy[ch]);
    } else {
        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
        for (blk = 0; blk < s->num_blocks; blk++)
            for (ch = !s->blocks[blk].cpl_in_use; ch <= s->fbw_channels; ch++)
                put_bits(&s->pb, 2, s->exp_strategy[ch][blk]);
    }
    if (s->lfe_on) {
        for (blk = 0; blk < AC3_MAX_BLOCKS; blk++)
        for (blk = 0; blk < s->num_blocks; blk++)
            put_bits(&s->pb, 1, s->exp_strategy[s->lfe_channel][blk]);
    }
    /* E-AC-3 to AC-3 converter exponent strategy (unfortunately not optional...) */
    /* E-AC-3 to AC-3 converter exponent strategy (not optional when num blocks == 6) */
    if (s->num_blocks != 6) {
        put_bits(&s->pb, 1, 0);
    } else {
    for (ch = 1; ch <= s->fbw_channels; ch++) {
        if (s->use_frame_exp_strategy)
            put_bits(&s->pb, 5, s->frame_exp_strategy[ch]);
        else
            put_bits(&s->pb, 5, 0);
    }
    }
    /* snr offsets */
    put_bits(&s->pb, 6, s->coarse_snr_offset);
    put_bits(&s->pb, 4, s->fine_snr_offset[1]);
    /* block start info */
    put_bits(&s->pb, 1, 0);
    if (s->num_blocks > 1)
        put_bits(&s->pb, 1, 0);
 }


--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -3707,6 +3707,7 @@ static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){

                    ff_h264dsp_init(&h->h264dsp, h->sps.bit_depth_luma);
                    ff_h264_pred_init(&h->hpc, s->codec_id, h->sps.bit_depth_luma);
                    s->dsp.dct_bits = h->sps.bit_depth_luma > 8 ? 32 : 16;
                    dsputil_init(&s->dsp, s->avctx);
                } else {
                    av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", h->sps.bit_depth_luma);
--- a/libavcodec/imgconvert.c
+++ b/libavcodec/imgconvert.c
@@ -42,9 +42,6 @@
 #include "x86/dsputil_mmx.h"
 #endif

 #define xglue(x, y) x ## y
 #define glue(x, y) xglue(x, y)

 #define FF_COLOR_RGB      0 /**< RGB color space */
 #define FF_COLOR_GRAY     1 /**< gray color space */
 #define FF_COLOR_YUV      2 /**< YUV color space. 16 <= Y <= 235, 16 <= U, V <= 240 */
--- a/libavcodec/jfdctint.c
+++ b/libavcodec/jfdctint.c
@@ -1,402 +1,25 @@
 /*
 * jfdctint.c
 *
 * This file is part of the Independent JPEG Group's software.
 *
 * The authors make NO WARRANTY or representation, either express or implied,
 * with respect to this software, its quality, accuracy, merchantability, or
 * fitness for a particular purpose.  This software is provided "AS IS", and
 * you, its user, assume the entire risk as to its quality and accuracy.
 *
 * This software is copyright (C) 1991-1996, Thomas G. Lane.
 * All Rights Reserved except as specified below.
 *
 * Permission is hereby granted to use, copy, modify, and distribute this
 * software (or portions thereof) for any purpose, without fee, subject to
 * these conditions:
 * (1) If any part of the source code for this software is distributed, then
 * this README file must be included, with this copyright and no-warranty
 * notice unaltered; and any additions, deletions, or changes to the original
 * files must be clearly indicated in accompanying documentation.
 * (2) If only executable code is distributed, then the accompanying
 * documentation must state that "this software is based in part on the work
 * of the Independent JPEG Group".
 * (3) Permission for use of this software is granted only if the user accepts
 * full responsibility for any undesirable consequences; the authors accept
 * NO LIABILITY for damages of any kind.
 *
 * These conditions apply to any software derived from or based on the IJG
 * code, not just to the unmodified library.  If you use our work, you ought
 * to acknowledge us.
 *
 * Permission is NOT granted for the use of any IJG author's name or company
 * name in advertising or publicity relating to this software or products
 * derived from it.  This software may be referred to only as "the Independent
 * JPEG Group's software".
 *
 * We specifically permit and encourage the use of this software as the basis
 * of commercial products, provided that all warranty or liability claims are
 * assumed by the product vendor.
 *
 * This file contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).
 *
 * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
 * on each column.  Direct algorithms are also available, but they are
 * much more complex and seem not to be any faster when reduced to code.
 *
 * This implementation is based on an algorithm described in
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
 * The primary algorithm described there uses 11 multiplies and 29 adds.
 * We use their alternate method with 12 multiplies and 32 adds.
 * The advantage of this method is that no data path contains more than one
 * multiplication; this allows a very simple and accurate implementation in
 * scaled fixed-point arithmetic, with a minimal number of shifts.
 */

 /**
 * @file
 * Independent JPEG Group's slow & accurate dct.
 */

 #include <stdlib.h>
 #include <stdio.h>
 #include "libavutil/common.h"
 #include "dsputil.h"

 #define DCTSIZE 8
 #define BITS_IN_JSAMPLE 8
 #define GLOBAL(x) x
 #define RIGHT_SHIFT(x, n) ((x) >> (n))
 #define MULTIPLY16C16(var,const) ((var)*(const))

 #if 1 //def USE_ACCURATE_ROUNDING
 #define DESCALE(x,n)  RIGHT_SHIFT((x) + (1 << ((n) - 1)), n)
 #else
 #define DESCALE(x,n)  RIGHT_SHIFT(x, n)
 #endif


 /*
 * This module is specialized to the case DCTSIZE = 8.
 */

 #if DCTSIZE != 8
  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
 #endif


 /*
 * The poop on this scaling stuff is as follows:
 *
 * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
 * larger than the true DCT outputs.  The final outputs are therefore
 * a factor of N larger than desired; since N=8 this can be cured by
 * a simple right shift at the end of the algorithm.  The advantage of
 * this arrangement is that we save two multiplications per 1-D DCT,
 * because the y0 and y4 outputs need not be divided by sqrt(N).
 * In the IJG code, this factor of 8 is removed by the quantization step
 * (in jcdctmgr.c), NOT in this module.
 * This file is part of Libav.
 *
 * We have to do addition and subtraction of the integer inputs, which
 * is no problem, and multiplication by fractional constants, which is
 * a problem to do in integer arithmetic.  We multiply all the constants
 * by CONST_SCALE and convert them to integer constants (thus retaining
 * CONST_BITS bits of precision in the constants).  After doing a
 * multiplication we have to divide the product by CONST_SCALE, with proper
 * rounding, to produce the correct output.  This division can be done
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
 * as long as possible so that partial sums can be added together with
 * full fractional precision.
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
 * they are represented to better-than-integral precision.  These outputs
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
 * with the recommended scaling.  (For 12-bit sample data, the intermediate
 * array is int32_t anyway.)
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
 * shows that the values given below are the most effective.
 */

 #if BITS_IN_JSAMPLE == 8
 #define CONST_BITS  13
 #define PASS1_BITS  4   /* set this to 2 if 16x16 multiplies are faster */
 #else
 #define CONST_BITS  13
 #define PASS1_BITS  1   /* lose a little precision to avoid overflow */
 #endif

 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
 * causing a lot of useless floating-point operations at run time.
 * To get around this we use the following pre-calculated constants.
 * If you change CONST_BITS you may want to add appropriate values.
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 */

 #if CONST_BITS == 13
 #define FIX_0_298631336  ((int32_t)  2446)      /* FIX(0.298631336) */
 #define FIX_0_390180644  ((int32_t)  3196)      /* FIX(0.390180644) */
 #define FIX_0_541196100  ((int32_t)  4433)      /* FIX(0.541196100) */
 #define FIX_0_765366865  ((int32_t)  6270)      /* FIX(0.765366865) */
 #define FIX_0_899976223  ((int32_t)  7373)      /* FIX(0.899976223) */
 #define FIX_1_175875602  ((int32_t)  9633)      /* FIX(1.175875602) */
 #define FIX_1_501321110  ((int32_t)  12299)     /* FIX(1.501321110) */
 #define FIX_1_847759065  ((int32_t)  15137)     /* FIX(1.847759065) */
 #define FIX_1_961570560  ((int32_t)  16069)     /* FIX(1.961570560) */
 #define FIX_2_053119869  ((int32_t)  16819)     /* FIX(2.053119869) */
 #define FIX_2_562915447  ((int32_t)  20995)     /* FIX(2.562915447) */
 #define FIX_3_072711026  ((int32_t)  25172)     /* FIX(3.072711026) */
 #else
 #define FIX_0_298631336  FIX(0.298631336)
 #define FIX_0_390180644  FIX(0.390180644)
 #define FIX_0_541196100  FIX(0.541196100)
 #define FIX_0_765366865  FIX(0.765366865)
 #define FIX_0_899976223  FIX(0.899976223)
 #define FIX_1_175875602  FIX(1.175875602)
 #define FIX_1_501321110  FIX(1.501321110)
 #define FIX_1_847759065  FIX(1.847759065)
 #define FIX_1_961570560  FIX(1.961570560)
 #define FIX_2_053119869  FIX(2.053119869)
 #define FIX_2_562915447  FIX(2.562915447)
 #define FIX_3_072711026  FIX(3.072711026)
 #endif


 /* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
 * For 8-bit samples with the recommended scaling, all the variable
 * and constant values involved are no more than 16 bits wide, so a
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 * For 12-bit samples, a full 32-bit multiplication will be needed.
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #if BITS_IN_JSAMPLE == 8 && CONST_BITS<=13 && PASS1_BITS<=2
 #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
 #else
 #define MULTIPLY(var,const)  ((var) * (const))
 #endif


 static av_always_inline void row_fdct(DCTELEM * data){
  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int tmp10, tmp11, tmp12, tmp13;
  int z1, z2, z3, z4, z5;
  DCTELEM *dataptr;
  int ctr;

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    tmp0 = dataptr[0] + dataptr[7];
    tmp7 = dataptr[0] - dataptr[7];
    tmp1 = dataptr[1] + dataptr[6];
    tmp6 = dataptr[1] - dataptr[6];
    tmp2 = dataptr[2] + dataptr[5];
    tmp5 = dataptr[2] - dataptr[5];
    tmp3 = dataptr[3] + dataptr[4];
    tmp4 = dataptr[3] - dataptr[4];

    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

    dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                                   CONST_BITS-PASS1_BITS);
    dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                                   CONST_BITS-PASS1_BITS);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * cK represents cos(K*pi/16).
     * i0..i3 in the paper are tmp4..tmp7 here.
     */

    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */

    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */

    z3 += z5;
    z4 += z5;

    dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
    dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);

    dataptr += DCTSIZE;         /* advance pointer to next row */
  }
 }

 /*
 * Perform the forward DCT on one block of samples.
 */

 GLOBAL(void)
 ff_jpeg_fdct_islow (DCTELEM * data)
 {
  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int tmp10, tmp11, tmp12, tmp13;
  int z1, z2, z3, z4, z5;
  DCTELEM *dataptr;
  int ctr;

  row_fdct(data);

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];

    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                                           CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                                           CONST_BITS+PASS1_BITS);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * cK represents cos(K*pi/16).
     * i0..i3 in the paper are tmp4..tmp7 here.
     */

    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */

    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */

    z3 += z5;
    z4 += z5;

    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
                                           CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
                                           CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
                                           CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
                                           CONST_BITS+PASS1_BITS);

    dataptr++;                  /* advance pointer to next column */
  }
 }

 /*
 * The secret of DCT2-4-8 is really simple -- you do the usual 1-DCT
 * on the rows and then, instead of doing even and odd, part on the colums
 * you do even part two times.
 */
 GLOBAL(void)
 ff_fdct248_islow (DCTELEM * data)
 {
  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int tmp10, tmp11, tmp12, tmp13;
  int z1;
  DCTELEM *dataptr;
  int ctr;

  row_fdct(data);

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*1];
     tmp1 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
     tmp2 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
     tmp3 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
     tmp4 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*1];
     tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
     tmp6 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
     tmp7 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];

     tmp10 = tmp0 + tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
     tmp13 = tmp0 - tmp3;

     dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
     dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);

     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
     dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                                            CONST_BITS+PASS1_BITS);
     dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                                            CONST_BITS+PASS1_BITS);

     tmp10 = tmp4 + tmp7;
     tmp11 = tmp5 + tmp6;
     tmp12 = tmp5 - tmp6;
     tmp13 = tmp4 - tmp7;

     dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
     dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);

     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
     dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                                            CONST_BITS+PASS1_BITS);
     dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                                            CONST_BITS+PASS1_BITS);
 #define BIT_DEPTH 8
 #include "jfdctint_template.c"
 #undef BIT_DEPTH

     dataptr++;                 /* advance pointer to next column */
  }
 }
 #define BIT_DEPTH 10
 #include "jfdctint_template.c"
 #undef BIT_DEPTH
--- a/libavcodec/jfdctint_template.c
+++ b/libavcodec/jfdctint_template.c
@@ -0,0 +1,405 @@
 /*
 * jfdctint.c
 *
 * This file is part of the Independent JPEG Group's software.
 *
 * The authors make NO WARRANTY or representation, either express or implied,
 * with respect to this software, its quality, accuracy, merchantability, or
 * fitness for a particular purpose.  This software is provided "AS IS", and
 * you, its user, assume the entire risk as to its quality and accuracy.
 *
 * This software is copyright (C) 1991-1996, Thomas G. Lane.
 * All Rights Reserved except as specified below.
 *
 * Permission is hereby granted to use, copy, modify, and distribute this
 * software (or portions thereof) for any purpose, without fee, subject to
 * these conditions:
 * (1) If any part of the source code for this software is distributed, then
 * this README file must be included, with this copyright and no-warranty
 * notice unaltered; and any additions, deletions, or changes to the original
 * files must be clearly indicated in accompanying documentation.
 * (2) If only executable code is distributed, then the accompanying
 * documentation must state that "this software is based in part on the work
 * of the Independent JPEG Group".
 * (3) Permission for use of this software is granted only if the user accepts
 * full responsibility for any undesirable consequences; the authors accept
 * NO LIABILITY for damages of any kind.
 *
 * These conditions apply to any software derived from or based on the IJG
 * code, not just to the unmodified library.  If you use our work, you ought
 * to acknowledge us.
 *
 * Permission is NOT granted for the use of any IJG author's name or company
 * name in advertising or publicity relating to this software or products
 * derived from it.  This software may be referred to only as "the Independent
 * JPEG Group's software".
 *
 * We specifically permit and encourage the use of this software as the basis
 * of commercial products, provided that all warranty or liability claims are
 * assumed by the product vendor.
 *
 * This file contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).
 *
 * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
 * on each column.  Direct algorithms are also available, but they are
 * much more complex and seem not to be any faster when reduced to code.
 *
 * This implementation is based on an algorithm described in
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
 * The primary algorithm described there uses 11 multiplies and 29 adds.
 * We use their alternate method with 12 multiplies and 32 adds.
 * The advantage of this method is that no data path contains more than one
 * multiplication; this allows a very simple and accurate implementation in
 * scaled fixed-point arithmetic, with a minimal number of shifts.
 */

 /**
 * @file
 * Independent JPEG Group's slow & accurate dct.
 */

 #include "libavutil/common.h"
 #include "dsputil.h"

 #include "bit_depth_template.c"

 #define DCTSIZE 8
 #define BITS_IN_JSAMPLE BIT_DEPTH
 #define GLOBAL(x) x
 #define RIGHT_SHIFT(x, n) ((x) >> (n))
 #define MULTIPLY16C16(var,const) ((var)*(const))

 #if 1 //def USE_ACCURATE_ROUNDING
 #define DESCALE(x,n)  RIGHT_SHIFT((x) + (1 << ((n) - 1)), n)
 #else
 #define DESCALE(x,n)  RIGHT_SHIFT(x, n)
 #endif


 /*
 * This module is specialized to the case DCTSIZE = 8.
 */

 #if DCTSIZE != 8
 #error  "Sorry, this code only copes with 8x8 DCTs."
 #endif


 /*
 * The poop on this scaling stuff is as follows:
 *
 * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
 * larger than the true DCT outputs.  The final outputs are therefore
 * a factor of N larger than desired; since N=8 this can be cured by
 * a simple right shift at the end of the algorithm.  The advantage of
 * this arrangement is that we save two multiplications per 1-D DCT,
 * because the y0 and y4 outputs need not be divided by sqrt(N).
 * In the IJG code, this factor of 8 is removed by the quantization step
 * (in jcdctmgr.c), NOT in this module.
 *
 * We have to do addition and subtraction of the integer inputs, which
 * is no problem, and multiplication by fractional constants, which is
 * a problem to do in integer arithmetic.  We multiply all the constants
 * by CONST_SCALE and convert them to integer constants (thus retaining
 * CONST_BITS bits of precision in the constants).  After doing a
 * multiplication we have to divide the product by CONST_SCALE, with proper
 * rounding, to produce the correct output.  This division can be done
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
 * as long as possible so that partial sums can be added together with
 * full fractional precision.
 *
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
 * they are represented to better-than-integral precision.  These outputs
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
 * with the recommended scaling.  (For 12-bit sample data, the intermediate
 * array is int32_t anyway.)
 *
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
 * shows that the values given below are the most effective.
 */

 #undef CONST_BITS
 #undef PASS1_BITS
 #undef OUT_SHIFT

 #if BITS_IN_JSAMPLE == 8
 #define CONST_BITS  13
 #define PASS1_BITS  4   /* set this to 2 if 16x16 multiplies are faster */
 #define OUT_SHIFT   PASS1_BITS
 #else
 #define CONST_BITS  13
 #define PASS1_BITS  1   /* lose a little precision to avoid overflow */
 #define OUT_SHIFT   (PASS1_BITS + 1)
 #endif

 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
 * causing a lot of useless floating-point operations at run time.
 * To get around this we use the following pre-calculated constants.
 * If you change CONST_BITS you may want to add appropriate values.
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 */

 #if CONST_BITS == 13
 #define FIX_0_298631336  ((int32_t)  2446)      /* FIX(0.298631336) */
 #define FIX_0_390180644  ((int32_t)  3196)      /* FIX(0.390180644) */
 #define FIX_0_541196100  ((int32_t)  4433)      /* FIX(0.541196100) */
 #define FIX_0_765366865  ((int32_t)  6270)      /* FIX(0.765366865) */
 #define FIX_0_899976223  ((int32_t)  7373)      /* FIX(0.899976223) */
 #define FIX_1_175875602  ((int32_t)  9633)      /* FIX(1.175875602) */
 #define FIX_1_501321110  ((int32_t)  12299)     /* FIX(1.501321110) */
 #define FIX_1_847759065  ((int32_t)  15137)     /* FIX(1.847759065) */
 #define FIX_1_961570560  ((int32_t)  16069)     /* FIX(1.961570560) */
 #define FIX_2_053119869  ((int32_t)  16819)     /* FIX(2.053119869) */
 #define FIX_2_562915447  ((int32_t)  20995)     /* FIX(2.562915447) */
 #define FIX_3_072711026  ((int32_t)  25172)     /* FIX(3.072711026) */
 #else
 #define FIX_0_298631336  FIX(0.298631336)
 #define FIX_0_390180644  FIX(0.390180644)
 #define FIX_0_541196100  FIX(0.541196100)
 #define FIX_0_765366865  FIX(0.765366865)
 #define FIX_0_899976223  FIX(0.899976223)
 #define FIX_1_175875602  FIX(1.175875602)
 #define FIX_1_501321110  FIX(1.501321110)
 #define FIX_1_847759065  FIX(1.847759065)
 #define FIX_1_961570560  FIX(1.961570560)
 #define FIX_2_053119869  FIX(2.053119869)
 #define FIX_2_562915447  FIX(2.562915447)
 #define FIX_3_072711026  FIX(3.072711026)
 #endif


 /* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
 * For 8-bit samples with the recommended scaling, all the variable
 * and constant values involved are no more than 16 bits wide, so a
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 * For 12-bit samples, a full 32-bit multiplication will be needed.
 */

 #if BITS_IN_JSAMPLE == 8 && CONST_BITS<=13 && PASS1_BITS<=2
 #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
 #else
 #define MULTIPLY(var,const)  ((var) * (const))
 #endif


 static av_always_inline void FUNC(row_fdct)(DCTELEM *data)
 {
  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int tmp10, tmp11, tmp12, tmp13;
  int z1, z2, z3, z4, z5;
  DCTELEM *dataptr;
  int ctr;

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    tmp0 = dataptr[0] + dataptr[7];
    tmp7 = dataptr[0] - dataptr[7];
    tmp1 = dataptr[1] + dataptr[6];
    tmp6 = dataptr[1] - dataptr[6];
    tmp2 = dataptr[2] + dataptr[5];
    tmp5 = dataptr[2] - dataptr[5];
    tmp3 = dataptr[3] + dataptr[4];
    tmp4 = dataptr[3] - dataptr[4];

    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

    dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                                   CONST_BITS-PASS1_BITS);
    dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                                   CONST_BITS-PASS1_BITS);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * cK represents cos(K*pi/16).
     * i0..i3 in the paper are tmp4..tmp7 here.
     */

    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */

    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */

    z3 += z5;
    z4 += z5;

    dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
    dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);

    dataptr += DCTSIZE;         /* advance pointer to next row */
  }
 }

 /*
 * Perform the forward DCT on one block of samples.
 */

 GLOBAL(void)
 FUNC(ff_jpeg_fdct_islow)(DCTELEM *data)
 {
  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int tmp10, tmp11, tmp12, tmp13;
  int z1, z2, z3, z4, z5;
  DCTELEM *dataptr;
  int ctr;

  FUNC(row_fdct)(data);

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];

    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

    dataptr[DCTSIZE*0] = DESCALE(tmp10 + tmp11, OUT_SHIFT);
    dataptr[DCTSIZE*4] = DESCALE(tmp10 - tmp11, OUT_SHIFT);

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    dataptr[DCTSIZE*2] = DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                                 CONST_BITS + OUT_SHIFT);
    dataptr[DCTSIZE*6] = DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                                 CONST_BITS + OUT_SHIFT);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * cK represents cos(K*pi/16).
     * i0..i3 in the paper are tmp4..tmp7 here.
     */

    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */

    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */

    z3 += z5;
    z4 += z5;

    dataptr[DCTSIZE*7] = DESCALE(tmp4 + z1 + z3, CONST_BITS + OUT_SHIFT);
    dataptr[DCTSIZE*5] = DESCALE(tmp5 + z2 + z4, CONST_BITS + OUT_SHIFT);
    dataptr[DCTSIZE*3] = DESCALE(tmp6 + z2 + z3, CONST_BITS + OUT_SHIFT);
    dataptr[DCTSIZE*1] = DESCALE(tmp7 + z1 + z4, CONST_BITS + OUT_SHIFT);

    dataptr++;                  /* advance pointer to next column */
  }
 }

 /*
 * The secret of DCT2-4-8 is really simple -- you do the usual 1-DCT
 * on the rows and then, instead of doing even and odd, part on the colums
 * you do even part two times.
 */
 GLOBAL(void)
 FUNC(ff_fdct248_islow)(DCTELEM *data)
 {
  int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int tmp10, tmp11, tmp12, tmp13;
  int z1;
  DCTELEM *dataptr;
  int ctr;

  FUNC(row_fdct)(data);

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
     tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*1];
     tmp1 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*3];
     tmp2 = dataptr[DCTSIZE*4] + dataptr[DCTSIZE*5];
     tmp3 = dataptr[DCTSIZE*6] + dataptr[DCTSIZE*7];
     tmp4 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*1];
     tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*3];
     tmp6 = dataptr[DCTSIZE*4] - dataptr[DCTSIZE*5];
     tmp7 = dataptr[DCTSIZE*6] - dataptr[DCTSIZE*7];

     tmp10 = tmp0 + tmp3;
     tmp11 = tmp1 + tmp2;
     tmp12 = tmp1 - tmp2;
     tmp13 = tmp0 - tmp3;

     dataptr[DCTSIZE*0] = DESCALE(tmp10 + tmp11, OUT_SHIFT);
     dataptr[DCTSIZE*4] = DESCALE(tmp10 - tmp11, OUT_SHIFT);

     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
     dataptr[DCTSIZE*2] = DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                                  CONST_BITS+OUT_SHIFT);
     dataptr[DCTSIZE*6] = DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                                  CONST_BITS+OUT_SHIFT);

     tmp10 = tmp4 + tmp7;
     tmp11 = tmp5 + tmp6;
     tmp12 = tmp5 - tmp6;
     tmp13 = tmp4 - tmp7;

     dataptr[DCTSIZE*1] = DESCALE(tmp10 + tmp11, OUT_SHIFT);
     dataptr[DCTSIZE*5] = DESCALE(tmp10 - tmp11, OUT_SHIFT);

     z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
     dataptr[DCTSIZE*3] = DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                                  CONST_BITS + OUT_SHIFT);
     dataptr[DCTSIZE*7] = DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                                  CONST_BITS + OUT_SHIFT);

     dataptr++;                 /* advance pointer to next column */
  }
 }
--- a/libavcodec/mlib/dsputil_mlib.c
+++ b/libavcodec/mlib/dsputil_mlib.c
@@ -421,13 +421,14 @@ static void ff_fdct_mlib(DCTELEM *data)

 void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx)
 {
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    c->get_pixels  = get_pixels_mlib;
    c->diff_pixels = diff_pixels_mlib;
    c->add_pixels_clamped = add_pixels_clamped_mlib;

    if (!high_bit_depth) {
    c->get_pixels  = get_pixels_mlib;

    c->put_pixels_tab[0][0] = put_pixels16_mlib;
    c->put_pixels_tab[0][1] = put_pixels16_x2_mlib;
    c->put_pixels_tab[0][2] = put_pixels16_y2_mlib;
--- a/libavcodec/mpeg4videodec.c
+++ b/libavcodec/mpeg4videodec.c
@@ -1527,6 +1527,22 @@ static int mpeg4_decode_gop_header(MpegEncContext * s, GetBitContext *gb){
    return 0;
 }

 static int mpeg4_decode_profile_level(MpegEncContext * s, GetBitContext *gb){
  int profile_and_level_indication;

  profile_and_level_indication = get_bits(gb, 8);

  s->avctx->profile = (profile_and_level_indication & 0xf0) >> 4;
  s->avctx->level   = (profile_and_level_indication & 0x0f);

  // for Simple profile, level 0
  if (s->avctx->profile == 0 && s->avctx->level == 8) {
      s->avctx->level = 0;
  }

  return 0;
 }

 static int decode_vol_header(MpegEncContext *s, GetBitContext *gb){
    int width, height, vo_ver_id;

@@ -2181,6 +2197,9 @@ int ff_mpeg4_decode_picture_header(MpegEncContext * s, GetBitContext *gb)
        else if(startcode == GOP_STARTCODE){
            mpeg4_decode_gop_header(s, gb);
        }
        else if(startcode == VOS_STARTCODE){
            mpeg4_decode_profile_level(s, gb);
        }
        else if(startcode == VOP_STARTCODE){
            break;
        }
@@ -2241,6 +2260,25 @@ static av_cold int decode_init(AVCodecContext *avctx)
    return 0;
 }

 static const AVProfile mpeg4_video_profiles[] = {
    { FF_PROFILE_MPEG4_SIMPLE,                    "Simple Profile" },
    { FF_PROFILE_MPEG4_SIMPLE_SCALABLE,           "Simple Scalable Profile" },
    { FF_PROFILE_MPEG4_CORE,                      "Core Profile" },
    { FF_PROFILE_MPEG4_MAIN,                      "Main Profile" },
    { FF_PROFILE_MPEG4_N_BIT,                     "N-bit Profile" },
    { FF_PROFILE_MPEG4_SCALABLE_TEXTURE,          "Scalable Texture Profile" },
    { FF_PROFILE_MPEG4_SIMPLE_FACE_ANIMATION,     "Simple Face Animation Profile" },
    { FF_PROFILE_MPEG4_BASIC_ANIMATED_TEXTURE,    "Basic Animated Texture Profile" },
    { FF_PROFILE_MPEG4_HYBRID,                    "Hybrid Profile" },
    { FF_PROFILE_MPEG4_ADVANCED_REAL_TIME,        "Advanced Real Time Simple Profile" },
    { FF_PROFILE_MPEG4_CORE_SCALABLE,             "Code Scalable Profile" },
    { FF_PROFILE_MPEG4_ADVANCED_CODING,           "Advanced Coding Profile" },
    { FF_PROFILE_MPEG4_ADVANCED_CORE,             "Advanced Core Profile" },
    { FF_PROFILE_MPEG4_ADVANCED_SCALABLE_TEXTURE, "Advanced Scalable Texture Profile" },
    { FF_PROFILE_MPEG4_SIMPLE_STUDIO,             "Simple Studio Profile" },
    { FF_PROFILE_MPEG4_ADVANCED_SIMPLE,           "Advanced Simple Profile" },
 };

 AVCodec ff_mpeg4_decoder = {
    "mpeg4",
    AVMEDIA_TYPE_VIDEO,
@@ -2255,6 +2293,7 @@ AVCodec ff_mpeg4_decoder = {
    .max_lowres= 3,
    .long_name= NULL_IF_CONFIG_SMALL("MPEG-4 part 2"),
    .pix_fmts= ff_hwaccel_pixfmt_list_420,
    .profiles = NULL_IF_CONFIG_SMALL(mpeg4_video_profiles),
    .update_thread_context= ONLY_IF_THREADS_ENABLED(ff_mpeg_update_thread_context)
 };

--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -69,7 +69,8 @@ void ff_convert_matrix(DSPContext *dsp, int (*qmat)[64], uint16_t (*qmat16)[2][6

    for(qscale=qmin; qscale<=qmax; qscale++){
        int i;
        if (dsp->fdct == ff_jpeg_fdct_islow
        if (dsp->fdct == ff_jpeg_fdct_islow_8 ||
            dsp->fdct == ff_jpeg_fdct_islow_10
 #ifdef FAAN_POSTSCALE
            || dsp->fdct == ff_faandct
 #endif
--- a/libavcodec/ppc/dsputil_altivec.c
+++ b/libavcodec/ppc/dsputil_altivec.c
@@ -1373,7 +1373,7 @@ static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int l

 void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
 {
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
@@ -1387,11 +1387,10 @@ void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
    c->sse[0]= sse16_altivec;
    c->pix_sum = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels = get_pixels_altivec;
    if (!high_bit_depth)
    c->clear_block = clear_block_altivec;
    c->add_bytes= add_bytes_altivec;
    if (!high_bit_depth) {
    c->get_pixels = get_pixels_altivec;
    c->clear_block = clear_block_altivec;
    c->put_pixels_tab[0][0] = put_pixels16_altivec;
    /* the two functions do the same thing, so use the same code */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
--- a/libavcodec/ppc/dsputil_ppc.c
+++ b/libavcodec/ppc/dsputil_ppc.c
@@ -145,7 +145,7 @@ static void prefetch_ppc(void *mem, int stride, int h)

 void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
 {
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    // Common optimizations whether AltiVec is available or not
    c->prefetch = prefetch_ppc;
@@ -172,8 +172,9 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
        c->gmc1 = gmc1_altivec;

 #if CONFIG_ENCODERS
        if (avctx->dct_algo == FF_DCT_AUTO ||
            avctx->dct_algo == FF_DCT_ALTIVEC) {
        if (avctx->bits_per_raw_sample <= 8 &&
            (avctx->dct_algo == FF_DCT_AUTO ||
             avctx->dct_algo == FF_DCT_ALTIVEC)) {
            c->fdct = fdct_altivec;
        }
 #endif //CONFIG_ENCODERS
--- a/libavcodec/ppc/h264_altivec.c
+++ b/libavcodec/ppc/h264_altivec.c
@@ -967,7 +967,7 @@ H264_WEIGHT( 8, 8)
 H264_WEIGHT( 8, 4)

 void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
    if (!high_bit_depth) {
--- a/libavcodec/ps2/dsputil_mmi.c
+++ b/libavcodec/ps2/dsputil_mmi.c
@@ -142,7 +142,7 @@ static void put_pixels16_mmi(uint8_t *block, const uint8_t *pixels, int line_siz
 void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx)
 {
    const int idct_algo= avctx->idct_algo;
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!high_bit_depth) {
    c->clear_blocks = clear_blocks_mmi;
@@ -152,9 +152,9 @@ void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx)

    c->put_pixels_tab[0][0] = put_pixels16_mmi;
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmi;
    }

    c->get_pixels = get_pixels_mmi;
    }

    if (avctx->bits_per_raw_sample <= 8 &&
        (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_PS2)) {
--- a/libavcodec/sh4/dsputil_align.c
+++ b/libavcodec/sh4/dsputil_align.c
@@ -333,7 +333,7 @@ DEFFUNC(avg,no_rnd,xy,16,OP_XY,PACK)

 void dsputil_init_align(DSPContext* c, AVCodecContext *avctx)
 {
        const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
        const int high_bit_depth = avctx->bits_per_raw_sample > 8;

        if (!high_bit_depth) {
        c->put_pixels_tab[0][0] = put_rnd_pixels16_o;
--- a/libavcodec/sh4/dsputil_sh4.c
+++ b/libavcodec/sh4/dsputil_sh4.c
@@ -92,7 +92,7 @@ static void idct_add(uint8_t *dest, int line_size, DCTELEM *block)
 void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx)
 {
        const int idct_algo= avctx->idct_algo;
        const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
        const int high_bit_depth = avctx->bits_per_raw_sample > 8;
        dsputil_init_align(c,avctx);

        if (!high_bit_depth)
--- a/libavcodec/sparc/dsputil_vis.c
+++ b/libavcodec/sparc/dsputil_vis.c
@@ -3953,7 +3953,7 @@ void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx)
 {
  /* VIS-specific optimizations */
  int accel = vis_level ();
  const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
  const int high_bit_depth = avctx->bits_per_raw_sample > 8;

  if (accel & ACCEL_SPARC_VIS) {
      if (avctx->bits_per_raw_sample <= 8 &&
--- a/libavcodec/x86/dnxhd_mmx.c
+++ b/libavcodec/x86/dnxhd_mmx.c
@@ -53,6 +53,7 @@ static void get_pixels_8x4_sym_sse2(DCTELEM *block, const uint8_t *pixels, int l
 void ff_dnxhd_init_mmx(DNXHDEncContext *ctx)
 {
    if (av_get_cpu_flags() & AV_CPU_FLAG_SSE2) {
        ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2;
        if (ctx->cid_table->bit_depth == 8)
            ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2;
    }
 }
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2341,7 +2341,7 @@ void ff_vector_clip_int32_sse41   (int32_t *dst, const int32_t *src, int32_t min
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
    int mm_flags = av_get_cpu_flags();
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth = avctx->bits_per_raw_sample;

    if (avctx->dsp_mask) {
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -1098,10 +1098,12 @@ static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int si
 void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
    int mm_flags = av_get_cpu_flags();
    int bit_depth = avctx->bits_per_raw_sample;

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int dct_algo = avctx->dct_algo;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
        if (avctx->bits_per_raw_sample <= 8 &&
            (dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) {
            if(mm_flags & AV_CPU_FLAG_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & AV_CPU_FLAG_MMX2){
@@ -1111,7 +1113,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
            }
        }

        c->get_pixels = get_pixels_mmx;
        if (bit_depth <= 8)
            c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

@@ -1158,7 +1161,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
        }

        if(mm_flags & AV_CPU_FLAG_SSE2){
            c->get_pixels = get_pixels_sse2;
            if (bit_depth <= 8)
                c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
 #if HAVE_YASM && HAVE_ALIGNED_STACK
            c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
--- a/libavformat/mov.c
+++ b/libavformat/mov.c
@@ -456,7 +456,7 @@ static int mov_read_hdlr(MOVContext *c, AVIOContext *pb, MOVAtom atom)
        st->codec->codec_type = AVMEDIA_TYPE_AUDIO;
    else if(type == MKTAG('m','1','a',' '))
        st->codec->codec_id = CODEC_ID_MP2;
    else if(type == MKTAG('s','u','b','p'))
    else if((type == MKTAG('s','u','b','p')) || (type == MKTAG('c','l','c','p')))
        st->codec->codec_type = AVMEDIA_TYPE_SUBTITLE;

    avio_rb32(pb); /* component  manufacture */
--- a/libavformat/rtpdec.c
+++ b/libavformat/rtpdec.c
@@ -218,22 +218,6 @@ static int rtp_valid_packet_in_sequence(RTPStatistics *s, uint16_t seq)
    return 1;
 }

 #if 0
 /**
 * This function is currently unused; without a valid local ntp time, I don't see how we could calculate the
 * difference between the arrival and sent timestamp.  As a result, the jitter and transit statistics values
 * never change.  I left this in in case someone else can see a way. (rdm)
 */
 static void rtcp_update_jitter(RTPStatistics *s, uint32_t sent_timestamp, uint32_t arrival_timestamp)
 {
    uint32_t transit= arrival_timestamp - sent_timestamp;
    int d;
    s->transit= transit;
    d= FFABS(transit - s->transit);
    s->jitter += d - ((s->jitter + 8)>>4);
 }
 #endif

 int rtp_check_and_send_back_rr(RTPDemuxContext *s, int count)
 {
    AVIOContext *pb;
--- a/libavformat/rtpproto.c
+++ b/libavformat/rtpproto.c
@@ -225,20 +225,6 @@ static int rtp_read(URLContext *h, uint8_t *buf, int size)
    int len, n;
    struct pollfd p[2] = {{s->rtp_fd, POLLIN, 0}, {s->rtcp_fd, POLLIN, 0}};

 #if 0
    for(;;) {
        from_len = sizeof(from);
        len = recvfrom (s->rtp_fd, buf, size, 0,
                        (struct sockaddr *)&from, &from_len);
        if (len < 0) {
            if (ff_neterrno() == AVERROR(EAGAIN) ||
                ff_neterrno() == AVERROR(EINTR))
                continue;
            return AVERROR(EIO);
        }
        break;
    }
 #else
    for(;;) {
        if (url_interrupt_cb())
            return AVERROR_EXIT;
@@ -277,7 +263,6 @@ static int rtp_read(URLContext *h, uint8_t *buf, int size)
            return AVERROR(EIO);
        }
    }
 #endif
    return len;
 }

@@ -296,14 +281,6 @@ static int rtp_write(URLContext *h, const uint8_t *buf, int size)
    }

    ret = ffurl_write(hd, buf, size);
 #if 0
    {
        struct timespec ts;
        ts.tv_sec = 0;
        ts.tv_nsec = 10 * 1000000;
        nanosleep(&ts, NULL);
    }
 #endif
    return ret;
 }

--- a/tests/codec-regression.sh
+++ b/tests/codec-regression.sh
@@ -235,6 +235,11 @@ do_video_encoding dnxhd-720p-rd.dnxhd "-threads 4 -mbd rd -s hd720 -b 90M -pix_f
 do_video_decoding "" "-s cif -pix_fmt yuv420p"
 fi

 if [ -n "$do_dnxhd_720p_10bit" ] ; then
 do_video_encoding dnxhd-720p-10bit.dnxhd "-s hd720 -b 90M -pix_fmt yuv422p10 -vframes 5 -an"
 do_video_decoding "" "-s cif -pix_fmt yuv420p"
 fi

 if [ -n "$do_svq1" ] ; then
 do_video_encoding svq1.mov "-an -vcodec svq1 -qscale 3 -pix_fmt yuv410p"
 do_video_decoding "" "-pix_fmt yuv420p"
--- a/tests/ref/vsynth1/dnxhd_720p_10bit
+++ b/tests/ref/vsynth1/dnxhd_720p_10bit
@@ -0,0 +1,4 @@
 cb29b6ae4e1562d95f9311991fef98df *./tests/data/vsynth1/dnxhd-720p-10bit.dnxhd
 2293760 ./tests/data/vsynth1/dnxhd-720p-10bit.dnxhd
 2f45bb1af7da5dd3dca870ac87237b7d *./tests/data/dnxhd_720p_10bit.vsynth1.out.yuv
 stddev:    6.27 PSNR: 32.18 MAXDIFF:   64 bytes:   760320/  7603200
--- a/tests/ref/vsynth2/dnxhd_720p_10bit
+++ b/tests/ref/vsynth2/dnxhd_720p_10bit
@@ -0,0 +1,4 @@
 8648511257afb816b5b911706ca391db *./tests/data/vsynth2/dnxhd-720p-10bit.dnxhd
 2293760 ./tests/data/vsynth2/dnxhd-720p-10bit.dnxhd
 391b6f5aa7c7b488b479cb43d420b860 *./tests/data/dnxhd_720p_10bit.vsynth2.out.yuv
 stddev:    1.35 PSNR: 45.46 MAXDIFF:   23 bytes:   760320/  7603200