Run the loop filter per row instead of per MB. This should also make it much easier to switch to per-frame filtering, and to run the filtering in a separate thread in the future if some volunteer wants to try. Overall decoding speedup of 1.7% (single thread on Pentium Dual / Cathedral sample). This change also allows some optimizations to be tried that were not possible before.

Originally committed as revision 21270 to svn://svn.ffmpeg.org/ffmpeg/trunk
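For context, the core of the change is the new loop_filter() function added below: deblocking is no longer performed inside hl_decode_mb_internal() right after each macroblock is reconstructed, but from decode_slice() each time mb_x wraps around, i.e. once a whole macroblock row is finished. The following is a minimal, self-contained sketch of that call-order change only (plain C, not FFmpeg code; decode_mb_stub() and filter_mb_stub() are hypothetical stand-ins for hl_decode_mb() and ff_h264_filter_mb()/ff_h264_filter_mb_fast()):

/* Simplified illustration of moving deblocking from per-MB to per-row.
 * All names here are placeholders, not the real FFmpeg API. */
#include <stdio.h>

#define MB_WIDTH  4   /* toy picture size in macroblocks */
#define MB_HEIGHT 3

static void decode_mb_stub(int x, int y) { printf("decode mb %d,%d\n", x, y); }
static void filter_mb_stub(int x, int y) { printf("filter mb %d,%d\n", x, y); }

/* Old scheme: the deblocking filter ran immediately after each macroblock. */
static void decode_per_mb_filtering(void)
{
    for (int y = 0; y < MB_HEIGHT; y++)
        for (int x = 0; x < MB_WIDTH; x++) {
            decode_mb_stub(x, y);
            filter_mb_stub(x, y);
        }
}

/* New scheme: a whole row is reconstructed first, then filtered in one pass
 * (this corresponds to the new loop_filter(h) call when mb_x wraps). */
static void loop_filter_row(int y)
{
    for (int x = 0; x < MB_WIDTH; x++)
        filter_mb_stub(x, y);
}

static void decode_per_row_filtering(void)
{
    for (int y = 0; y < MB_HEIGHT; y++) {
        for (int x = 0; x < MB_WIDTH; x++)
            decode_mb_stub(x, y);
        loop_filter_row(y);
    }
}

int main(void)
{
    printf("-- old: filter per MB --\n");
    decode_per_mb_filtering();
    printf("-- new: filter per row --\n");
    decode_per_row_filtering();
    return 0;
}

Filtering one completed row at a time is what makes deferring the filter further, to per-frame filtering or a separate thread, a comparatively small follow-up step, as the commit message suggests.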
@@ -666,6 +666,7 @@ static void free_tables(H264Context *h){
av_freep(&h->non_zero_count);
av_freep(&h->slice_table_base);
h->slice_table= NULL;
av_freep(&h->list_counts);
av_freep(&h->mb2b_xy);
av_freep(&h->mb2b8_xy);
@@ -756,7 +757,7 @@ int ff_h264_alloc_tables(H264Context *h){
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t), fail)
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t), fail)
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count , big_mb_num * 32 * sizeof(uint8_t), fail)
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail)
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)
@@ -764,6 +765,7 @@ int ff_h264_alloc_tables(H264Context *h){
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail);
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail);
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail);
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->list_counts, big_mb_num * sizeof(uint8_t), fail)
memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base));
h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
@@ -945,12 +947,7 @@ int ff_h264_frame_start(H264Context *h){
static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
MpegEncContext * const s = &h->s;
int i;
int step = 1;
int offset = 1;
int uvoffset= 1;
int top_idx = 1;
int skiplast= 0;
src_y -= linesize;
src_cb -= uvlinesize;
@@ -958,8 +955,6 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
if(!simple && FRAME_MBAFF){
if(s->mb_y&1){
offset = MB_MBAFF ? 1 : 17;
uvoffset= MB_MBAFF ? 1 : 9;
if(!MB_MBAFF){
*(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize);
*(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
@@ -968,39 +963,19 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src
*(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
}
}
}else{
if(!MB_MBAFF){
h->left_border[0]= h->top_borders[0][s->mb_x][15];
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7 ];
h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
}
skiplast= 1;
}
offset =
uvoffset=
top_idx = MB_MBAFF ? 0 : 1;
}
step= MB_MBAFF ? 2 : 1;
}else if(MB_MBAFF){
top_idx = 0;
}else
return;
}
// There are two lines saved, the line above the top macroblock of a pair,
// and the line above the bottom macroblock
h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
for(i=1; i<17 - skiplast; i++){
h->left_border[offset+i*step]= src_y[15+i* linesize];
}
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize);
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
h->left_border[uvoffset+34 ]= h->top_borders[top_idx][s->mb_x][16+7];
h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
for(i=1; i<9 - skiplast; i++){
h->left_border[uvoffset+34 +i*step]= src_cb[7+i*uvlinesize];
h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
}
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
}
@@ -1013,21 +988,15 @@ static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_c
int deblock_left;
int deblock_top;
int mb_xy;
int step = 1;
int offset = 1;
int uvoffset= 1;
int top_idx = 1;
if(!simple && FRAME_MBAFF){
if(s->mb_y&1){
offset = MB_MBAFF ? 1 : 17;
uvoffset= MB_MBAFF ? 1 : 9;
if(!MB_MBAFF)
return;
}else{
offset =
uvoffset=
top_idx = MB_MBAFF ? 0 : 1;
}
step= MB_MBAFF ? 2 : 1;
}
if(h->deblocking_filter == 2) {
@@ -1049,14 +1018,10 @@ if(xchg)\
a= b;\
b= t;
if(deblock_left){
for(i = !deblock_top; i<16; i++){
XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, xchg);
}
XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, 1);
}
if(deblock_top){
if(deblock_left){
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+8), *(uint64_t*)(src_y -7), temp64, 1);
}
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
if(s->mb_x+1 < s->mb_width){
@@ -1065,15 +1030,11 @@ b= t;
}
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
if(deblock_left){
for(i = !deblock_top; i<8; i++){
XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, xchg);
XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
}
XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, 1);
XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
}
if(deblock_top){
if(deblock_left){
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+16), *(uint64_t*)(src_cb -7), temp64, 1);
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+24), *(uint64_t*)(src_cr -7), temp64, 1);
}
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
}
@@ -1103,6 +1064,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
h->list_counts[mb_xy]= h->list_count;
if (!simple && MB_FIELD) {
linesize = h->mb_linesize = s->linesize * 2;
uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
@@ -1322,7 +1285,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
if(h->cbp || IS_INTRA(mb_type))
s->dsp.clear_blocks(h->mb);
if(h->deblocking_filter) {
if(h->deblocking_filter && 0) {
backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
@@ -2174,6 +2137,70 @@ int ff_h264_get_slice_type(H264Context *h)
}
}
static void loop_filter(H264Context *h){
MpegEncContext * const s = &h->s;
uint8_t *dest_y, *dest_cb, *dest_cr;
int linesize, uvlinesize, mb_x, mb_y;
const int end_mb_y= s->mb_y + FRAME_MBAFF;
const int old_slice_type= h->slice_type;
if(h->deblocking_filter) {
for(mb_x= 0; mb_x<s->mb_width; mb_x++){
for(mb_y=end_mb_y - FRAME_MBAFF; mb_y<= end_mb_y; mb_y++){
int list, mb_xy, mb_type, is_complex;
mb_xy = h->mb_xy = mb_x + mb_y*s->mb_stride;
h->slice_num= h->slice_table[mb_xy];
mb_type= s->current_picture.mb_type[mb_xy];
h->list_count= h->list_counts[mb_xy];
if(h->list_count==2){
h->slice_type= h->slice_type_nos= FF_B_TYPE;
}else if(h->list_count==1){
h->slice_type= h->slice_type_nos= FF_P_TYPE;
}else
h->slice_type= h->slice_type_nos= FF_I_TYPE;
if(FRAME_MBAFF)
h->mb_mbaff = h->mb_field_decoding_flag = !!IS_INTERLACED(mb_type);
is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0; //FIXME qscale might be wrong
s->mb_x= mb_x;
s->mb_y= mb_y;
dest_y = s->current_picture.data[0] + (mb_x + mb_y * s->linesize ) * 16;
dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
//FIXME simplify above
if (MB_FIELD) {
linesize = h->mb_linesize = s->linesize * 2;
uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
if(mb_y&1){ //FIXME move out of this function?
dest_y -= s->linesize*15;
dest_cb-= s->uvlinesize*7;
dest_cr-= s->uvlinesize*7;
}
} else {
linesize = h->mb_linesize = s->linesize;
uvlinesize = h->mb_uvlinesize = s->uvlinesize;
}
backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, !is_complex);
fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
if (is_complex && FRAME_MBAFF) {
ff_h264_filter_mb (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
} else {
ff_h264_filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
}
}
}
}
h->slice_type= old_slice_type;
s->mb_x= 0;
s->mb_y= end_mb_y - FRAME_MBAFF;
}
static int decode_slice(struct AVCodecContext *avctx, void *arg){
H264Context *h = *(void**)arg;
MpegEncContext * const s = &h->s;
@@ -2222,6 +2249,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg){
if( ++s->mb_x >= s->mb_width ) {
s->mb_x = 0;
loop_filter(h);
ff_draw_horiz_band(s, 16*s->mb_y, 16);
++s->mb_y;
if(FIELD_OR_MBAFF_PICTURE) {
@@ -2259,6 +2287,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg){
if(++s->mb_x >= s->mb_width){
s->mb_x=0;
loop_filter(h);
ff_draw_horiz_band(s, 16*s->mb_y, 16);
++s->mb_y;
if(FIELD_OR_MBAFF_PICTURE) {
@@ -300,7 +300,7 @@ typedef struct H264Context{
* is 64 if not available.
*/
DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
uint8_t (*non_zero_count)[16];
uint8_t (*non_zero_count)[32];
/**
* Motion vector cache.
@@ -423,6 +423,7 @@ typedef struct H264Context{
*/
unsigned int ref_count[2]; ///< counts frames or fields, depending on current mb mode
unsigned int list_count;
uint8_t *list_counts; ///< Array of list_count per MB specifying the slice type
Picture *short_ref[32];
Picture *long_ref[32];
Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture
@@ -736,8 +737,8 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
//FIXME deblocking could skip the intra and nnz parts.
if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
return;
// if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
// return;
/* Wow, what a mess, why didn't they simplify the interlacing & intra
* stuff, I can't imagine that these complex rules are worth it. */
@@ -793,20 +794,33 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){
left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
if(MB_MBAFF && !IS_INTRA(mb_type)){
if(!IS_INTRA(mb_type)){
int list;
for(list=0; list<h->list_count; list++){
//These values were changed for ease of performing MC, we need to change them back
//FIXME maybe we can make MC and loop filter use the same values or prevent
//the MC code from changing ref_cache and rather use a temporary array.
if(USES_LIST(mb_type,list)){
int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
int8_t *ref;
int y, b_xy;
if(!USES_LIST(mb_type, list)){
fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4);
*(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
*(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
ref += h->b8_stride;
*(uint32_t*)&h->ref_cache[list][scan8[ 2]] =
*(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
*(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
*(uint32_t*)&h->ref_cache[list][scan8[10]] = ((LIST_NOT_USED)&0xFF)*0x01010101;
continue;
}
ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
*(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
*(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
ref += h->b8_stride;
*(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
*(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
for(y=0; y<4; y++){
*(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride];
*(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride];
}
}
}
}else{
@@ -1196,6 +1210,23 @@ static inline void write_back_non_zero_count(H264Context *h){
h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
//FIXME sort better how things are stored in non_zero_count
h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1];
h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2];
h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3];
h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1];
h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2];
h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3];
h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1];
h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2];
h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3];
h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1];
h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4];
}
static inline void write_back_motion(H264Context *h, int mb_type){
@@ -1271,7 +1302,7 @@ static void decode_mb_skip(H264Context *h){
const int mb_xy= h->mb_xy;
int mb_type=0;
memset(h->non_zero_count[mb_xy], 0, 16);
memset(h->non_zero_count[mb_xy], 0, 32);
memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
if(MB_FIELD)
@@ -1392,7 +1392,7 @@ decode_intra_mb:
// In deblocking, the quantizer is 0
s->current_picture.qscale_table[mb_xy]= 0;
// All coeffs are present
memset(h->non_zero_count[mb_xy], 16, 16);
memset(h->non_zero_count[mb_xy], 16, 32);
s->current_picture.mb_type[mb_xy]= mb_type;
h->last_qscale_diff = 0;
return 0;
@@ -620,7 +620,7 @@ decode_intra_mb:
// In deblocking, the quantizer is 0
s->current_picture.qscale_table[mb_xy]= 0;
// All coeffs are present
memset(h->non_zero_count[mb_xy], 16, 16);
memset(h->non_zero_count[mb_xy], 16, 32);
s->current_picture.mb_type[mb_xy]= mb_type;
return 0;
@@ -620,7 +620,7 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u
// Do not use s->qscale as luma quantizer because it has not the same
// value in IPCM macroblocks.
qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
//tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
//tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]);
tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
//{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
if( dir == 0 ) {
@@ -650,6 +650,7 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
int first_vertical_edge_done = 0;
av_unused int dir;
int list;
//for sufficiently low qp, filtering wouldn't do anything
//this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
@@ -663,6 +664,35 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
}
}
h->non_zero_count_cache[7+8*1]=h->non_zero_count[mb_xy][0];
h->non_zero_count_cache[7+8*2]=h->non_zero_count[mb_xy][1];
h->non_zero_count_cache[7+8*3]=h->non_zero_count[mb_xy][2];
h->non_zero_count_cache[7+8*4]=h->non_zero_count[mb_xy][3];
h->non_zero_count_cache[4+8*4]=h->non_zero_count[mb_xy][4];
h->non_zero_count_cache[5+8*4]=h->non_zero_count[mb_xy][5];
h->non_zero_count_cache[6+8*4]=h->non_zero_count[mb_xy][6];
h->non_zero_count_cache[1+8*2]=h->non_zero_count[mb_xy][9];
h->non_zero_count_cache[2+8*2]=h->non_zero_count[mb_xy][8];
h->non_zero_count_cache[2+8*1]=h->non_zero_count[mb_xy][7];
h->non_zero_count_cache[1+8*5]=h->non_zero_count[mb_xy][12];
h->non_zero_count_cache[2+8*5]=h->non_zero_count[mb_xy][11];
h->non_zero_count_cache[2+8*4]=h->non_zero_count[mb_xy][10];
h->non_zero_count_cache[6+8*1]=h->non_zero_count[mb_xy][13];
h->non_zero_count_cache[6+8*2]=h->non_zero_count[mb_xy][14];
h->non_zero_count_cache[6+8*3]=h->non_zero_count[mb_xy][15];
h->non_zero_count_cache[5+8*1]=h->non_zero_count[mb_xy][16];
h->non_zero_count_cache[5+8*2]=h->non_zero_count[mb_xy][17];
h->non_zero_count_cache[5+8*3]=h->non_zero_count[mb_xy][18];
h->non_zero_count_cache[4+8*1]=h->non_zero_count[mb_xy][19];
h->non_zero_count_cache[4+8*2]=h->non_zero_count[mb_xy][20];
h->non_zero_count_cache[4+8*3]=h->non_zero_count[mb_xy][21];
h->non_zero_count_cache[1+8*1]=h->non_zero_count[mb_xy][22];
h->non_zero_count_cache[1+8*4]=h->non_zero_count[mb_xy][23];
// CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
if(!h->pps.cabac && h->pps.transform_8x8_mode){
int top_type, left_type[2];
@@ -687,16 +717,16 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint
if(IS_8x8DCT(mb_type)){
h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]=
h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp & 1;
h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp_table[mb_xy] & 1;
h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2;
h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4;
h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8;
}
}