Change mvd_cache & mvd_table to 8 bits; this is overall a bit faster

for high-resolution videos, about 20 cycles faster per MB for cathedral. Originally committed as revision 22038 to svn://svn.ffmpeg.org/ffmpeg/trunk
15 years ago · b5bd070029
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
@@ -756,8 +756,8 @@ int ff_h264_alloc_tables(H264Context *h){
    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail)

    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail)
    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail);
    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail);
    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint8_t), fail);
    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint8_t), fail);
    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail);
    FF_ALLOCZ_OR_GOTO(h->s.avctx, h->list_counts, big_mb_num * sizeof(uint8_t), fail)

--- a/libavcodec/h264.h
+++ b/libavcodec/h264.h
@@ -486,8 +486,8 @@ typedef struct H264Context{
    /* chroma_pred_mode for i4x4 or i16x16, else 0 */
    uint8_t     *chroma_pred_mode_table;
    int         last_qscale_diff;
    int16_t     (*mvd_table[2])[2];
    DECLARE_ALIGNED_16(int16_t, mvd_cache)[2][5*8][2];
    uint8_t     (*mvd_table[2])[2];
    DECLARE_ALIGNED_16(uint8_t, mvd_cache)[2][5*8][2];
    uint8_t     *direct_table;
    uint8_t     direct_cache[5*8];

@@ -732,6 +732,14 @@ static av_always_inline uint32_t pack16to32(int a, int b){
 #endif
 }

 /**
  * Packs the low 8 bits of two values into one 16-bit word.
  *
  * The byte order is chosen per endianness so that, when the result is
  * stored as a native uint16_t, a is the first byte in memory and b the
  * second.
  *
  * Both operands are masked to 8 bits before shifting, so the shift never
  * operates on a negative int and no bits above the 16-bit result are
  * produced.
  *
  * @param a value whose low 8 bits become the first byte in memory
  * @param b value whose low 8 bits become the second byte in memory
  * @return the two bytes combined into a native-endian 16-bit word
  */
 static av_always_inline uint16_t pack8to16(int a, int b){
 #if HAVE_BIGENDIAN
    return (b&0xFF) | ((a&0xFF)<<8);
 #else
    return (a&0xFF) | ((b&0xFF)<<8);
 #endif
 }

 /**
 * gets the chroma qp.
 */
@@ -1060,32 +1068,31 @@ static void fill_decode_caches(H264Context *h, int mb_type){
                /* XXX beurk, Load mvd */
                if(USES_LIST(top_type, list)){
                    const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                    AV_COPY128(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]);
                    AV_COPY64(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]);
                }else{
                    AV_ZERO128(h->mvd_cache[list][scan8[0] + 0 - 1*8]);
                    AV_ZERO64(h->mvd_cache[list][scan8[0] + 0 - 1*8]);
                }
                if(USES_LIST(left_type[0], list)){
                    const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                    AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]);
                    AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]);
                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]);
                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]);
                }else{
                    AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 0*8]);
                    AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 1*8]);
                    AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 0*8]);
                    AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 1*8]);
                }
                if(USES_LIST(left_type[1], list)){
                    const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                    AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]);
                    AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]);
                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]);
                    AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]);
                }else{
                    AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 2*8]);
                    AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 3*8]);
                    AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 2*8]);
                    AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 3*8]);
                }
                AV_ZERO32(h->mvd_cache [list][scan8[5 ]+1]);
                AV_ZERO32(h->mvd_cache [list][scan8[7 ]+1]);
                AV_ZERO32(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else)
                AV_ZERO32(h->mvd_cache [list][scan8[4 ]]);
                AV_ZERO32(h->mvd_cache [list][scan8[12]]);

                AV_ZERO16(h->mvd_cache [list][scan8[5 ]+1]);
                AV_ZERO16(h->mvd_cache [list][scan8[7 ]+1]);
                AV_ZERO16(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else)
                AV_ZERO16(h->mvd_cache [list][scan8[4 ]]);
                AV_ZERO16(h->mvd_cache [list][scan8[12]]);
                if(h->slice_type_nos == FF_B_TYPE){
                    fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1);

@@ -1414,13 +1421,13 @@ static inline void write_back_motion(H264Context *h, int mb_type){
            AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y);
        }
        if( CABAC ) {
            int16_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy];
            int16_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
            uint8_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy];
            uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]];
            if(IS_SKIP(mb_type))
                fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 4);
                fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 2);
            else
            for(y=0; y<4; y++){
                AV_COPY128(mvd_dst + y*b_stride, mvd_src + 8*y);
                AV_COPY64(mvd_dst + y*b_stride, mvd_src + 8*y);
            }
        }

--- a/libavcodec/h264_cabac.c
+++ b/libavcodec/h264_cabac.c
@@ -938,8 +938,9 @@ static int decode_cabac_mb_mvd( H264Context *h, int ctxbase, int amvd, int *mvda
        while( k-- ) {
            mvd += get_cabac_bypass( &h->cabac )<<k;
        }
    }
    *mvda=mvd;
        *mvda=mvd < 70 ? mvd : 70;
    }else
        *mvda=mvd;
    return get_cabac_bypass_sign( &h->cabac, -mvd );
 }

@@ -1429,7 +1430,7 @@ decode_intra_mb:
            for(i=0; i<4; i++){
                h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
                if(IS_DIRECT(h->sub_mb_type[i])){
                    fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
                    fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2);
                    continue;
                }

@@ -1441,9 +1442,8 @@ decode_intra_mb:
                        int mx, my;
                        const int index= 4*i + block_width*j;
                        int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
                        int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
                        uint8_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
                        pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);

                        DECODE_CABAC_MB_MVD( h, list, index)
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

@@ -1478,14 +1478,14 @@ decode_intra_mb:
                    }
                }else{
                    fill_rectangle(h->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4);
                    fill_rectangle(h->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 4);
                    fill_rectangle(h->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2);
                }
            }
        }
    } else if( IS_DIRECT(mb_type) ) {
        ff_h264_pred_direct_motion(h, &mb_type);
        fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
        fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
        fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2);
        fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2);
        dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
    } else {
        int list, i;
@@ -1512,7 +1512,7 @@ decode_intra_mb:
                    DECODE_CABAC_MB_MVD( h, list, 0)
                    tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                    fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mpx,mpy), 4);
                    fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2);
                    fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
                }else
                    fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
@@ -1544,10 +1544,10 @@ decode_intra_mb:
                        DECODE_CABAC_MB_MVD( h, list, 8*i)
                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);

                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mpx,mpy), 4);
                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2);
                        fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
                    }else{
                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2);
                        fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
                    }
                }
@@ -1579,10 +1579,10 @@ decode_intra_mb:
                        DECODE_CABAC_MB_MVD( h, list, 4*i)

                        tprintf(s->avctx, "final mv:%d %d\n", mx, my);
                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mpx,mpy), 4);
                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2);
                        fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
                    }else{
                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
                        fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2);
                        fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
                    }
                }