for high resolution videos. About 20 cycles faster per MB for cathedral. Originally committed as revision 22038 to svn://svn.ffmpeg.org/ffmpeg/trunk tags/v0.6
| @@ -756,8 +756,8 @@ int ff_h264_alloc_tables(H264Context *h){ | |||||
| FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail) | FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail) | ||||
| FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail) | FF_ALLOCZ_OR_GOTO(h->s.avctx, h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t), fail) | ||||
| FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail); | |||||
| FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail); | |||||
| FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint8_t), fail); | |||||
| FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint8_t), fail); | |||||
| FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail); | FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail); | ||||
| FF_ALLOCZ_OR_GOTO(h->s.avctx, h->list_counts, big_mb_num * sizeof(uint8_t), fail) | FF_ALLOCZ_OR_GOTO(h->s.avctx, h->list_counts, big_mb_num * sizeof(uint8_t), fail) | ||||
| @@ -486,8 +486,8 @@ typedef struct H264Context{ | |||||
| /* chroma_pred_mode for i4x4 or i16x16, else 0 */ | /* chroma_pred_mode for i4x4 or i16x16, else 0 */ | ||||
| uint8_t *chroma_pred_mode_table; | uint8_t *chroma_pred_mode_table; | ||||
| int last_qscale_diff; | int last_qscale_diff; | ||||
| int16_t (*mvd_table[2])[2]; | |||||
| DECLARE_ALIGNED_16(int16_t, mvd_cache)[2][5*8][2]; | |||||
| uint8_t (*mvd_table[2])[2]; | |||||
| DECLARE_ALIGNED_16(uint8_t, mvd_cache)[2][5*8][2]; | |||||
| uint8_t *direct_table; | uint8_t *direct_table; | ||||
| uint8_t direct_cache[5*8]; | uint8_t direct_cache[5*8]; | ||||
| @@ -732,6 +732,14 @@ static av_always_inline uint32_t pack16to32(int a, int b){ | |||||
| #endif | #endif | ||||
| } | } | ||||
/**
 * Packs two 8-bit values into a single 16-bit word.
 *
 * The byte order is selected by HAVE_BIGENDIAN so that, once the word is
 * stored to memory, a occupies the first byte and b the second one.
 * Only the low 8 bits of each argument contribute to the result.
 *
 * @param a value for the first byte in memory
 * @param b value for the second byte in memory
 * @return the two bytes combined into one uint16_t
 */
static av_always_inline uint16_t pack8to16(int a, int b){
    /* Mask first and shift as unsigned: shifting a negative int left is
     * undefined, and the mask keeps each value inside its own byte. */
    const unsigned first  = a & 0xFF;
    const unsigned second = b & 0xFF;
#if HAVE_BIGENDIAN
    return (first << 8) | second;
#else
    return first | (second << 8);
#endif
}
| /** | /** | ||||
| * gets the chroma qp. | * gets the chroma qp. | ||||
| */ | */ | ||||
| @@ -1060,32 +1068,31 @@ static void fill_decode_caches(H264Context *h, int mb_type){ | |||||
| /* XXX beurk, Load mvd */ | /* XXX beurk, Load mvd */ | ||||
| if(USES_LIST(top_type, list)){ | if(USES_LIST(top_type, list)){ | ||||
| const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; | const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride; | ||||
| AV_COPY128(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]); | |||||
| AV_COPY64(h->mvd_cache[list][scan8[0] + 0 - 1*8], h->mvd_table[list][b_xy + 0]); | |||||
| }else{ | }else{ | ||||
| AV_ZERO128(h->mvd_cache[list][scan8[0] + 0 - 1*8]); | |||||
| AV_ZERO64(h->mvd_cache[list][scan8[0] + 0 - 1*8]); | |||||
| } | } | ||||
| if(USES_LIST(left_type[0], list)){ | if(USES_LIST(left_type[0], list)){ | ||||
| const int b_xy= h->mb2b_xy[left_xy[0]] + 3; | const int b_xy= h->mb2b_xy[left_xy[0]] + 3; | ||||
| AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]); | |||||
| AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]); | |||||
| AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 0*8], h->mvd_table[list][b_xy + h->b_stride*left_block[0]]); | |||||
| AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 1*8], h->mvd_table[list][b_xy + h->b_stride*left_block[1]]); | |||||
| }else{ | }else{ | ||||
| AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 0*8]); | |||||
| AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 1*8]); | |||||
| AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 0*8]); | |||||
| AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 1*8]); | |||||
| } | } | ||||
| if(USES_LIST(left_type[1], list)){ | if(USES_LIST(left_type[1], list)){ | ||||
| const int b_xy= h->mb2b_xy[left_xy[1]] + 3; | const int b_xy= h->mb2b_xy[left_xy[1]] + 3; | ||||
| AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]); | |||||
| AV_COPY32(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]); | |||||
| AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 2*8], h->mvd_table[list][b_xy + h->b_stride*left_block[2]]); | |||||
| AV_COPY16(h->mvd_cache[list][scan8[0] - 1 + 3*8], h->mvd_table[list][b_xy + h->b_stride*left_block[3]]); | |||||
| }else{ | }else{ | ||||
| AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 2*8]); | |||||
| AV_ZERO32(h->mvd_cache [list][scan8[0] - 1 + 3*8]); | |||||
| AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 2*8]); | |||||
| AV_ZERO16(h->mvd_cache [list][scan8[0] - 1 + 3*8]); | |||||
| } | } | ||||
| AV_ZERO32(h->mvd_cache [list][scan8[5 ]+1]); | |||||
| AV_ZERO32(h->mvd_cache [list][scan8[7 ]+1]); | |||||
| AV_ZERO32(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else) | |||||
| AV_ZERO32(h->mvd_cache [list][scan8[4 ]]); | |||||
| AV_ZERO32(h->mvd_cache [list][scan8[12]]); | |||||
| AV_ZERO16(h->mvd_cache [list][scan8[5 ]+1]); | |||||
| AV_ZERO16(h->mvd_cache [list][scan8[7 ]+1]); | |||||
| AV_ZERO16(h->mvd_cache [list][scan8[13]+1]); //FIXME remove past 3 (init somewhere else) | |||||
| AV_ZERO16(h->mvd_cache [list][scan8[4 ]]); | |||||
| AV_ZERO16(h->mvd_cache [list][scan8[12]]); | |||||
| if(h->slice_type_nos == FF_B_TYPE){ | if(h->slice_type_nos == FF_B_TYPE){ | ||||
| fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1); | fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, MB_TYPE_16x16>>1, 1); | ||||
| @@ -1414,13 +1421,13 @@ static inline void write_back_motion(H264Context *h, int mb_type){ | |||||
| AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); | AV_COPY128(mv_dst + y*b_stride, mv_src + 8*y); | ||||
| } | } | ||||
| if( CABAC ) { | if( CABAC ) { | ||||
| int16_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy]; | |||||
| int16_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]]; | |||||
| uint8_t (*mvd_dst)[2] = &h->mvd_table[list][b_xy]; | |||||
| uint8_t (*mvd_src)[2] = &h->mvd_cache[list][scan8[0]]; | |||||
| if(IS_SKIP(mb_type)) | if(IS_SKIP(mb_type)) | ||||
| fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 4); | |||||
| fill_rectangle(mvd_dst, 4, 4, h->b_stride, 0, 2); | |||||
| else | else | ||||
| for(y=0; y<4; y++){ | for(y=0; y<4; y++){ | ||||
| AV_COPY128(mvd_dst + y*b_stride, mvd_src + 8*y); | |||||
| AV_COPY64(mvd_dst + y*b_stride, mvd_src + 8*y); | |||||
| } | } | ||||
| } | } | ||||
| @@ -938,8 +938,9 @@ static int decode_cabac_mb_mvd( H264Context *h, int ctxbase, int amvd, int *mvda | |||||
| while( k-- ) { | while( k-- ) { | ||||
| mvd += get_cabac_bypass( &h->cabac )<<k; | mvd += get_cabac_bypass( &h->cabac )<<k; | ||||
| } | } | ||||
| } | |||||
| *mvda=mvd; | |||||
| *mvda=mvd < 70 ? mvd : 70; | |||||
| }else | |||||
| *mvda=mvd; | |||||
| return get_cabac_bypass_sign( &h->cabac, -mvd ); | return get_cabac_bypass_sign( &h->cabac, -mvd ); | ||||
| } | } | ||||
| @@ -1429,7 +1430,7 @@ decode_intra_mb: | |||||
| for(i=0; i<4; i++){ | for(i=0; i<4; i++){ | ||||
| h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]; | h->ref_cache[list][ scan8[4*i] ]=h->ref_cache[list][ scan8[4*i]+1 ]; | ||||
| if(IS_DIRECT(h->sub_mb_type[i])){ | if(IS_DIRECT(h->sub_mb_type[i])){ | ||||
| fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4); | |||||
| fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 2); | |||||
| continue; | continue; | ||||
| } | } | ||||
| @@ -1441,9 +1442,8 @@ decode_intra_mb: | |||||
| int mx, my; | int mx, my; | ||||
| const int index= 4*i + block_width*j; | const int index= 4*i + block_width*j; | ||||
| int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ]; | int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ]; | ||||
| int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ]; | |||||
| uint8_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ]; | |||||
| pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my); | pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my); | ||||
| DECODE_CABAC_MB_MVD( h, list, index) | DECODE_CABAC_MB_MVD( h, list, index) | ||||
| tprintf(s->avctx, "final mv:%d %d\n", mx, my); | tprintf(s->avctx, "final mv:%d %d\n", mx, my); | ||||
| @@ -1478,14 +1478,14 @@ decode_intra_mb: | |||||
| } | } | ||||
| }else{ | }else{ | ||||
| fill_rectangle(h->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4); | fill_rectangle(h->mv_cache [list][ scan8[4*i] ], 2, 2, 8, 0, 4); | ||||
| fill_rectangle(h->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 4); | |||||
| fill_rectangle(h->mvd_cache[list][ scan8[4*i] ], 2, 2, 8, 0, 2); | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| } else if( IS_DIRECT(mb_type) ) { | } else if( IS_DIRECT(mb_type) ) { | ||||
| ff_h264_pred_direct_motion(h, &mb_type); | ff_h264_pred_direct_motion(h, &mb_type); | ||||
| fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4); | |||||
| fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4); | |||||
| fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 2); | |||||
| fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 2); | |||||
| dct8x8_allowed &= h->sps.direct_8x8_inference_flag; | dct8x8_allowed &= h->sps.direct_8x8_inference_flag; | ||||
| } else { | } else { | ||||
| int list, i; | int list, i; | ||||
| @@ -1512,7 +1512,7 @@ decode_intra_mb: | |||||
| DECODE_CABAC_MB_MVD( h, list, 0) | DECODE_CABAC_MB_MVD( h, list, 0) | ||||
| tprintf(s->avctx, "final mv:%d %d\n", mx, my); | tprintf(s->avctx, "final mv:%d %d\n", mx, my); | ||||
| fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mpx,mpy), 4); | |||||
| fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack8to16(mpx,mpy), 2); | |||||
| fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4); | fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4); | ||||
| }else | }else | ||||
| fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4); | fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4); | ||||
| @@ -1544,10 +1544,10 @@ decode_intra_mb: | |||||
| DECODE_CABAC_MB_MVD( h, list, 8*i) | DECODE_CABAC_MB_MVD( h, list, 8*i) | ||||
| tprintf(s->avctx, "final mv:%d %d\n", mx, my); | tprintf(s->avctx, "final mv:%d %d\n", mx, my); | ||||
| fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mpx,mpy), 4); | |||||
| fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack8to16(mpx,mpy), 2); | |||||
| fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4); | fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4); | ||||
| }else{ | }else{ | ||||
| fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4); | |||||
| fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 2); | |||||
| fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4); | fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4); | ||||
| } | } | ||||
| } | } | ||||
| @@ -1579,10 +1579,10 @@ decode_intra_mb: | |||||
| DECODE_CABAC_MB_MVD( h, list, 4*i) | DECODE_CABAC_MB_MVD( h, list, 4*i) | ||||
| tprintf(s->avctx, "final mv:%d %d\n", mx, my); | tprintf(s->avctx, "final mv:%d %d\n", mx, my); | ||||
| fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mpx,mpy), 4); | |||||
| fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack8to16(mpx,mpy), 2); | |||||
| fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4); | fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4); | ||||
| }else{ | }else{ | ||||
| fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4); | |||||
| fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 2); | |||||
| fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4); | fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4); | ||||
| } | } | ||||
| } | } | ||||