Run the loop filter per row instead of per MB. This should also make it much easier to switch to per-frame filtering, and to do so in a separate thread in the future if some volunteer wants to try. Overall decoding speedup of 1.7% (single thread on Pentium Dual / cathedral sample). This change also allows some optimizations to be tried that would not have been possible before. Originally committed as revision 21270 to svn://svn.ffmpeg.org/ffmpeg/trunktags/v0.6
@@ -666,6 +666,7 @@ static void free_tables(H264Context *h){ | |||||
av_freep(&h->non_zero_count); | av_freep(&h->non_zero_count); | ||||
av_freep(&h->slice_table_base); | av_freep(&h->slice_table_base); | ||||
h->slice_table= NULL; | h->slice_table= NULL; | ||||
av_freep(&h->list_counts); | |||||
av_freep(&h->mb2b_xy); | av_freep(&h->mb2b_xy); | ||||
av_freep(&h->mb2b8_xy); | av_freep(&h->mb2b8_xy); | ||||
@@ -756,7 +757,7 @@ int ff_h264_alloc_tables(H264Context *h){ | |||||
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t), fail) | FF_ALLOCZ_OR_GOTO(h->s.avctx, h->intra4x4_pred_mode, big_mb_num * 8 * sizeof(uint8_t), fail) | ||||
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count , big_mb_num * 16 * sizeof(uint8_t), fail) | |||||
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->non_zero_count , big_mb_num * 32 * sizeof(uint8_t), fail) | |||||
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail) | FF_ALLOCZ_OR_GOTO(h->s.avctx, h->slice_table_base , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base), fail) | ||||
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail) | FF_ALLOCZ_OR_GOTO(h->s.avctx, h->cbp_table, big_mb_num * sizeof(uint16_t), fail) | ||||
@@ -764,6 +765,7 @@ int ff_h264_alloc_tables(H264Context *h){ | |||||
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail); | FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t), fail); | ||||
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail); | FF_ALLOCZ_OR_GOTO(h->s.avctx, h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t), fail); | ||||
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail); | FF_ALLOCZ_OR_GOTO(h->s.avctx, h->direct_table, 32*big_mb_num * sizeof(uint8_t) , fail); | ||||
FF_ALLOCZ_OR_GOTO(h->s.avctx, h->list_counts, big_mb_num * sizeof(uint8_t), fail) | |||||
memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base)); | memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base)); | ||||
h->slice_table= h->slice_table_base + s->mb_stride*2 + 1; | h->slice_table= h->slice_table_base + s->mb_stride*2 + 1; | ||||
@@ -945,12 +947,7 @@ int ff_h264_frame_start(H264Context *h){ | |||||
static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){ | static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){ | ||||
MpegEncContext * const s = &h->s; | MpegEncContext * const s = &h->s; | ||||
int i; | |||||
int step = 1; | |||||
int offset = 1; | |||||
int uvoffset= 1; | |||||
int top_idx = 1; | int top_idx = 1; | ||||
int skiplast= 0; | |||||
src_y -= linesize; | src_y -= linesize; | ||||
src_cb -= uvlinesize; | src_cb -= uvlinesize; | ||||
@@ -958,8 +955,6 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src | |||||
if(!simple && FRAME_MBAFF){ | if(!simple && FRAME_MBAFF){ | ||||
if(s->mb_y&1){ | if(s->mb_y&1){ | ||||
offset = MB_MBAFF ? 1 : 17; | |||||
uvoffset= MB_MBAFF ? 1 : 9; | |||||
if(!MB_MBAFF){ | if(!MB_MBAFF){ | ||||
*(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize); | *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y + 15*linesize); | ||||
*(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize); | *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize); | ||||
@@ -968,39 +963,19 @@ static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src | |||||
*(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize); | *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize); | ||||
} | } | ||||
} | } | ||||
}else{ | |||||
if(!MB_MBAFF){ | |||||
h->left_border[0]= h->top_borders[0][s->mb_x][15]; | |||||
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ | |||||
h->left_border[34 ]= h->top_borders[0][s->mb_x][16+7 ]; | |||||
h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7]; | |||||
} | |||||
skiplast= 1; | |||||
} | |||||
offset = | |||||
uvoffset= | |||||
top_idx = MB_MBAFF ? 0 : 1; | |||||
} | |||||
step= MB_MBAFF ? 2 : 1; | |||||
}else if(MB_MBAFF){ | |||||
top_idx = 0; | |||||
}else | |||||
return; | |||||
} | } | ||||
// There are two lines saved, the line above the top macroblock of a pair, | // There are two lines saved, the line above the top macroblock of a pair, | ||||
// and the line above the bottom macroblock | // and the line above the bottom macroblock | ||||
h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15]; | |||||
for(i=1; i<17 - skiplast; i++){ | |||||
h->left_border[offset+i*step]= src_y[15+i* linesize]; | |||||
} | |||||
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize); | *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y + 16*linesize); | ||||
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize); | *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize); | ||||
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ | if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ | ||||
h->left_border[uvoffset+34 ]= h->top_borders[top_idx][s->mb_x][16+7]; | |||||
h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7]; | |||||
for(i=1; i<9 - skiplast; i++){ | |||||
h->left_border[uvoffset+34 +i*step]= src_cb[7+i*uvlinesize]; | |||||
h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize]; | |||||
} | |||||
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize); | *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize); | ||||
*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize); | *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize); | ||||
} | } | ||||
@@ -1013,21 +988,15 @@ static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_c | |||||
int deblock_left; | int deblock_left; | ||||
int deblock_top; | int deblock_top; | ||||
int mb_xy; | int mb_xy; | ||||
int step = 1; | |||||
int offset = 1; | |||||
int uvoffset= 1; | |||||
int top_idx = 1; | int top_idx = 1; | ||||
if(!simple && FRAME_MBAFF){ | if(!simple && FRAME_MBAFF){ | ||||
if(s->mb_y&1){ | if(s->mb_y&1){ | ||||
offset = MB_MBAFF ? 1 : 17; | |||||
uvoffset= MB_MBAFF ? 1 : 9; | |||||
if(!MB_MBAFF) | |||||
return; | |||||
}else{ | }else{ | ||||
offset = | |||||
uvoffset= | |||||
top_idx = MB_MBAFF ? 0 : 1; | top_idx = MB_MBAFF ? 0 : 1; | ||||
} | } | ||||
step= MB_MBAFF ? 2 : 1; | |||||
} | } | ||||
if(h->deblocking_filter == 2) { | if(h->deblocking_filter == 2) { | ||||
@@ -1049,14 +1018,10 @@ if(xchg)\ | |||||
a= b;\ | a= b;\ | ||||
b= t; | b= t; | ||||
if(deblock_left){ | |||||
for(i = !deblock_top; i<16; i++){ | |||||
XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, xchg); | |||||
} | |||||
XCHG(h->left_border[offset+i*step], src_y [i* linesize], temp8, 1); | |||||
} | |||||
if(deblock_top){ | if(deblock_top){ | ||||
if(deblock_left){ | |||||
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+8), *(uint64_t*)(src_y -7), temp64, 1); | |||||
} | |||||
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg); | XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg); | ||||
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1); | XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1); | ||||
if(s->mb_x+1 < s->mb_width){ | if(s->mb_x+1 < s->mb_width){ | ||||
@@ -1065,15 +1030,11 @@ b= t; | |||||
} | } | ||||
if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ | if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){ | ||||
if(deblock_left){ | |||||
for(i = !deblock_top; i<8; i++){ | |||||
XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, xchg); | |||||
XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg); | |||||
} | |||||
XCHG(h->left_border[uvoffset+34 +i*step], src_cb[i*uvlinesize], temp8, 1); | |||||
XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1); | |||||
} | |||||
if(deblock_top){ | if(deblock_top){ | ||||
if(deblock_left){ | |||||
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+16), *(uint64_t*)(src_cb -7), temp64, 1); | |||||
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x-1]+24), *(uint64_t*)(src_cr -7), temp64, 1); | |||||
} | |||||
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1); | XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1); | ||||
XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1); | XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1); | ||||
} | } | ||||
@@ -1103,6 +1064,8 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ | |||||
s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4); | s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4); | ||||
s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2); | s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2); | ||||
h->list_counts[mb_xy]= h->list_count; | |||||
if (!simple && MB_FIELD) { | if (!simple && MB_FIELD) { | ||||
linesize = h->mb_linesize = s->linesize * 2; | linesize = h->mb_linesize = s->linesize * 2; | ||||
uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2; | uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2; | ||||
@@ -1322,7 +1285,7 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ | |||||
if(h->cbp || IS_INTRA(mb_type)) | if(h->cbp || IS_INTRA(mb_type)) | ||||
s->dsp.clear_blocks(h->mb); | s->dsp.clear_blocks(h->mb); | ||||
if(h->deblocking_filter) { | |||||
if(h->deblocking_filter && 0) { | |||||
backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple); | backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple); | ||||
fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb | fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb | ||||
h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]); | h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]); | ||||
@@ -2174,6 +2137,70 @@ int ff_h264_get_slice_type(H264Context *h) | |||||
} | } | ||||
} | } | ||||
/**
 * Run the deblocking filter over one macroblock row.
 *
 * The row is taken from s->mb_y on entry; the inner loop runs from
 * s->mb_y to s->mb_y + FRAME_MBAFF inclusive (presumably FRAME_MBAFF is
 * 0 or 1, so this is either a single row or a row pair - TODO confirm).
 * For every macroblock the per-MB decoding state needed by the filter is
 * re-loaded from the stored tables (slice_table, mb_type, list_counts,
 * qscale_table), the destination pointers are recomputed, and then
 * backup_mb_border(), fill_caches() and one of the two filter entry
 * points are invoked.
 *
 * Nothing is filtered when h->deblocking_filter is 0.
 *
 * On exit h->slice_type is restored to its entry value, s->mb_x is set
 * to 0 and s->mb_y is set back to the first row processed.
 * NOTE(review): h->slice_type_nos, h->list_count, h->slice_num, h->mb_xy
 * (and, with FRAME_MBAFF, h->mb_mbaff / h->mb_field_decoding_flag) are
 * overwritten inside the loop and are NOT restored here - confirm that
 * callers do not rely on their previous values.
 */
static void loop_filter(H264Context *h){ | |||||
MpegEncContext * const s = &h->s; | |||||
uint8_t *dest_y, *dest_cb, *dest_cr; | |||||
int linesize, uvlinesize, mb_x, mb_y; | |||||
/* Last row index to filter; equals s->mb_y when FRAME_MBAFF is 0. */
const int end_mb_y= s->mb_y + FRAME_MBAFF; | |||||
/* Saved so the per-MB slice_type overrides below can be undone. */
const int old_slice_type= h->slice_type; | |||||
if(h->deblocking_filter) { | |||||
/* Column-major walk: for each column, visit every row in the range. */
for(mb_x= 0; mb_x<s->mb_width; mb_x++){ | |||||
for(mb_y=end_mb_y - FRAME_MBAFF; mb_y<= end_mb_y; mb_y++){ | |||||
/* NOTE(review): 'list' is declared but never used in this function. */
int list, mb_xy, mb_type, is_complex; | |||||
mb_xy = h->mb_xy = mb_x + mb_y*s->mb_stride; | |||||
h->slice_num= h->slice_table[mb_xy]; | |||||
mb_type= s->current_picture.mb_type[mb_xy]; | |||||
/* list_counts[] was recorded per MB at decode time; derive the
 * slice type from it: 2 lists -> B, 1 list -> P, otherwise I. */
h->list_count= h->list_counts[mb_xy]; | |||||
if(h->list_count==2){ | |||||
h->slice_type= h->slice_type_nos= FF_B_TYPE; | |||||
}else if(h->list_count==1){ | |||||
h->slice_type= h->slice_type_nos= FF_P_TYPE; | |||||
}else | |||||
h->slice_type= h->slice_type_nos= FF_I_TYPE; | |||||
/* Re-derive the field/frame flag of this MB from its stored type. */
if(FRAME_MBAFF) | |||||
h->mb_mbaff = h->mb_field_decoding_flag = !!IS_INTERLACED(mb_type); | |||||
is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0; //FIXME qscale might be wrong | |||||
/* Code called below reads the current position from s->mb_x / s->mb_y
 * (e.g. backup_mb_border indexes top_borders with s->mb_x). */
s->mb_x= mb_x; | |||||
s->mb_y= mb_y; | |||||
/* Top-left sample of this MB: 16x16 luma, 8x8 for each chroma plane. */
dest_y = s->current_picture.data[0] + (mb_x + mb_y * s->linesize ) * 16; | |||||
dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8; | |||||
dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8; | |||||
//FIXME simplify above | |||||
if (MB_FIELD) { | |||||
/* Field MB: rows are interleaved, so the stride is doubled. */
linesize = h->mb_linesize = s->linesize * 2; | |||||
uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2; | |||||
if(mb_y&1){ //FIXME move out of this function? | |||||
/* Odd row: step back 15 luma / 7 chroma lines from the position
 * computed above. */
dest_y -= s->linesize*15; | |||||
dest_cb-= s->uvlinesize*7; | |||||
dest_cr-= s->uvlinesize*7; | |||||
} | |||||
} else { | |||||
linesize = h->mb_linesize = s->linesize; | |||||
uvlinesize = h->mb_uvlinesize = s->uvlinesize; | |||||
} | |||||
/* Last argument is the 'simple' flag of backup_mb_border. */
backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, !is_complex); | |||||
fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb | |||||
h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]); | |||||
h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]); | |||||
/* The general filter is used only for complex MBs in FRAME_MBAFF
 * pictures; every other case goes through the fast variant. */
if (is_complex && FRAME_MBAFF) { | |||||
ff_h264_filter_mb (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize); | |||||
} else { | |||||
ff_h264_filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
/* Undo the position and slice_type changes made by the loop. */
h->slice_type= old_slice_type; | |||||
s->mb_x= 0; | |||||
s->mb_y= end_mb_y - FRAME_MBAFF; | |||||
} | |||||
static int decode_slice(struct AVCodecContext *avctx, void *arg){ | static int decode_slice(struct AVCodecContext *avctx, void *arg){ | ||||
H264Context *h = *(void**)arg; | H264Context *h = *(void**)arg; | ||||
MpegEncContext * const s = &h->s; | MpegEncContext * const s = &h->s; | ||||
@@ -2222,6 +2249,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg){ | |||||
if( ++s->mb_x >= s->mb_width ) { | if( ++s->mb_x >= s->mb_width ) { | ||||
s->mb_x = 0; | s->mb_x = 0; | ||||
loop_filter(h); | |||||
ff_draw_horiz_band(s, 16*s->mb_y, 16); | ff_draw_horiz_band(s, 16*s->mb_y, 16); | ||||
++s->mb_y; | ++s->mb_y; | ||||
if(FIELD_OR_MBAFF_PICTURE) { | if(FIELD_OR_MBAFF_PICTURE) { | ||||
@@ -2259,6 +2287,7 @@ static int decode_slice(struct AVCodecContext *avctx, void *arg){ | |||||
if(++s->mb_x >= s->mb_width){ | if(++s->mb_x >= s->mb_width){ | ||||
s->mb_x=0; | s->mb_x=0; | ||||
loop_filter(h); | |||||
ff_draw_horiz_band(s, 16*s->mb_y, 16); | ff_draw_horiz_band(s, 16*s->mb_y, 16); | ||||
++s->mb_y; | ++s->mb_y; | ||||
if(FIELD_OR_MBAFF_PICTURE) { | if(FIELD_OR_MBAFF_PICTURE) { | ||||
@@ -300,7 +300,7 @@ typedef struct H264Context{ | |||||
* is 64 if not available. | * is 64 if not available. | ||||
*/ | */ | ||||
DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]); | DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]); | ||||
uint8_t (*non_zero_count)[16]; | |||||
uint8_t (*non_zero_count)[32]; | |||||
/** | /** | ||||
* Motion vector cache. | * Motion vector cache. | ||||
@@ -423,6 +423,7 @@ typedef struct H264Context{ | |||||
*/ | */ | ||||
unsigned int ref_count[2]; ///< counts frames or fields, depending on current mb mode | unsigned int ref_count[2]; ///< counts frames or fields, depending on current mb mode | ||||
unsigned int list_count; | unsigned int list_count; | ||||
uint8_t *list_counts; ///< Array of list_count per MB specifying the slice type | |||||
Picture *short_ref[32]; | Picture *short_ref[32]; | ||||
Picture *long_ref[32]; | Picture *long_ref[32]; | ||||
Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture | Picture default_ref_list[2][32]; ///< base reference list for all slices of a coded picture | ||||
@@ -736,8 +737,8 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){ | |||||
top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); | top_xy = mb_xy - (s->mb_stride << FIELD_PICTURE); | ||||
//FIXME deblocking could skip the intra and nnz parts. | //FIXME deblocking could skip the intra and nnz parts. | ||||
if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF) | |||||
return; | |||||
// if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF) | |||||
// return; | |||||
/* Wow, what a mess, why didn't they simplify the interlacing & intra | /* Wow, what a mess, why didn't they simplify the interlacing & intra | ||||
* stuff, I can't imagine that these complex rules are worth it. */ | * stuff, I can't imagine that these complex rules are worth it. */ | ||||
@@ -793,20 +794,33 @@ static void fill_caches(H264Context *h, int mb_type, int for_deblock){ | |||||
left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0; | left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0; | ||||
left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0; | left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0; | ||||
if(MB_MBAFF && !IS_INTRA(mb_type)){ | |||||
if(!IS_INTRA(mb_type)){ | |||||
int list; | int list; | ||||
for(list=0; list<h->list_count; list++){ | for(list=0; list<h->list_count; list++){ | ||||
//These values were changed for ease of performing MC, we need to change them back | |||||
//FIXME maybe we can make MC and loop filter use the same values or prevent | |||||
//the MC code from changing ref_cache and rather use a temporary array. | |||||
if(USES_LIST(mb_type,list)){ | |||||
int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]]; | |||||
int8_t *ref; | |||||
int y, b_xy; | |||||
if(!USES_LIST(mb_type, list)){ | |||||
fill_rectangle( h->mv_cache[list][scan8[0]], 4, 4, 8, pack16to32(0,0), 4); | |||||
*(uint32_t*)&h->ref_cache[list][scan8[ 0]] = | *(uint32_t*)&h->ref_cache[list][scan8[ 0]] = | ||||
*(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; | |||||
ref += h->b8_stride; | |||||
*(uint32_t*)&h->ref_cache[list][scan8[ 2]] = | |||||
*(uint32_t*)&h->ref_cache[list][scan8[ 8]] = | *(uint32_t*)&h->ref_cache[list][scan8[ 8]] = | ||||
*(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; | |||||
*(uint32_t*)&h->ref_cache[list][scan8[10]] = ((LIST_NOT_USED)&0xFF)*0x01010101; | |||||
continue; | |||||
} | } | ||||
ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]]; | |||||
*(uint32_t*)&h->ref_cache[list][scan8[ 0]] = | |||||
*(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; | |||||
ref += h->b8_stride; | |||||
*(uint32_t*)&h->ref_cache[list][scan8[ 8]] = | |||||
*(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101; | |||||
b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride; | |||||
for(y=0; y<4; y++){ | |||||
*(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]; | |||||
*(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y]= *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]; | |||||
} | |||||
} | } | ||||
} | } | ||||
}else{ | }else{ | ||||
@@ -1196,6 +1210,23 @@ static inline void write_back_non_zero_count(H264Context *h){ | |||||
h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5]; | h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5]; | ||||
h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5]; | h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5]; | ||||
h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4]; | h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4]; | ||||
//FIXME sort better how things are stored in non_zero_count | |||||
h->non_zero_count[mb_xy][13]= h->non_zero_count_cache[6+8*1]; | |||||
h->non_zero_count[mb_xy][14]= h->non_zero_count_cache[6+8*2]; | |||||
h->non_zero_count[mb_xy][15]= h->non_zero_count_cache[6+8*3]; | |||||
h->non_zero_count[mb_xy][16]= h->non_zero_count_cache[5+8*1]; | |||||
h->non_zero_count[mb_xy][17]= h->non_zero_count_cache[5+8*2]; | |||||
h->non_zero_count[mb_xy][18]= h->non_zero_count_cache[5+8*3]; | |||||
h->non_zero_count[mb_xy][19]= h->non_zero_count_cache[4+8*1]; | |||||
h->non_zero_count[mb_xy][20]= h->non_zero_count_cache[4+8*2]; | |||||
h->non_zero_count[mb_xy][21]= h->non_zero_count_cache[4+8*3]; | |||||
h->non_zero_count[mb_xy][22]= h->non_zero_count_cache[1+8*1]; | |||||
h->non_zero_count[mb_xy][23]= h->non_zero_count_cache[1+8*4]; | |||||
} | } | ||||
static inline void write_back_motion(H264Context *h, int mb_type){ | static inline void write_back_motion(H264Context *h, int mb_type){ | ||||
@@ -1271,7 +1302,7 @@ static void decode_mb_skip(H264Context *h){ | |||||
const int mb_xy= h->mb_xy; | const int mb_xy= h->mb_xy; | ||||
int mb_type=0; | int mb_type=0; | ||||
memset(h->non_zero_count[mb_xy], 0, 16); | |||||
memset(h->non_zero_count[mb_xy], 0, 32); | |||||
memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui | memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui | ||||
if(MB_FIELD) | if(MB_FIELD) | ||||
@@ -1392,7 +1392,7 @@ decode_intra_mb: | |||||
// In deblocking, the quantizer is 0 | // In deblocking, the quantizer is 0 | ||||
s->current_picture.qscale_table[mb_xy]= 0; | s->current_picture.qscale_table[mb_xy]= 0; | ||||
// All coeffs are present | // All coeffs are present | ||||
memset(h->non_zero_count[mb_xy], 16, 16); | |||||
memset(h->non_zero_count[mb_xy], 16, 32); | |||||
s->current_picture.mb_type[mb_xy]= mb_type; | s->current_picture.mb_type[mb_xy]= mb_type; | ||||
h->last_qscale_diff = 0; | h->last_qscale_diff = 0; | ||||
return 0; | return 0; | ||||
@@ -620,7 +620,7 @@ decode_intra_mb: | |||||
// In deblocking, the quantizer is 0 | // In deblocking, the quantizer is 0 | ||||
s->current_picture.qscale_table[mb_xy]= 0; | s->current_picture.qscale_table[mb_xy]= 0; | ||||
// All coeffs are present | // All coeffs are present | ||||
memset(h->non_zero_count[mb_xy], 16, 16); | |||||
memset(h->non_zero_count[mb_xy], 16, 32); | |||||
s->current_picture.mb_type[mb_xy]= mb_type; | s->current_picture.mb_type[mb_xy]= mb_type; | ||||
return 0; | return 0; | ||||
@@ -620,7 +620,7 @@ static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, u | |||||
// Do not use s->qscale as luma quantizer because it has not the same | // Do not use s->qscale as luma quantizer because it has not the same | ||||
// value in IPCM macroblocks. | // value in IPCM macroblocks. | ||||
qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1; | qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1; | ||||
//tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]); | |||||
//tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp[0], s->current_picture.qscale_table[mbn_xy]); | |||||
tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize); | tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize); | ||||
//{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); } | //{ int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); } | ||||
if( dir == 0 ) { | if( dir == 0 ) { | ||||
@@ -650,6 +650,7 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint | |||||
const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4; | const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4; | ||||
int first_vertical_edge_done = 0; | int first_vertical_edge_done = 0; | ||||
av_unused int dir; | av_unused int dir; | ||||
int list; | |||||
//for sufficiently low qp, filtering wouldn't do anything | //for sufficiently low qp, filtering wouldn't do anything | ||||
//this is a conservative estimate: could also check beta_offset and more accurate chroma_qp | //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp | ||||
@@ -663,6 +664,35 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint | |||||
} | } | ||||
} | } | ||||
h->non_zero_count_cache[7+8*1]=h->non_zero_count[mb_xy][0]; | |||||
h->non_zero_count_cache[7+8*2]=h->non_zero_count[mb_xy][1]; | |||||
h->non_zero_count_cache[7+8*3]=h->non_zero_count[mb_xy][2]; | |||||
h->non_zero_count_cache[7+8*4]=h->non_zero_count[mb_xy][3]; | |||||
h->non_zero_count_cache[4+8*4]=h->non_zero_count[mb_xy][4]; | |||||
h->non_zero_count_cache[5+8*4]=h->non_zero_count[mb_xy][5]; | |||||
h->non_zero_count_cache[6+8*4]=h->non_zero_count[mb_xy][6]; | |||||
h->non_zero_count_cache[1+8*2]=h->non_zero_count[mb_xy][9]; | |||||
h->non_zero_count_cache[2+8*2]=h->non_zero_count[mb_xy][8]; | |||||
h->non_zero_count_cache[2+8*1]=h->non_zero_count[mb_xy][7]; | |||||
h->non_zero_count_cache[1+8*5]=h->non_zero_count[mb_xy][12]; | |||||
h->non_zero_count_cache[2+8*5]=h->non_zero_count[mb_xy][11]; | |||||
h->non_zero_count_cache[2+8*4]=h->non_zero_count[mb_xy][10]; | |||||
h->non_zero_count_cache[6+8*1]=h->non_zero_count[mb_xy][13]; | |||||
h->non_zero_count_cache[6+8*2]=h->non_zero_count[mb_xy][14]; | |||||
h->non_zero_count_cache[6+8*3]=h->non_zero_count[mb_xy][15]; | |||||
h->non_zero_count_cache[5+8*1]=h->non_zero_count[mb_xy][16]; | |||||
h->non_zero_count_cache[5+8*2]=h->non_zero_count[mb_xy][17]; | |||||
h->non_zero_count_cache[5+8*3]=h->non_zero_count[mb_xy][18]; | |||||
h->non_zero_count_cache[4+8*1]=h->non_zero_count[mb_xy][19]; | |||||
h->non_zero_count_cache[4+8*2]=h->non_zero_count[mb_xy][20]; | |||||
h->non_zero_count_cache[4+8*3]=h->non_zero_count[mb_xy][21]; | |||||
h->non_zero_count_cache[1+8*1]=h->non_zero_count[mb_xy][22]; | |||||
h->non_zero_count_cache[1+8*4]=h->non_zero_count[mb_xy][23]; | |||||
// CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs | // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs | ||||
if(!h->pps.cabac && h->pps.transform_8x8_mode){ | if(!h->pps.cabac && h->pps.transform_8x8_mode){ | ||||
int top_type, left_type[2]; | int top_type, left_type[2]; | ||||
@@ -687,16 +717,16 @@ void ff_h264_filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint | |||||
if(IS_8x8DCT(mb_type)){ | if(IS_8x8DCT(mb_type)){ | ||||
h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]= | h->non_zero_count_cache[scan8[0 ]]= h->non_zero_count_cache[scan8[1 ]]= | ||||
h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp & 1; | |||||
h->non_zero_count_cache[scan8[2 ]]= h->non_zero_count_cache[scan8[3 ]]= h->cbp_table[mb_xy] & 1; | |||||
h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]= | h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]= | ||||
h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2; | |||||
h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp_table[mb_xy] & 2; | |||||
h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]= | h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]= | ||||
h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4; | |||||
h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp_table[mb_xy] & 4; | |||||
h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]= | h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]= | ||||
h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8; | |||||
h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp_table[mb_xy] & 8; | |||||
} | } | ||||
} | } | ||||