No speed improvement, but necessary for some future stuff. Also opens up the possibility of asm chroma dc idct/dequant. Originally committed as revision 26349 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/n0.8)
@@ -66,6 +66,7 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, | |||||
void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul); | void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul); | ||||
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp); | void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp); | ||||
void ff_chroma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul); | |||||
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); | void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); | ||||
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, | void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, | ||||
@@ -246,93 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){ | |||||
return 0; | return 0; | ||||
} | } | ||||
#if 0 | |||||
/**
 * Forward transform of the 16 luma DC values, done in place on block.
 *
 * Two butterfly passes are applied: the first gathers values from block into
 * the local temp[16] array, the second combines temp column-wise and writes
 * the results, halved (>>1), back into block.
 *
 * This function sits inside an #if 0 region, so it is not compiled.
 *
 * @param block coefficient array holding the DC values; read and overwritten
 *
 * The qp parameter is commented out in the signature and no quantization
 * multiplier is applied (the qmul line in the body is commented out as well).
 */ | |||||
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){ | |||||
// const int qmul= dequant_coeff[qp][0]; | |||||
int i; | |||||
int temp[16]; //FIXME check if this is a good idea | |||||
/* NOTE(review): stride is not declared in this function; presumably it is a
 * macro defined earlier in the file (an #undef stride follows this region)
 * -- confirm before re-enabling this code. */
static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; | |||||
static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; | |||||
/* First pass: for each y_offset entry, butterfly the four values found at
 * stride multiples 0, 1, 4 and 5 from that offset into one row of temp. */
for(i=0; i<4; i++){ | |||||
const int offset= y_offset[i]; | |||||
const int z0= block[offset+stride*0] + block[offset+stride*4]; | |||||
const int z1= block[offset+stride*0] - block[offset+stride*4]; | |||||
const int z2= block[offset+stride*1] - block[offset+stride*5]; | |||||
const int z3= block[offset+stride*1] + block[offset+stride*5]; | |||||
temp[4*i+0]= z0+z3; | |||||
temp[4*i+1]= z1+z2; | |||||
temp[4*i+2]= z1-z2; | |||||
temp[4*i+3]= z0-z3; | |||||
} | |||||
/* Second pass: butterfly column i of temp and store the halved results back
 * into block at stride multiples 0, 2, 8 and 10 plus x_offset[i]. */
for(i=0; i<4; i++){ | |||||
const int offset= x_offset[i]; | |||||
const int z0= temp[4*0+i] + temp[4*2+i]; | |||||
const int z1= temp[4*0+i] - temp[4*2+i]; | |||||
const int z2= temp[4*1+i] - temp[4*3+i]; | |||||
const int z3= temp[4*1+i] + temp[4*3+i]; | |||||
block[stride*0 +offset]= (z0 + z3)>>1; | |||||
block[stride*2 +offset]= (z1 + z2)>>1; | |||||
block[stride*8 +offset]= (z1 - z2)>>1; | |||||
block[stride*10+offset]= (z0 - z3)>>1; | |||||
} | |||||
} | |||||
#endif | |||||
#undef xStride | |||||
#undef stride | |||||
/**
 * In-place 2x2 inverse transform and dequantization of four chroma DC values.
 *
 * The four values are read from block at element offsets 0, 16, 32 and 48,
 * combined with a 2x2 butterfly, multiplied by qmul, shifted right by 7 and
 * written back to the same four positions.
 *
 * @param block coefficient array; only the four offsets above are touched
 * @param qmul  multiplier applied to every transformed value
 */
static void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul){
    const int row_step = 16*2;
    const int col_step = 16;
    /* Load everything before storing: the transform overwrites its inputs. */
    const int top_left  = block[0];
    const int top_right = block[col_step];
    const int bot_left  = block[row_step];
    const int bot_right = block[row_step + col_step];
    const int top_sum   = top_left + top_right;
    const int top_diff  = top_left - top_right;
    const int bot_sum   = bot_left + bot_right;
    const int bot_diff  = bot_left - bot_right;

    block[0]                   = ((top_sum  + bot_sum )*qmul) >> 7;
    block[col_step]            = ((top_diff + bot_diff)*qmul) >> 7;
    block[row_step]            = ((top_sum  - bot_sum )*qmul) >> 7;
    block[row_step + col_step] = ((top_diff - bot_diff)*qmul) >> 7;
}
#if 0 | |||||
/**
 * In-place 2x2 butterfly transform of four chroma DC values, without scaling.
 *
 * The four values are read from block at element offsets 0, 16, 32 and 48
 * and the sums/differences are written back to the same positions.
 *
 * @param block coefficient array; only the four offsets above are touched
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step = 16*2;
    const int col_step = 16;
    /* Load everything before storing: the transform overwrites its inputs. */
    const int top_left  = block[0];
    const int top_right = block[col_step];
    const int bot_left  = block[row_step];
    const int bot_right = block[row_step + col_step];
    const int top_sum   = top_left + top_right;
    const int top_diff  = top_left - top_right;
    const int bot_sum   = bot_left + bot_right;
    const int bot_diff  = bot_left - bot_right;

    block[0]                   = top_sum  + bot_sum;
    block[col_step]            = top_diff + bot_diff;
    block[row_step]            = top_sum  - bot_sum;
    block[row_step + col_step] = top_diff - bot_diff;
}
#endif | |||||
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, | static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, | ||||
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | ||||
int src_x_offset, int src_y_offset, | int src_x_offset, int src_y_offset, | ||||
@@ -1283,17 +1196,19 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ | |||||
} | } | ||||
} | } | ||||
}else{ | }else{ | ||||
int chroma_qpu = h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]; | |||||
int chroma_qpv = h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]; | |||||
if(is_h264){ | if(is_h264){ | ||||
if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ]) | if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ]) | ||||
chroma_dc_dequant_idct_c(h->mb + 16*16 , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]); | |||||
h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+0*16, &h->mb_chroma_dc[0], chroma_qpu ); | |||||
if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ]) | if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ]) | ||||
chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]); | |||||
h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+4*16, &h->mb_chroma_dc[1], chroma_qpv ); | |||||
h->h264dsp.h264_idct_add8(dest, block_offset, | h->h264dsp.h264_idct_add8(dest, block_offset, | ||||
h->mb, uvlinesize, | h->mb, uvlinesize, | ||||
h->non_zero_count_cache); | h->non_zero_count_cache); | ||||
}else{ | }else{ | ||||
chroma_dc_dequant_idct_c(h->mb + 16*16 , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]); | |||||
chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]); | |||||
h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+0*16, &h->mb_chroma_dc[0], chroma_qpu ); | |||||
h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+4*16, &h->mb_chroma_dc[1], chroma_qpv ); | |||||
for(i=16; i<16+8; i++){ | for(i=16; i<16+8; i++){ | ||||
if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ | if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ | ||||
uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i]; | uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i]; | ||||
@@ -407,6 +407,7 @@ typedef struct H264Context{ | |||||
DECLARE_ALIGNED(16, DCTELEM, mb)[16*24]; | DECLARE_ALIGNED(16, DCTELEM, mb)[16*24]; | ||||
DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16]; | DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16]; | ||||
DECLARE_ALIGNED(16, DCTELEM, mb_chroma_dc)[2][4]; | |||||
DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb | DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb | ||||
/** | /** | ||||
@@ -1680,9 +1680,10 @@ decode_intra_mb: | |||||
if( cbp&0x30 ){ | if( cbp&0x30 ){ | ||||
int c; | int c; | ||||
AV_ZERO128(h->mb_chroma_dc); | |||||
for( c = 0; c < 2; c++ ) { | for( c = 0; c < 2; c++ ) { | ||||
//av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); | //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); | ||||
decode_cabac_residual_dc(h, h->mb + 256 + 16*4*c, 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4); | |||||
decode_cabac_residual_dc(h, h->mb_chroma_dc[c], 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4); | |||||
} | } | ||||
} | } | ||||
@@ -987,8 +987,9 @@ decode_intra_mb: | |||||
} | } | ||||
if(cbp&0x30){ | if(cbp&0x30){ | ||||
AV_ZERO128(h->mb_chroma_dc); | |||||
for(chroma_idx=0; chroma_idx<2; chroma_idx++) | for(chroma_idx=0; chroma_idx<2; chroma_idx++) | ||||
if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){ | |||||
if( decode_residual(h, gb, h->mb_chroma_dc[chroma_idx], CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){ | |||||
return -1; | return -1; | ||||
} | } | ||||
} | } | ||||
@@ -79,8 +79,7 @@ static const uint8_t luma_dc_field_scan[16]={ | |||||
}; | }; | ||||
/*
 * Scan table giving the storage position of each of the four chroma DC
 * coefficients.
 *
 * NOTE(review): this span is a patch hunk shown without +/- markers. Going by
 * the hunk header (8 old lines, 7 new lines), the two rows of multiples of 16
 * appear to be the removed initializer and the single row 0,1,2,3 the added
 * one -- confirm against the actual patch.
 */
static const uint8_t chroma_dc_scan[4]={ | static const uint8_t chroma_dc_scan[4]={ | ||||
(0+0*2)*16, (1+0*2)*16, | |||||
(0+1*2)*16, (1+1*2)*16, //FIXME | |||||
0,1,2,3 | |||||
}; | }; | ||||
// zigzag_scan8x8_cavlc[i] = zigzag_scan8x8[(i/4) + 16*(i%4)] | // zigzag_scan8x8_cavlc[i] = zigzag_scan8x8[(i/4) + 16*(i%4)] | ||||
@@ -283,6 +283,7 @@ void ff_h264dsp_init(H264DSPContext *c) | |||||
c->h264_idct_add8 = ff_h264_idct_add8_c; | c->h264_idct_add8 = ff_h264_idct_add8_c; | ||||
c->h264_idct_add16intra= ff_h264_idct_add16intra_c; | c->h264_idct_add16intra= ff_h264_idct_add16intra_c; | ||||
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c; | c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c; | ||||
c->h264_chroma_dc_dequant_idct= ff_chroma_dc_dequant_idct_c; | |||||
c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; | c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; | ||||
c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; | c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; | ||||
@@ -68,6 +68,7 @@ typedef struct H264DSPContext{ | |||||
void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | ||||
void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | ||||
void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul); | void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul); | ||||
void (*h264_chroma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul); | |||||
}H264DSPContext; | }H264DSPContext; | ||||
void ff_h264dsp_init(H264DSPContext *c); | void ff_h264dsp_init(H264DSPContext *c); | ||||
@@ -250,4 +250,26 @@ void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){ | |||||
output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); | output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); | ||||
output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); | output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); | ||||
} | } | ||||
#undef stride | |||||
} | |||||
/**
 * 2x2 inverse transform plus dequantization of four chroma DC values.
 *
 * The inputs are combined with a 2x2 butterfly; each result is multiplied by
 * qmul and shifted right by 7 before being stored.
 *
 * @param output coefficient array receiving the four results, written at
 *               element offsets 0, 16, 32 and 48
 * @param input  the four DC values, read from input[0] through input[3]
 * @param qmul   multiplier applied to every transformed value
 */
void ff_chroma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){
    const int row_step = 16*2;
    const int col_step = 16;
    /* All four inputs are consumed before the first store. */
    const int top_sum  = input[0] + input[1];
    const int top_diff = input[0] - input[1];
    const int bot_sum  = input[2] + input[3];
    const int bot_diff = input[2] - input[3];

    output[0]                   = ((top_sum  + bot_sum )*qmul) >> 7;
    output[col_step]            = ((top_diff + bot_diff)*qmul) >> 7;
    output[row_step]            = ((top_sum  - bot_sum )*qmul) >> 7;
    output[row_step + col_step] = ((top_diff - bot_diff)*qmul) >> 7;
}
@@ -671,11 +671,12 @@ static int svq3_decode_mb(H264Context *h, unsigned int mb_type) | |||||
} | } | ||||
if ((cbp & 0x30)) { | if ((cbp & 0x30)) { | ||||
AV_ZERO128(h->mb_chroma_dc); | |||||
for (i = 0; i < 2; ++i) { | for (i = 0; i < 2; ++i) { | ||||
if (svq3_decode_block(&s->gb, &h->mb[16*(16 + 4*i)], 0, 3)){ | |||||
av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma dc block\n"); | |||||
return -1; | |||||
} | |||||
if (svq3_decode_block(&s->gb, h->mb_chroma_dc[i], 0, 3)){ | |||||
av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma dc block\n"); | |||||
return -1; | |||||
} | |||||
} | } | ||||
if ((cbp & 0x20)) { | if ((cbp & 0x20)) { | ||||