No speed improvement, but necessary for some future stuff. Also opens up the possibility of asm chroma dc idct/dequant. Originally committed as revision 26349 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/n0.8)
| @@ -66,6 +66,7 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, | |||
| void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul); | |||
| void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp); | |||
| void ff_chroma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul); | |||
| void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); | |||
| void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, | |||
| @@ -246,93 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){ | |||
| return 0; | |||
| } | |||
| #if 0 | |||
| /** | |||
| * DCT transforms the 16 dc values. | |||
| * @param qp quantization parameter ??? FIXME | |||
| */ | |||
| static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){ /* in-place two-pass sum/difference transform of 16 values of block; each result is halved. NOTE(review): stride is not declared in this function - presumably a macro defined elsewhere; confirm */ | |||
| // const int qmul= dequant_coeff[qp][0]; | |||
| int i; | |||
| int temp[16]; //FIXME check if this is a good idea | |||
| static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; /* base offsets into block used when the second pass stores its results */ | |||
| static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; /* base offsets into block used when the first pass loads its inputs */ | |||
| for(i=0; i<4; i++){ /* pass 1: four inputs per iteration, results go to temp[4*i] .. temp[4*i+3] */ | |||
| const int offset= y_offset[i]; | |||
| const int z0= block[offset+stride*0] + block[offset+stride*4]; | |||
| const int z1= block[offset+stride*0] - block[offset+stride*4]; | |||
| const int z2= block[offset+stride*1] - block[offset+stride*5]; | |||
| const int z3= block[offset+stride*1] + block[offset+stride*5]; | |||
| temp[4*i+0]= z0+z3; | |||
| temp[4*i+1]= z1+z2; | |||
| temp[4*i+2]= z1-z2; | |||
| temp[4*i+3]= z0-z3; | |||
| } | |||
| for(i=0; i<4; i++){ /* pass 2: combines temp[i], temp[4+i], temp[8+i], temp[12+i] and writes back into block */ | |||
| const int offset= x_offset[i]; | |||
| const int z0= temp[4*0+i] + temp[4*2+i]; | |||
| const int z1= temp[4*0+i] - temp[4*2+i]; | |||
| const int z2= temp[4*1+i] - temp[4*3+i]; | |||
| const int z3= temp[4*1+i] + temp[4*3+i]; | |||
| block[stride*0 +offset]= (z0 + z3)>>1; /* every stored value is shifted right by one bit, with no rounding term */ | |||
| block[stride*2 +offset]= (z1 + z2)>>1; | |||
| block[stride*8 +offset]= (z1 - z2)>>1; | |||
| block[stride*10+offset]= (z0 - z3)>>1; | |||
| } | |||
| } | |||
| #endif | |||
| #undef xStride | |||
| #undef stride | |||
| static void chroma_dc_dequant_idct_c(DCTELEM *block, int qmul){ /* in-place 2x2 sum/difference transform of block[0], block[16], block[32], block[48]; each result is multiplied by qmul and shifted right by 7 */ | |||
| const int stride= 16*2; /* index step (32 elements) to the third and fourth values */ | |||
| const int xStride= 16; /* index step (16 elements) to the second and fourth values */ | |||
| int a,b,c,d,e; | |||
| a= block[stride*0 + xStride*0]; /* load the four inputs before any of them is overwritten */ | |||
| b= block[stride*0 + xStride*1]; | |||
| c= block[stride*1 + xStride*0]; | |||
| d= block[stride*1 + xStride*1]; | |||
| e= a-b; /* e = in0 - in1 */ | |||
| a= a+b; /* a = in0 + in1 */ | |||
| b= c-d; /* b = in2 - in3 */ | |||
| c= c+d; /* c = in2 + in3 */ | |||
| block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7; /* (in0+in1+in2+in3)*qmul >> 7; no rounding term is added before the shift */ | |||
| block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7; /* (in0-in1+in2-in3)*qmul >> 7 */ | |||
| block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7; /* (in0+in1-in2-in3)*qmul >> 7 */ | |||
| block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7; /* (in0-in1-in2+in3)*qmul >> 7 */ | |||
| } | |||
| #if 0 | |||
| static void chroma_dc_dct_c(DCTELEM *block){ /* in-place 2x2 sum/difference transform of block[0], block[16], block[32], block[48]; results are stored unscaled */ | |||
| const int stride= 16*2; /* index step (32 elements) to the third and fourth values */ | |||
| const int xStride= 16; /* index step (16 elements) to the second and fourth values */ | |||
| int a,b,c,d,e; | |||
| a= block[stride*0 + xStride*0]; /* load the four inputs before any of them is overwritten */ | |||
| b= block[stride*0 + xStride*1]; | |||
| c= block[stride*1 + xStride*0]; | |||
| d= block[stride*1 + xStride*1]; | |||
| e= a-b; /* e = in0 - in1 */ | |||
| a= a+b; /* a = in0 + in1 */ | |||
| b= c-d; /* b = in2 - in3 */ | |||
| c= c+d; /* c = in2 + in3 */ | |||
| block[stride*0 + xStride*0]= (a+c); /* in0+in1+in2+in3 */ | |||
| block[stride*0 + xStride*1]= (e+b); /* in0-in1+in2-in3 */ | |||
| block[stride*1 + xStride*0]= (a-c); /* in0+in1-in2-in3 */ | |||
| block[stride*1 + xStride*1]= (e-b); /* in0-in1-in2+in3 */ | |||
| } | |||
| #endif | |||
| static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list, | |||
| uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr, | |||
| int src_x_offset, int src_y_offset, | |||
| @@ -1283,17 +1196,19 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ | |||
| } | |||
| } | |||
| }else{ | |||
| int chroma_qpu = h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]; | |||
| int chroma_qpv = h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]; | |||
| if(is_h264){ | |||
| if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+0] ]) | |||
| chroma_dc_dequant_idct_c(h->mb + 16*16 , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]); | |||
| h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+0*16, &h->mb_chroma_dc[0], chroma_qpu ); | |||
| if(h->non_zero_count_cache[ scan8[CHROMA_DC_BLOCK_INDEX+1] ]) | |||
| chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]); | |||
| h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+4*16, &h->mb_chroma_dc[1], chroma_qpv ); | |||
| h->h264dsp.h264_idct_add8(dest, block_offset, | |||
| h->mb, uvlinesize, | |||
| h->non_zero_count_cache); | |||
| }else{ | |||
| chroma_dc_dequant_idct_c(h->mb + 16*16 , h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]); | |||
| chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]); | |||
| h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+0*16, &h->mb_chroma_dc[0], chroma_qpu ); | |||
| h->h264dsp.h264_chroma_dc_dequant_idct(h->mb + 16*16+4*16, &h->mb_chroma_dc[1], chroma_qpv ); | |||
| for(i=16; i<16+8; i++){ | |||
| if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ | |||
| uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i]; | |||
| @@ -407,6 +407,7 @@ typedef struct H264Context{ | |||
| DECLARE_ALIGNED(16, DCTELEM, mb)[16*24]; | |||
| DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16]; | |||
| DECLARE_ALIGNED(16, DCTELEM, mb_chroma_dc)[2][4]; | |||
| DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb | |||
| /** | |||
| @@ -1680,9 +1680,10 @@ decode_intra_mb: | |||
| if( cbp&0x30 ){ | |||
| int c; | |||
| AV_ZERO128(h->mb_chroma_dc); | |||
| for( c = 0; c < 2; c++ ) { | |||
| //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c ); | |||
| decode_cabac_residual_dc(h, h->mb + 256 + 16*4*c, 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4); | |||
| decode_cabac_residual_dc(h, h->mb_chroma_dc[c], 3, CHROMA_DC_BLOCK_INDEX+c, chroma_dc_scan, 4); | |||
| } | |||
| } | |||
| @@ -987,8 +987,9 @@ decode_intra_mb: | |||
| } | |||
| if(cbp&0x30){ | |||
| AV_ZERO128(h->mb_chroma_dc); | |||
| for(chroma_idx=0; chroma_idx<2; chroma_idx++) | |||
| if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){ | |||
| if( decode_residual(h, gb, h->mb_chroma_dc[chroma_idx], CHROMA_DC_BLOCK_INDEX+chroma_idx, chroma_dc_scan, NULL, 4) < 0){ | |||
| return -1; | |||
| } | |||
| } | |||
| @@ -79,8 +79,7 @@ static const uint8_t luma_dc_field_scan[16]={ | |||
| }; | |||
| static const uint8_t chroma_dc_scan[4]={ | |||
| (0+0*2)*16, (1+0*2)*16, | |||
| (0+1*2)*16, (1+1*2)*16, //FIXME | |||
| 0,1,2,3 | |||
| }; | |||
| // zigzag_scan8x8_cavlc[i] = zigzag_scan8x8[(i/4) + 16*(i%4)] | |||
| @@ -283,6 +283,7 @@ void ff_h264dsp_init(H264DSPContext *c) | |||
| c->h264_idct_add8 = ff_h264_idct_add8_c; | |||
| c->h264_idct_add16intra= ff_h264_idct_add16intra_c; | |||
| c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c; | |||
| c->h264_chroma_dc_dequant_idct= ff_chroma_dc_dequant_idct_c; | |||
| c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; | |||
| c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; | |||
| @@ -68,6 +68,7 @@ typedef struct H264DSPContext{ | |||
| void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
| void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
| void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul); | |||
| void (*h264_chroma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul); | |||
| }H264DSPContext; | |||
| void ff_h264dsp_init(H264DSPContext *c); | |||
| @@ -250,4 +250,26 @@ void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){ | |||
| output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); | |||
| output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); | |||
| } | |||
| #undef stride | |||
| } | |||
| void ff_chroma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){ /* 2x2 sum/difference transform of input[0..3]; each result is multiplied by qmul, shifted right by 7 and stored at output[0], output[16], output[32], output[48] */ | |||
| const int stride= 16*2; /* output index step (32 elements) to the third and fourth results */ | |||
| const int xStride= 16; /* output index step (16 elements) to the second and fourth results */ | |||
| int a,b,c,d,e; | |||
| a= input[0]; /* the four inputs are read from consecutive elements of input */ | |||
| b= input[1]; | |||
| c= input[2]; | |||
| d= input[3]; | |||
| e= a-b; /* e = in0 - in1 */ | |||
| a= a+b; /* a = in0 + in1 */ | |||
| b= c-d; /* b = in2 - in3 */ | |||
| c= c+d; /* c = in2 + in3 */ | |||
| output[stride*0 + xStride*0]= ((a+c)*qmul) >> 7; /* (in0+in1+in2+in3)*qmul >> 7; no rounding term is added before the shift */ | |||
| output[stride*0 + xStride*1]= ((e+b)*qmul) >> 7; /* (in0-in1+in2-in3)*qmul >> 7 */ | |||
| output[stride*1 + xStride*0]= ((a-c)*qmul) >> 7; /* (in0+in1-in2-in3)*qmul >> 7 */ | |||
| output[stride*1 + xStride*1]= ((e-b)*qmul) >> 7; /* (in0-in1-in2+in3)*qmul >> 7 */ | |||
| } | |||
| @@ -671,11 +671,12 @@ static int svq3_decode_mb(H264Context *h, unsigned int mb_type) | |||
| } | |||
| if ((cbp & 0x30)) { | |||
| AV_ZERO128(h->mb_chroma_dc); | |||
| for (i = 0; i < 2; ++i) { | |||
| if (svq3_decode_block(&s->gb, &h->mb[16*(16 + 4*i)], 0, 3)){ | |||
| av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma dc block\n"); | |||
| return -1; | |||
| } | |||
| if (svq3_decode_block(&s->gb, h->mb_chroma_dc[i], 0, 3)){ | |||
| av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding chroma dc block\n"); | |||
| return -1; | |||
| } | |||
| } | |||
| if ((cbp & 0x20)) { | |||