About 2.5x the speed. NOTE: the way the asm code handles large qmuls is a bit suboptimal. If x264-style dequant were used (separate shift and qmul values), it might be possible to get some extra speed. Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/n0.8)
| @@ -64,6 +64,10 @@ void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *bl | |||
| void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); | |||
| void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); | |||
| void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul); | |||
| void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp); | |||
| void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); | |||
| void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, | |||
| const float *win, float add_bias, int len); | |||
| void ff_float_to_int16_c(int16_t *dst, const float *src, long len); | |||
| @@ -246,46 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){ | |||
| return 0; | |||
| } | |||
| /** | |||
| * Inverse-transforms the 16 luma DC values in place and dequantizes them. | |||
| * The DC values are read from, and the results written back to, block at | |||
| * element offsets that are multiples of 16 (see x_offset[] and y_offset[]). | |||
| * Each result is stored as (x*qmul + 128) >> 8. | |||
| * @param block coefficient buffer, transformed in place | |||
| * @param qp quantization parameter (not referenced in the function body) | |||
| * @param qmul multiplier applied to each transformed value before the shift | |||
| */ | |||
| static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){ | |||
| #define stride 16 /* element distance between DC slots; NOTE(review): no matching #undef is visible in this hunk */ | |||
| int i; | |||
| int temp[16]; //FIXME check if this is a good idea | |||
| static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; | |||
| static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; | |||
| //memset(block, 64, 2*256); | |||
| //return; | |||
| for(i=0; i<4; i++){ /* first pass: butterflies on block[], results kept in temp[] */ | |||
| const int offset= y_offset[i]; | |||
| const int z0= block[offset+stride*0] + block[offset+stride*4]; | |||
| const int z1= block[offset+stride*0] - block[offset+stride*4]; | |||
| const int z2= block[offset+stride*1] - block[offset+stride*5]; | |||
| const int z3= block[offset+stride*1] + block[offset+stride*5]; | |||
| temp[4*i+0]= z0+z3; | |||
| temp[4*i+1]= z1+z2; | |||
| temp[4*i+2]= z1-z2; | |||
| temp[4*i+3]= z0-z3; | |||
| } | |||
| for(i=0; i<4; i++){ /* second pass: butterflies across temp[], then scale, round and store */ | |||
| const int offset= x_offset[i]; | |||
| const int z0= temp[4*0+i] + temp[4*2+i]; | |||
| const int z1= temp[4*0+i] - temp[4*2+i]; | |||
| const int z2= temp[4*1+i] - temp[4*3+i]; | |||
| const int z3= temp[4*1+i] + temp[4*3+i]; | |||
| block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual | |||
| block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); | |||
| block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); | |||
| block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); | |||
| } | |||
| } | |||
| #if 0 | |||
| /** | |||
| * DCT transforms the 16 dc values. | |||
| @@ -1245,9 +1205,15 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ | |||
| h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize); | |||
| if(is_h264){ | |||
| if(!transform_bypass) | |||
| h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]); | |||
| h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]); | |||
| else{ | |||
| static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16, | |||
| 8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16}; | |||
| for(i = 0; i < 16; i++) | |||
| h->mb[dc_mapping[i]] = h->mb_luma_dc[i]; | |||
| } | |||
| }else | |||
| ff_svq3_luma_dc_dequant_idct_c(h->mb, s->qscale); | |||
| ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale); | |||
| } | |||
| if(h->deblocking_filter) | |||
| xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple); | |||
| @@ -406,6 +406,7 @@ typedef struct H264Context{ | |||
| GetBitContext *inter_gb_ptr; | |||
| DECLARE_ALIGNED(16, DCTELEM, mb)[16*24]; | |||
| DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16]; | |||
| DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb | |||
| /** | |||
| @@ -600,10 +601,6 @@ typedef struct H264Context{ | |||
| extern const uint8_t ff_h264_chroma_qp[52]; | |||
| void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp); | |||
| void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); | |||
| /** | |||
| * Decode SEI | |||
| */ | |||
| @@ -1597,17 +1597,15 @@ decode_intra_mb: | |||
| s->current_picture.mb_type[mb_xy]= mb_type; | |||
| if( cbp || IS_INTRA16x16( mb_type ) ) { | |||
| const uint8_t *scan, *scan8x8, *dc_scan; | |||
| const uint8_t *scan, *scan8x8; | |||
| const uint32_t *qmul; | |||
| if(IS_INTERLACED(mb_type)){ | |||
| scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0; | |||
| scan= s->qscale ? h->field_scan : h->field_scan_q0; | |||
| dc_scan= luma_dc_field_scan; | |||
| }else{ | |||
| scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0; | |||
| scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0; | |||
| dc_scan= luma_dc_zigzag_scan; | |||
| } | |||
| // decode_cabac_mb_dqp | |||
| @@ -1642,7 +1640,9 @@ decode_intra_mb: | |||
| if( IS_INTRA16x16( mb_type ) ) { | |||
| int i; | |||
| //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" ); | |||
| decode_cabac_residual_dc( h, h->mb, 0, 0, dc_scan, 16); | |||
| AV_ZERO128(h->mb_luma_dc+0); | |||
| AV_ZERO128(h->mb_luma_dc+8); | |||
| decode_cabac_residual_dc( h, h->mb_luma_dc, 0, 0, scan, 16); | |||
| if( cbp&15 ) { | |||
| qmul = h->dequant4_coeff[0][s->qscale]; | |||
| @@ -911,16 +911,14 @@ decode_intra_mb: | |||
| int i8x8, i4x4, chroma_idx; | |||
| int dquant; | |||
| GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr; | |||
| const uint8_t *scan, *scan8x8, *dc_scan; | |||
| const uint8_t *scan, *scan8x8; | |||
| if(IS_INTERLACED(mb_type)){ | |||
| scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0; | |||
| scan= s->qscale ? h->field_scan : h->field_scan_q0; | |||
| dc_scan= luma_dc_field_scan; | |||
| }else{ | |||
| scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0; | |||
| scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0; | |||
| dc_scan= luma_dc_zigzag_scan; | |||
| } | |||
| dquant= get_se_golomb(&s->gb); | |||
| @@ -939,7 +937,9 @@ decode_intra_mb: | |||
| h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale); | |||
| h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale); | |||
| if(IS_INTRA16x16(mb_type)){ | |||
| if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){ | |||
| AV_ZERO128(h->mb_luma_dc+0); | |||
| AV_ZERO128(h->mb_luma_dc+8); | |||
| if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){ | |||
| return -1; //FIXME continue if partitioned and other return -1 too | |||
| } | |||
| @@ -282,6 +282,7 @@ void ff_h264dsp_init(H264DSPContext *c) | |||
| c->h264_idct8_add4 = ff_h264_idct8_add4_c; | |||
| c->h264_idct_add8 = ff_h264_idct_add8_c; | |||
| c->h264_idct_add16intra= ff_h264_idct_add16intra_c; | |||
| c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c; | |||
| c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; | |||
| c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; | |||
| @@ -65,11 +65,13 @@ typedef struct H264DSPContext{ | |||
| void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); | |||
| void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride); | |||
| void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); | |||
| void (*h264_dct)(DCTELEM block[4][4]); | |||
| void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
| void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
| void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
| void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
| void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul); | |||
| }H264DSPContext; | |||
| void ff_h264dsp_init(H264DSPContext *c); | |||
| @@ -216,3 +216,38 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block | |||
| ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); | |||
| } | |||
| } | |||
| /** | |||
| * Inverse-transforms the 16 luma DC values and dequantizes them. | |||
| * Reads 16 contiguous coefficients from input, runs one butterfly pass | |||
| * within each group of four consecutive inputs and a second pass across | |||
| * the groups, then stores every result as (x*qmul + 128) >> 8. | |||
| * Each of the 16 results lands in output at a distinct index k*16, k = 0..15. | |||
| * @param output destination buffer; only indices that are multiples of 16 are written | |||
| * @param input the 16 DC coefficients; not modified here | |||
| * @param qmul multiplier applied to each transformed value before the rounding shift | |||
| */ | |||
| void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){ | |||
| #define stride 16 /* element distance between output slots; NOTE(review): no matching #undef is visible in this hunk */ | |||
| int i; | |||
| int temp[16]; | |||
| static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride}; /* largest entry is 160, so uint8_t is wide enough */ | |||
| for(i=0; i<4; i++){ /* first pass: inputs 4*i .. 4*i+3 -> temp[4*i .. 4*i+3] */ | |||
| const int z0= input[4*i+0] + input[4*i+1]; | |||
| const int z1= input[4*i+0] - input[4*i+1]; | |||
| const int z2= input[4*i+2] - input[4*i+3]; | |||
| const int z3= input[4*i+2] + input[4*i+3]; | |||
| temp[4*i+0]= z0+z3; | |||
| temp[4*i+1]= z0-z3; | |||
| temp[4*i+2]= z1-z2; | |||
| temp[4*i+3]= z1+z2; | |||
| } | |||
| for(i=0; i<4; i++){ /* second pass: combine temp[i], temp[4+i], temp[8+i], temp[12+i]; scale, round, store */ | |||
| const int offset= x_offset[i]; | |||
| const int z0= temp[4*0+i] + temp[4*2+i]; | |||
| const int z1= temp[4*0+i] - temp[4*2+i]; | |||
| const int z2= temp[4*1+i] - temp[4*3+i]; | |||
| const int z3= temp[4*1+i] + temp[4*3+i]; | |||
| output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); | |||
| output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); | |||
| output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); | |||
| output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); | |||
| } | |||
| } | |||
| @@ -126,21 +126,19 @@ static const uint32_t svq3_dequant_coeff[32] = { | |||
| }; | |||
| void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp) | |||
| void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp) | |||
| { | |||
| const int qmul = svq3_dequant_coeff[qp]; | |||
| #define stride 16 | |||
| int i; | |||
| int temp[16]; | |||
| static const int x_offset[4] = {0, 1*stride, 4* stride, 5*stride}; | |||
| static const int y_offset[4] = {0, 2*stride, 8* stride, 10*stride}; | |||
| for (i = 0; i < 4; i++){ | |||
| const int offset = y_offset[i]; | |||
| const int z0 = 13*(block[offset+stride*0] + block[offset+stride*4]); | |||
| const int z1 = 13*(block[offset+stride*0] - block[offset+stride*4]); | |||
| const int z2 = 7* block[offset+stride*1] - 17*block[offset+stride*5]; | |||
| const int z3 = 17* block[offset+stride*1] + 7*block[offset+stride*5]; | |||
| const int z0= 13*(input[4*i+0] + input[4*i+1]); | |||
| const int z1= 13*(input[4*i+0] - input[4*i+1]); | |||
| const int z2= 7* input[4*i+2] - 17*input[4*i+3]; | |||
| const int z3= 17* input[4*i+2] + 7*input[4*i+3]; | |||
| temp[4*i+0] = z0+z3; | |||
| temp[4*i+1] = z1+z2; | |||
| @@ -155,10 +153,10 @@ void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp) | |||
| const int z2 = 7* temp[4*1+i] - 17*temp[4*3+i]; | |||
| const int z3 = 17* temp[4*1+i] + 7*temp[4*3+i]; | |||
| block[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20; | |||
| block[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20; | |||
| block[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20; | |||
| block[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20; | |||
| output[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20; | |||
| output[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20; | |||
| output[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20; | |||
| output[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20; | |||
| } | |||
| } | |||
| #undef stride | |||
| @@ -41,6 +41,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; | |||
| DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = | |||
| {0x8000000080000000ULL, 0x8000000080000000ULL}; | |||
| DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL; | |||
| DECLARE_ALIGNED(8, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL}; | |||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; | |||
| DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; | |||
| @@ -47,6 +47,7 @@ scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8 | |||
| %endif | |||
| cextern pw_32 | |||
| cextern pw_1 | |||
| SECTION .text | |||
| @@ -854,3 +855,156 @@ cglobal h264_idct_add8_sse2, 5, 7, 8 | |||
| add8_sse2_cycle 2, 0x21 | |||
| add8_sse2_cycle 3, 0x29 | |||
| RET | |||
| ;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul) -- helper macros follow: DEQUANT_* interleave each word with pw_1, pmaddwd against the dword held in t3d, shift right and pack back to words; STORE_WORDS_* write one 16-bit word per slot at [t2 + slot*32] | |||
| %macro WALSH4_1D 5 | |||
| SUMSUB_BADC m%4, m%3, m%2, m%1, m%5 | |||
| SUMSUB_BADC m%4, m%2, m%3, m%1, m%5 | |||
| SWAP %1, %4, %3 | |||
| %endmacro | |||
| %macro DEQUANT_MMX 3 | |||
| mova m7, [pw_1] | |||
| mova m4, %1 | |||
| punpcklwd %1, m7 | |||
| punpckhwd m4, m7 | |||
| mova m5, %2 | |||
| punpcklwd %2, m7 | |||
| punpckhwd m5, m7 | |||
| movd m7, t3d | |||
| punpckldq m7, m7 | |||
| pmaddwd %1, m7 | |||
| pmaddwd %2, m7 | |||
| pmaddwd m4, m7 | |||
| pmaddwd m5, m7 | |||
| psrad %1, %3 | |||
| psrad %2, %3 | |||
| psrad m4, %3 | |||
| psrad m5, %3 | |||
| packssdw %1, m4 | |||
| packssdw %2, m5 | |||
| %endmacro | |||
| %macro STORE_WORDS_MMX 5 | |||
| movd t0d, %1 | |||
| psrlq %1, 32 | |||
| movd t1d, %1 | |||
| mov [t2+%2*32], t0w | |||
| mov [t2+%4*32], t1w | |||
| shr t0d, 16 | |||
| shr t1d, 16 | |||
| mov [t2+%3*32], t0w | |||
| mov [t2+%5*32], t1w | |||
| %endmacro | |||
| %macro DEQUANT_STORE_MMX 1 | |||
| DEQUANT_MMX m0, m1, %1 | |||
| STORE_WORDS_MMX m0, 0, 1, 4, 5 | |||
| STORE_WORDS_MMX m1, 2, 3, 6, 7 | |||
| DEQUANT_MMX m2, m3, %1 | |||
| STORE_WORDS_MMX m2, 8, 9, 12, 13 | |||
| STORE_WORDS_MMX m3, 10, 11, 14, 15 | |||
| %endmacro | |||
| %macro STORE_WORDS_SSE 9 | |||
| movd t0d, %1 | |||
| psrldq %1, 4 | |||
| movd t1d, %1 | |||
| psrldq %1, 4 | |||
| mov [t2+%2*32], t0w | |||
| mov [t2+%4*32], t1w | |||
| shr t0d, 16 | |||
| shr t1d, 16 | |||
| mov [t2+%3*32], t0w | |||
| mov [t2+%5*32], t1w | |||
| movd t0d, %1 | |||
| psrldq %1, 4 | |||
| movd t1d, %1 | |||
| mov [t2+%6*32], t0w | |||
| mov [t2+%8*32], t1w | |||
| shr t0d, 16 | |||
| shr t1d, 16 | |||
| mov [t2+%7*32], t0w | |||
| mov [t2+%9*32], t1w | |||
| %endmacro | |||
| %macro DEQUANT_STORE_SSE2 1 | |||
| movd xmm4, t3d | |||
| movq xmm5, [pw_1] | |||
| pshufd xmm4, xmm4, 0 | |||
| movq2dq xmm0, m0 | |||
| movq2dq xmm1, m1 | |||
| movq2dq xmm2, m2 | |||
| movq2dq xmm3, m3 | |||
| punpcklwd xmm0, xmm5 | |||
| punpcklwd xmm1, xmm5 | |||
| punpcklwd xmm2, xmm5 | |||
| punpcklwd xmm3, xmm5 | |||
| pmaddwd xmm0, xmm4 | |||
| pmaddwd xmm1, xmm4 | |||
| pmaddwd xmm2, xmm4 | |||
| pmaddwd xmm3, xmm4 | |||
| psrad xmm0, %1 | |||
| psrad xmm1, %1 | |||
| psrad xmm2, %1 | |||
| psrad xmm3, %1 | |||
| packssdw xmm0, xmm1 | |||
| packssdw xmm2, xmm3 | |||
| STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7 | |||
| STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15 | |||
| %endmacro | |||
| %macro IDCT_DC_DEQUANT 2 | |||
| cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2 | |||
| movq m3, [r1+24] | |||
| movq m2, [r1+16] | |||
| movq m1, [r1+ 8] | |||
| movq m0, [r1+ 0] | |||
| WALSH4_1D 0,1,2,3,4 | |||
| TRANSPOSE4x4W 0,1,2,3,4 | |||
| WALSH4_1D 0,1,2,3,4 | |||
| ; roles of the temp registers declared below, in DECLARE_REG_TMP order: t0 = shift, t1 = tmp, t2 = output, t3 = qmul | |||
| %ifdef WIN64 | |||
| DECLARE_REG_TMP 0,3,1,2 | |||
| ; we can't avoid this xchg, because r0 is the shift register (ecx) on win64 and is needed as the count for "shr t3d, t0b"; the output pointer therefore moves from r0 into t2 | |||
| xchg r0, t2 | |||
| %elifdef ARCH_X86_64 | |||
| DECLARE_REG_TMP 3,1,0,2 | |||
| %else | |||
| DECLARE_REG_TMP 1,3,0,2 | |||
| %endif | |||
| cmp t3d, 32767 | |||
| jg .big_qmul | |||
| add t3d, 128 << 16 | |||
| %ifidn %1,mmx | |||
| DEQUANT_STORE_MMX 8 | |||
| %else | |||
| DEQUANT_STORE_SSE2 8 | |||
| %endif | |||
| RET | |||
| .big_qmul: | |||
| bsr t0d, t3d | |||
| add t3d, 128 << 16 | |||
| mov t1d, 7 | |||
| cmp t0d, t1d | |||
| cmovg t0d, t1d | |||
| inc t1d | |||
| shr t3d, t0b | |||
| sub t1d, t0d | |||
| %ifidn %1,mmx | |||
| movd m6, t1d | |||
| DEQUANT_STORE_MMX m6 | |||
| %else | |||
| movd xmm6, t1d | |||
| DEQUANT_STORE_SSE2 xmm6 | |||
| %endif | |||
| RET | |||
| %endmacro | |||
| INIT_MMX | |||
| IDCT_DC_DEQUANT mmx, 0 | |||
| IDCT_DC_DEQUANT sse2, 7 | |||
| @@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM | |||
| int stride, const uint8_t nnzc[6*8]); | |||
| void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block, | |||
| int stride, const uint8_t nnzc[6*8]); | |||
| void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul); | |||
| void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul); | |||
| /***********************************/ | |||
| /* deblocking */ | |||
| @@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c) | |||
| c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; | |||
| c->h264_idct_add8 = ff_h264_idct_add8_mmx; | |||
| c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; | |||
| c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx; | |||
| if (mm_flags & AV_CPU_FLAG_MMX2) { | |||
| c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; | |||
| @@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c) | |||
| if (mm_flags&AV_CPU_FLAG_SSE2) { | |||
| c->h264_idct8_add = ff_h264_idct8_add_sse2; | |||
| c->h264_idct8_add4= ff_h264_idct8_add4_sse2; | |||
| c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; | |||
| c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; | |||
| c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; | |||