About 2.5x the speed. NOTE: the way that the asm code handles large qmuls is a bit suboptimal. If x264-style dequant were used (separate shift and qmul values), it might be possible to get some extra speed. Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk (tags/n0.8)
@@ -64,6 +64,10 @@ void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *bl | |||
void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); | |||
void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); | |||
void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul); | |||
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp); | |||
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); | |||
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, | |||
const float *win, float add_bias, int len); | |||
void ff_float_to_int16_c(int16_t *dst, const float *src, long len); | |||
@@ -246,46 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){ | |||
return 0; | |||
} | |||
/** | |||
* IDCT transforms the 16 dc values and dequantizes them. | |||
* @param qp quantization parameter | |||
*/ | |||
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){ | |||
#define stride 16 | |||
int i; | |||
int temp[16]; //FIXME check if this is a good idea | |||
static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride}; | |||
static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride}; | |||
//memset(block, 64, 2*256); | |||
//return; | |||
for(i=0; i<4; i++){ | |||
const int offset= y_offset[i]; | |||
const int z0= block[offset+stride*0] + block[offset+stride*4]; | |||
const int z1= block[offset+stride*0] - block[offset+stride*4]; | |||
const int z2= block[offset+stride*1] - block[offset+stride*5]; | |||
const int z3= block[offset+stride*1] + block[offset+stride*5]; | |||
temp[4*i+0]= z0+z3; | |||
temp[4*i+1]= z1+z2; | |||
temp[4*i+2]= z1-z2; | |||
temp[4*i+3]= z0-z3; | |||
} | |||
for(i=0; i<4; i++){ | |||
const int offset= x_offset[i]; | |||
const int z0= temp[4*0+i] + temp[4*2+i]; | |||
const int z1= temp[4*0+i] - temp[4*2+i]; | |||
const int z2= temp[4*1+i] - temp[4*3+i]; | |||
const int z3= temp[4*1+i] + temp[4*3+i]; | |||
block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual | |||
block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); | |||
block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); | |||
block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); | |||
} | |||
} | |||
#if 0 | |||
/** | |||
* DCT transforms the 16 dc values. | |||
@@ -1245,9 +1205,15 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){ | |||
h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize); | |||
if(is_h264){ | |||
if(!transform_bypass) | |||
h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]); | |||
h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]); | |||
else{ | |||
static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16, | |||
8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16}; | |||
for(i = 0; i < 16; i++) | |||
h->mb[dc_mapping[i]] = h->mb_luma_dc[i]; | |||
} | |||
}else | |||
ff_svq3_luma_dc_dequant_idct_c(h->mb, s->qscale); | |||
ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale); | |||
} | |||
if(h->deblocking_filter) | |||
xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple); | |||
@@ -406,6 +406,7 @@ typedef struct H264Context{ | |||
GetBitContext *inter_gb_ptr; | |||
DECLARE_ALIGNED(16, DCTELEM, mb)[16*24]; | |||
DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16]; | |||
DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb | |||
/** | |||
@@ -600,10 +601,6 @@ typedef struct H264Context{ | |||
extern const uint8_t ff_h264_chroma_qp[52]; | |||
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp); | |||
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc); | |||
/** | |||
* Decode SEI | |||
*/ | |||
@@ -1597,17 +1597,15 @@ decode_intra_mb: | |||
s->current_picture.mb_type[mb_xy]= mb_type; | |||
if( cbp || IS_INTRA16x16( mb_type ) ) { | |||
const uint8_t *scan, *scan8x8, *dc_scan; | |||
const uint8_t *scan, *scan8x8; | |||
const uint32_t *qmul; | |||
if(IS_INTERLACED(mb_type)){ | |||
scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0; | |||
scan= s->qscale ? h->field_scan : h->field_scan_q0; | |||
dc_scan= luma_dc_field_scan; | |||
}else{ | |||
scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0; | |||
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0; | |||
dc_scan= luma_dc_zigzag_scan; | |||
} | |||
// decode_cabac_mb_dqp | |||
@@ -1642,7 +1640,9 @@ decode_intra_mb: | |||
if( IS_INTRA16x16( mb_type ) ) { | |||
int i; | |||
//av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" ); | |||
decode_cabac_residual_dc( h, h->mb, 0, 0, dc_scan, 16); | |||
AV_ZERO128(h->mb_luma_dc+0); | |||
AV_ZERO128(h->mb_luma_dc+8); | |||
decode_cabac_residual_dc( h, h->mb_luma_dc, 0, 0, scan, 16); | |||
if( cbp&15 ) { | |||
qmul = h->dequant4_coeff[0][s->qscale]; | |||
@@ -911,16 +911,14 @@ decode_intra_mb: | |||
int i8x8, i4x4, chroma_idx; | |||
int dquant; | |||
GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr; | |||
const uint8_t *scan, *scan8x8, *dc_scan; | |||
const uint8_t *scan, *scan8x8; | |||
if(IS_INTERLACED(mb_type)){ | |||
scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0; | |||
scan= s->qscale ? h->field_scan : h->field_scan_q0; | |||
dc_scan= luma_dc_field_scan; | |||
}else{ | |||
scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0; | |||
scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0; | |||
dc_scan= luma_dc_zigzag_scan; | |||
} | |||
dquant= get_se_golomb(&s->gb); | |||
@@ -939,7 +937,9 @@ decode_intra_mb: | |||
h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale); | |||
h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale); | |||
if(IS_INTRA16x16(mb_type)){ | |||
if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){ | |||
AV_ZERO128(h->mb_luma_dc+0); | |||
AV_ZERO128(h->mb_luma_dc+8); | |||
if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){ | |||
return -1; //FIXME continue if partitioned and other return -1 too | |||
} | |||
@@ -282,6 +282,7 @@ void ff_h264dsp_init(H264DSPContext *c) | |||
c->h264_idct8_add4 = ff_h264_idct8_add4_c; | |||
c->h264_idct_add8 = ff_h264_idct_add8_c; | |||
c->h264_idct_add16intra= ff_h264_idct_add16intra_c; | |||
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c; | |||
c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c; | |||
c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c; | |||
@@ -65,11 +65,13 @@ typedef struct H264DSPContext{ | |||
void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); | |||
void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride); | |||
void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); | |||
void (*h264_dct)(DCTELEM block[4][4]); | |||
void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul); | |||
}H264DSPContext; | |||
void ff_h264dsp_init(H264DSPContext *c); | |||
@@ -216,3 +216,38 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block | |||
ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride); | |||
} | |||
} | |||
/** | |||
* IDCT transforms the 16 dc values and dequantizes them. | |||
* @param qp quantization parameter | |||
*/ | |||
void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){ | |||
#define stride 16 | |||
int i; | |||
int temp[16]; | |||
static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride}; | |||
for(i=0; i<4; i++){ | |||
const int z0= input[4*i+0] + input[4*i+1]; | |||
const int z1= input[4*i+0] - input[4*i+1]; | |||
const int z2= input[4*i+2] - input[4*i+3]; | |||
const int z3= input[4*i+2] + input[4*i+3]; | |||
temp[4*i+0]= z0+z3; | |||
temp[4*i+1]= z0-z3; | |||
temp[4*i+2]= z1-z2; | |||
temp[4*i+3]= z1+z2; | |||
} | |||
for(i=0; i<4; i++){ | |||
const int offset= x_offset[i]; | |||
const int z0= temp[4*0+i] + temp[4*2+i]; | |||
const int z1= temp[4*0+i] - temp[4*2+i]; | |||
const int z2= temp[4*1+i] - temp[4*3+i]; | |||
const int z3= temp[4*1+i] + temp[4*3+i]; | |||
output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); | |||
output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8)); | |||
output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8)); | |||
output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8)); | |||
} | |||
} |
@@ -126,21 +126,19 @@ static const uint32_t svq3_dequant_coeff[32] = { | |||
}; | |||
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp) | |||
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp) | |||
{ | |||
const int qmul = svq3_dequant_coeff[qp]; | |||
#define stride 16 | |||
int i; | |||
int temp[16]; | |||
static const int x_offset[4] = {0, 1*stride, 4* stride, 5*stride}; | |||
static const int y_offset[4] = {0, 2*stride, 8* stride, 10*stride}; | |||
for (i = 0; i < 4; i++){ | |||
const int offset = y_offset[i]; | |||
const int z0 = 13*(block[offset+stride*0] + block[offset+stride*4]); | |||
const int z1 = 13*(block[offset+stride*0] - block[offset+stride*4]); | |||
const int z2 = 7* block[offset+stride*1] - 17*block[offset+stride*5]; | |||
const int z3 = 17* block[offset+stride*1] + 7*block[offset+stride*5]; | |||
const int z0= 13*(input[4*i+0] + input[4*i+1]); | |||
const int z1= 13*(input[4*i+0] - input[4*i+1]); | |||
const int z2= 7* input[4*i+2] - 17*input[4*i+3]; | |||
const int z3= 17* input[4*i+2] + 7*input[4*i+3]; | |||
temp[4*i+0] = z0+z3; | |||
temp[4*i+1] = z1+z2; | |||
@@ -155,10 +153,10 @@ void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp) | |||
const int z2 = 7* temp[4*1+i] - 17*temp[4*3+i]; | |||
const int z3 = 17* temp[4*1+i] + 7*temp[4*3+i]; | |||
block[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20; | |||
block[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20; | |||
block[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20; | |||
block[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20; | |||
output[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20; | |||
output[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20; | |||
output[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20; | |||
output[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20; | |||
} | |||
} | |||
#undef stride | |||
@@ -41,6 +41,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; | |||
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] = | |||
{0x8000000080000000ULL, 0x8000000080000000ULL}; | |||
DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL; | |||
/* 128-bit constant: use 16-byte alignment like the other xmm_reg constants
 * (ff_pw_4, ff_pw_5) so it is usable as an aligned 128-bit memory operand */
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL}; | |||
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL}; | |||
@@ -47,6 +47,7 @@ scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8 | |||
%endif | |||
cextern pw_32 | |||
cextern pw_1 | |||
SECTION .text | |||
@@ -854,3 +855,156 @@ cglobal h264_idct_add8_sse2, 5, 7, 8 | |||
add8_sse2_cycle 2, 0x21 | |||
add8_sse2_cycle 3, 0x29 | |||
RET | |||
;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul) | |||
; WALSH4_1D a, b, c, d, tmp
; One 1-D pass of the 4-point transform over the MMX registers numbered by
; args 1-4: two SUMSUB_BADC butterfly stages followed by a SWAP of the register
; numbers. Arg 5 is handed to SUMSUB_BADC as its fifth operand.
; SUMSUB_BADC and SWAP are defined outside this chunk.
; NOTE(review): SWAP presumably only renames registers at assembly time and
; emits no instruction - confirm against x86inc.
%macro WALSH4_1D 5 | |||
SUMSUB_BADC m%4, m%3, m%2, m%1, m%5 | |||
SUMSUB_BADC m%4, m%2, m%3, m%1, m%5 | |||
SWAP %1, %4, %3 | |||
%endmacro | |||
; DEQUANT_MMX reg_a, reg_b, shift
; Dequantizes, in place, the eight signed 16-bit values held in the two MMX
; registers given as args 1 and 2.
; Every value c is interleaved with a word from pw_1, so the pmaddwd against
; the dword taken from t3d computes c*(low word of t3d) + (pw_1 word)*(high word of t3d).
; The 32-bit results are shifted right arithmetically by arg 3 (an immediate or
; an MMX register holding the count) and packed back to 16 bits with signed
; saturation.
; Clobbers m4, m5 and m7.
%macro DEQUANT_MMX 3 | |||
mova m7, [pw_1] | |||
mova m4, %1 | |||
; low two words stay in arg 1, high two words go to m4, each paired with a pw_1 word
punpcklwd %1, m7 | |||
punpckhwd m4, m7 | |||
mova m5, %2 | |||
punpcklwd %2, m7 | |||
punpckhwd m5, m7 | |||
; broadcast the 32-bit multiplier pair in t3d to both dwords of m7
movd m7, t3d | |||
punpckldq m7, m7 | |||
pmaddwd %1, m7 | |||
pmaddwd %2, m7 | |||
pmaddwd m4, m7 | |||
pmaddwd m5, m7 | |||
psrad %1, %3 | |||
psrad %2, %3 | |||
psrad m4, %3 | |||
psrad m5, %3 | |||
; recombine low and high halves into four saturated words per register
packssdw %1, m4 | |||
packssdw %2, m5 | |||
%endmacro | |||
; STORE_WORDS_MMX reg, i0, i1, i2, i3
; Scatters the four 16-bit words of the MMX register in arg 1 to memory:
; word 0 goes to [t2 + i0*32], word 1 to [t2 + i1*32], word 2 to [t2 + i2*32]
; and word 3 to [t2 + i3*32] (i0..i3 are args 2-5).
; The 32-byte step is 16 two-byte elements, the same spacing the C version
; uses with its stride of 16.
; Destroys arg 1 and clobbers t0 and t1.
%macro STORE_WORDS_MMX 5 | |||
; t0d = words 0 and 1, t1d = words 2 and 3
movd t0d, %1 | |||
psrlq %1, 32 | |||
movd t1d, %1 | |||
mov [t2+%2*32], t0w | |||
mov [t2+%4*32], t1w | |||
; bring the odd words down into the low 16 bits
shr t0d, 16 | |||
shr t1d, 16 | |||
mov [t2+%3*32], t0w | |||
mov [t2+%5*32], t1w | |||
%endmacro | |||
; DEQUANT_STORE_MMX shift
; Dequantizes the 16 values held in m0-m3 (two registers per DEQUANT_MMX call,
; arg 1 being the right-shift count) and scatters them with STORE_WORDS_MMX.
; The index lists give the destination of each word in units of 32 bytes from t2:
; m0 goes to 0,1,4,5; m1 to 2,3,6,7; m2 to 8,9,12,13; m3 to 10,11,14,15.
%macro DEQUANT_STORE_MMX 1 | |||
DEQUANT_MMX m0, m1, %1 | |||
STORE_WORDS_MMX m0, 0, 1, 4, 5 | |||
STORE_WORDS_MMX m1, 2, 3, 6, 7 | |||
DEQUANT_MMX m2, m3, %1 | |||
STORE_WORDS_MMX m2, 8, 9, 12, 13 | |||
STORE_WORDS_MMX m3, 10, 11, 14, 15 | |||
%endmacro | |||
; STORE_WORDS_SSE reg, i0, i1, i2, i3, i4, i5, i6, i7
; Scatters the eight 16-bit words of the XMM register in arg 1 to memory:
; word k is stored to [t2 + ik*32], where i0..i7 are args 2-9 in order.
; Destroys arg 1 and clobbers t0 and t1.
%macro STORE_WORDS_SSE 9 | |||
; t0d = words 0 and 1, t1d = words 2 and 3
movd t0d, %1 | |||
psrldq %1, 4 | |||
movd t1d, %1 | |||
psrldq %1, 4 | |||
mov [t2+%2*32], t0w | |||
mov [t2+%4*32], t1w | |||
shr t0d, 16 | |||
shr t1d, 16 | |||
mov [t2+%3*32], t0w | |||
mov [t2+%5*32], t1w | |||
; t0d = words 4 and 5, t1d = words 6 and 7
movd t0d, %1 | |||
psrldq %1, 4 | |||
movd t1d, %1 | |||
mov [t2+%6*32], t0w | |||
mov [t2+%8*32], t1w | |||
shr t0d, 16 | |||
shr t1d, 16 | |||
mov [t2+%7*32], t0w | |||
mov [t2+%9*32], t1w | |||
%endmacro | |||
; DEQUANT_STORE_SSE2 shift
; SSE2 counterpart of the MMX dequant-and-store step: takes the 16 values from
; the MMX registers m0-m3, dequantizes all of them in xmm0-xmm3 and scatters
; the results relative to t2.
; Every value c is interleaved with a word from pw_1, so the pmaddwd against
; the broadcast dword from t3d computes c*(low word of t3d) + (pw_1 word)*(high word of t3d).
; Arg 1 is the arithmetic right-shift count (an immediate or an XMM register).
; Uses xmm0-xmm5 explicitly.
%macro DEQUANT_STORE_SSE2 1 | |||
; xmm4 = t3d replicated into all four dwords, xmm5 = low 8 bytes of pw_1
movd xmm4, t3d | |||
movq xmm5, [pw_1] | |||
pshufd xmm4, xmm4, 0 | |||
; move each row of four words from the MMX register into the low half of an XMM register
movq2dq xmm0, m0 | |||
movq2dq xmm1, m1 | |||
movq2dq xmm2, m2 | |||
movq2dq xmm3, m3 | |||
punpcklwd xmm0, xmm5 | |||
punpcklwd xmm1, xmm5 | |||
punpcklwd xmm2, xmm5 | |||
punpcklwd xmm3, xmm5 | |||
pmaddwd xmm0, xmm4 | |||
pmaddwd xmm1, xmm4 | |||
pmaddwd xmm2, xmm4 | |||
pmaddwd xmm3, xmm4 | |||
psrad xmm0, %1 | |||
psrad xmm1, %1 | |||
psrad xmm2, %1 | |||
psrad xmm3, %1 | |||
; xmm0 = rows from m0 (low half) and m1 (high half), xmm2 = rows from m2 and m3
packssdw xmm0, xmm1 | |||
packssdw xmm2, xmm3 | |||
; same destinations as the MMX path: m0 to 0,1,4,5 and m1 to 2,3,6,7, etc.
STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7 | |||
STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15 | |||
%endmacro | |||
; IDCT_DC_DEQUANT suffix, xmm_count
; Emits h264_luma_dc_dequant_idct_SUFFIX(DCTELEM *output, DCTELEM *input, int qmul):
; r0 = output, r1 = input, r2 = qmul on entry. Arg 2 is forwarded as the fourth
; cglobal argument.
; The 16 input words are transformed in MMX registers (1-D pass, transpose,
; 1-D pass), then multiplied by qmul with rounding, shifted and scattered to
; output by DEQUANT_STORE_MMX or DEQUANT_STORE_SSE2.
%macro IDCT_DC_DEQUANT 2 | |||
cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2 | |||
; load the 4x4 block of 16-bit inputs, one 8-byte row per MMX register
movq m3, [r1+24] | |||
movq m2, [r1+16] | |||
movq m1, [r1+ 8] | |||
movq m0, [r1+ 0] | |||
WALSH4_1D 0,1,2,3,4 | |||
TRANSPOSE4x4W 0,1,2,3,4 | |||
WALSH4_1D 0,1,2,3,4 | |||
; temp register roles, in order t0..t3:
; shift, tmp, output, qmul | |||
%ifdef WIN64 | |||
DECLARE_REG_TMP 0,3,1,2 | |||
; we can't avoid this, because r0 is the shift register (ecx) on win64 | |||
; t2 is r1 here, so this moves the output pointer from r0 into t2
xchg r0, t2 | |||
%elifdef ARCH_X86_64 | |||
DECLARE_REG_TMP 3,1,0,2 | |||
%else | |||
DECLARE_REG_TMP 1,3,0,2 | |||
%endif | |||
; qmul up to 32767 fits the signed 16-bit multiplier word used by pmaddwd
cmp t3d, 32767 | |||
jg .big_qmul | |||
; put the rounding term 128 into the high word, so the pmaddwd computes
; value*qmul + 128; together with the shift by 8 this matches the C version
add t3d, 128 << 16 | |||
%ifidn %1,mmx | |||
DEQUANT_STORE_MMX 8 | |||
%else | |||
DEQUANT_STORE_SSE2 8 | |||
%endif | |||
RET | |||
.big_qmul: | |||
; t0 = bit index of the highest set bit of qmul
bsr t0d, t3d | |||
add t3d, 128 << 16 | |||
; t0 = min(t0, 7)
; NOTE(review): cmovg is a P6-class instruction; confirm the mmx variant is
; never selected on a CPU that has MMX but lacks CMOV.
mov t1d, 7 | |||
cmp t0d, t1d | |||
cmovg t0d, t1d | |||
; t1 = 8
inc t1d | |||
; scale qmul and the rounding term down together by t0 bits
shr t3d, t0b | |||
; t1 = 8 - t0: the right shift that remains to be applied after the multiply
sub t1d, t0d | |||
%ifidn %1,mmx | |||
movd m6, t1d | |||
DEQUANT_STORE_MMX m6 | |||
%else | |||
movd xmm6, t1d | |||
DEQUANT_STORE_SSE2 xmm6 | |||
%endif | |||
RET | |||
%endmacro | |||
; Instantiate the mmx and sse2 variants. Both are assembled under INIT_MMX
; because the transform itself runs in MMX registers; the sse2 variant names
; its xmm registers explicitly in the dequant/store step.
; The second argument is forwarded to cglobal as its fourth argument.
INIT_MMX | |||
IDCT_DC_DEQUANT mmx, 0 | |||
IDCT_DC_DEQUANT sse2, 7 | |||
@@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM | |||
int stride, const uint8_t nnzc[6*8]); | |||
void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block, | |||
int stride, const uint8_t nnzc[6*8]); | |||
void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul); | |||
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul); | |||
/***********************************/ | |||
/* deblocking */ | |||
@@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c) | |||
c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; | |||
c->h264_idct_add8 = ff_h264_idct_add8_mmx; | |||
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; | |||
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx; | |||
if (mm_flags & AV_CPU_FLAG_MMX2) { | |||
c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; | |||
@@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c) | |||
if (mm_flags&AV_CPU_FLAG_SSE2) { | |||
c->h264_idct8_add = ff_h264_idct8_add_sse2; | |||
c->h264_idct8_add4= ff_h264_idct8_add4_sse2; | |||
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2; | |||
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; | |||
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; | |||