About 2.5x the speed.

NOTE: the way that the asm code handles large qmuls is a bit suboptimal. If
x264-style dequant was used (separate shift and qmul values), it might be
possible to get some extra speed.

Originally committed as revision 26336 to svn://svn.ffmpeg.org/ffmpeg/trunk
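For reference, a minimal C sketch of the two dequant styles discussed in the
NOTE above; this is not part of the patch, and the helper names and the
mul/shift split are illustrative assumptions only:

    #include <stdint.h>

    /* Current style (as in ff_h264_luma_dc_dequant_idct_c below): one
     * multiply per DC coefficient with fixed +128 rounding and >>8.
     * When qmul does not fit in 16 bits, SIMD code has to widen to
     * 32-bit lanes or pre-shift qmul, which is what the .big_qmul path
     * in the new asm does. */
    static inline int dequant_dc_current(int dc, int qmul)
    {
        return (dc * qmul + 128) >> 8;
    }

    /* Hypothetical x264-style variant: the factor is pre-split into a
     * multiplier that always fits in 16 bits plus a per-QP shift, so a
     * packed 16-bit multiply would suffice for every QP. */
    static inline int dequant_dc_split(int dc, int16_t mul, int shift)
    {
        return (dc * mul + ((1 << shift) >> 1)) >> shift;
    }

Avoiding the .big_qmul branch entirely is presumably where the extra speed
mentioned in the NOTE would come from.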
@@ -64,6 +64,10 @@ void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *bl
 void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
 void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
+void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul);
+void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp);
+void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
                              const float *win, float add_bias, int len);
 void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
@@ -246,46 +246,6 @@ int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
     return 0;
 }
-/**
- * IDCT transforms the 16 dc values and dequantizes them.
- * @param qp quantization parameter
- */
-static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
-#define stride 16
-    int i;
-    int temp[16]; //FIXME check if this is a good idea
-    static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
-    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
-    //memset(block, 64, 2*256);
-    //return;
-    for(i=0; i<4; i++){
-        const int offset= y_offset[i];
-        const int z0= block[offset+stride*0] + block[offset+stride*4];
-        const int z1= block[offset+stride*0] - block[offset+stride*4];
-        const int z2= block[offset+stride*1] - block[offset+stride*5];
-        const int z3= block[offset+stride*1] + block[offset+stride*5];
-        temp[4*i+0]= z0+z3;
-        temp[4*i+1]= z1+z2;
-        temp[4*i+2]= z1-z2;
-        temp[4*i+3]= z0-z3;
-    }
-    for(i=0; i<4; i++){
-        const int offset= x_offset[i];
-        const int z0= temp[4*0+i] + temp[4*2+i];
-        const int z1= temp[4*0+i] - temp[4*2+i];
-        const int z2= temp[4*1+i] - temp[4*3+i];
-        const int z3= temp[4*1+i] + temp[4*3+i];
-        block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
-        block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
-        block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
-        block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
-    }
-}
 #if 0
 /**
  * DCT transforms the 16 dc values.
@@ -1245,9 +1205,15 @@ static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                 if(is_h264){
                     if(!transform_bypass)
-                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
+                        h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
+                    else{
+                        static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
+                                                                8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
+                        for(i = 0; i < 16; i++)
+                            h->mb[dc_mapping[i]] = h->mb_luma_dc[i];
+                    }
                 }else
-                    ff_svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
+                    ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
             }
         if(h->deblocking_filter)
             xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
@@ -406,6 +406,7 @@ typedef struct H264Context{
     GetBitContext *inter_gb_ptr;
    DECLARE_ALIGNED(16, DCTELEM, mb)[16*24];
+    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16];
     DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
 /**
@@ -600,10 +601,6 @@ typedef struct H264Context{
 extern const uint8_t ff_h264_chroma_qp[52];
-void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
-void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 /**
  * Decode SEI
  */
@@ -1597,17 +1597,15 @@ decode_intra_mb:
     s->current_picture.mb_type[mb_xy]= mb_type;
     if( cbp || IS_INTRA16x16( mb_type ) ) {
-        const uint8_t *scan, *scan8x8, *dc_scan;
+        const uint8_t *scan, *scan8x8;
         const uint32_t *qmul;
         if(IS_INTERLACED(mb_type)){
            scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
            scan= s->qscale ? h->field_scan : h->field_scan_q0;
-            dc_scan= luma_dc_field_scan;
         }else{
            scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
-            dc_scan= luma_dc_zigzag_scan;
         }
         // decode_cabac_mb_dqp
@@ -1642,7 +1640,9 @@ decode_intra_mb:
         if( IS_INTRA16x16( mb_type ) ) {
             int i;
             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
-            decode_cabac_residual_dc( h, h->mb, 0, 0, dc_scan, 16);
+            AV_ZERO128(h->mb_luma_dc+0);
+            AV_ZERO128(h->mb_luma_dc+8);
+            decode_cabac_residual_dc( h, h->mb_luma_dc, 0, 0, scan, 16);
             if( cbp&15 ) {
                 qmul = h->dequant4_coeff[0][s->qscale];
@@ -911,16 +911,14 @@ decode_intra_mb:
         int i8x8, i4x4, chroma_idx;
         int dquant;
         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
-        const uint8_t *scan, *scan8x8, *dc_scan;
+        const uint8_t *scan, *scan8x8;
         if(IS_INTERLACED(mb_type)){
            scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
            scan= s->qscale ? h->field_scan : h->field_scan_q0;
-            dc_scan= luma_dc_field_scan;
         }else{
            scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
-            dc_scan= luma_dc_zigzag_scan;
         }
         dquant= get_se_golomb(&s->gb);
@@ -939,7 +937,9 @@ decode_intra_mb:
         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
         if(IS_INTRA16x16(mb_type)){
-            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
+            AV_ZERO128(h->mb_luma_dc+0);
+            AV_ZERO128(h->mb_luma_dc+8);
+            if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
                 return -1; //FIXME continue if partitioned and other return -1 too
             }
@@ -282,6 +282,7 @@ void ff_h264dsp_init(H264DSPContext *c)
     c->h264_idct8_add4 = ff_h264_idct8_add4_c;
     c->h264_idct_add8 = ff_h264_idct_add8_c;
     c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
+    c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c;
     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
@@ -65,11 +65,13 @@ typedef struct H264DSPContext{
     void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
     void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
     void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
     void (*h264_dct)(DCTELEM block[4][4]);
     void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
     void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
     void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
     void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
+    void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul);
 }H264DSPContext;
 void ff_h264dsp_init(H264DSPContext *c);
@@ -216,3 +216,38 @@ void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block
             ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
     }
 }
+/**
+ * IDCT transforms the 16 dc values and dequantizes them.
+ * @param qp quantization parameter
+ */
+void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){
+#define stride 16
+    int i;
+    int temp[16];
+    static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride};
+    for(i=0; i<4; i++){
+        const int z0= input[4*i+0] + input[4*i+1];
+        const int z1= input[4*i+0] - input[4*i+1];
+        const int z2= input[4*i+2] - input[4*i+3];
+        const int z3= input[4*i+2] + input[4*i+3];
+        temp[4*i+0]= z0+z3;
+        temp[4*i+1]= z0-z3;
+        temp[4*i+2]= z1-z2;
+        temp[4*i+3]= z1+z2;
+    }
+    for(i=0; i<4; i++){
+        const int offset= x_offset[i];
+        const int z0= temp[4*0+i] + temp[4*2+i];
+        const int z1= temp[4*0+i] - temp[4*2+i];
+        const int z2= temp[4*1+i] - temp[4*3+i];
+        const int z3= temp[4*1+i] + temp[4*3+i];
+        output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8));
+        output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
+        output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
+        output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
+    }
+}
@@ -126,21 +126,19 @@ static const uint32_t svq3_dequant_coeff[32] = {
 };
-void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
+void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp)
 {
     const int qmul = svq3_dequant_coeff[qp];
 #define stride 16
     int i;
     int temp[16];
     static const int x_offset[4] = {0, 1*stride, 4* stride, 5*stride};
-    static const int y_offset[4] = {0, 2*stride, 8* stride, 10*stride};
     for (i = 0; i < 4; i++){
-        const int offset = y_offset[i];
-        const int z0 = 13*(block[offset+stride*0] + block[offset+stride*4]);
-        const int z1 = 13*(block[offset+stride*0] - block[offset+stride*4]);
-        const int z2 = 7* block[offset+stride*1] - 17*block[offset+stride*5];
-        const int z3 = 17* block[offset+stride*1] + 7*block[offset+stride*5];
+        const int z0= 13*(input[4*i+0] + input[4*i+1]);
+        const int z1= 13*(input[4*i+0] - input[4*i+1]);
+        const int z2= 7* input[4*i+2] - 17*input[4*i+3];
+        const int z3= 17* input[4*i+2] + 7*input[4*i+3];
         temp[4*i+0] = z0+z3;
         temp[4*i+1] = z1+z2;
@@ -155,10 +153,10 @@ void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
         const int z2 = 7* temp[4*1+i] - 17*temp[4*3+i];
         const int z3 = 17* temp[4*1+i] + 7*temp[4*3+i];
-        block[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
-        block[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
-        block[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
-        block[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
+        output[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
+        output[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
+        output[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
+        output[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
     }
 }
 #undef stride
@@ -41,6 +41,7 @@ DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
 {0x8000000080000000ULL, 0x8000000080000000ULL};
+DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
 DECLARE_ALIGNED(8, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
@@ -47,6 +47,7 @@ scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
 %endif
 cextern pw_32
+cextern pw_1
 SECTION .text
@@ -854,3 +855,156 @@ cglobal h264_idct_add8_sse2, 5, 7, 8
     add8_sse2_cycle 2, 0x21
     add8_sse2_cycle 3, 0x29
     RET
+;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
+%macro WALSH4_1D 5
+    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
+    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
+    SWAP %1, %4, %3
+%endmacro
+%macro DEQUANT_MMX 3
+    mova        m7, [pw_1]
+    mova        m4, %1
+    punpcklwd   %1, m7
+    punpckhwd   m4, m7
+    mova        m5, %2
+    punpcklwd   %2, m7
+    punpckhwd   m5, m7
+    movd        m7, t3d
+    punpckldq   m7, m7
+    pmaddwd     %1, m7
+    pmaddwd     %2, m7
+    pmaddwd     m4, m7
+    pmaddwd     m5, m7
+    psrad       %1, %3
+    psrad       %2, %3
+    psrad       m4, %3
+    psrad       m5, %3
+    packssdw    %1, m4
+    packssdw    %2, m5
+%endmacro
+%macro STORE_WORDS_MMX 5
+    movd  t0d, %1
+    psrlq  %1, 32
+    movd  t1d, %1
+    mov [t2+%2*32], t0w
+    mov [t2+%4*32], t1w
+    shr   t0d, 16
+    shr   t1d, 16
+    mov [t2+%3*32], t0w
+    mov [t2+%5*32], t1w
+%endmacro
+%macro DEQUANT_STORE_MMX 1
+    DEQUANT_MMX m0, m1, %1
+    STORE_WORDS_MMX m0, 0, 1, 4, 5
+    STORE_WORDS_MMX m1, 2, 3, 6, 7
+    DEQUANT_MMX m2, m3, %1
+    STORE_WORDS_MMX m2, 8, 9, 12, 13
+    STORE_WORDS_MMX m3, 10, 11, 14, 15
+%endmacro
+%macro STORE_WORDS_SSE 9
+    movd  t0d, %1
+    psrldq %1, 4
+    movd  t1d, %1
+    psrldq %1, 4
+    mov [t2+%2*32], t0w
+    mov [t2+%4*32], t1w
+    shr   t0d, 16
+    shr   t1d, 16
+    mov [t2+%3*32], t0w
+    mov [t2+%5*32], t1w
+    movd  t0d, %1
+    psrldq %1, 4
+    movd  t1d, %1
+    mov [t2+%6*32], t0w
+    mov [t2+%8*32], t1w
+    shr   t0d, 16
+    shr   t1d, 16
+    mov [t2+%7*32], t0w
+    mov [t2+%9*32], t1w
+%endmacro
+%macro DEQUANT_STORE_SSE2 1
+    movd      xmm4, t3d
+    movq      xmm5, [pw_1]
+    pshufd    xmm4, xmm4, 0
+    movq2dq   xmm0, m0
+    movq2dq   xmm1, m1
+    movq2dq   xmm2, m2
+    movq2dq   xmm3, m3
+    punpcklwd xmm0, xmm5
+    punpcklwd xmm1, xmm5
+    punpcklwd xmm2, xmm5
+    punpcklwd xmm3, xmm5
+    pmaddwd   xmm0, xmm4
+    pmaddwd   xmm1, xmm4
+    pmaddwd   xmm2, xmm4
+    pmaddwd   xmm3, xmm4
+    psrad     xmm0, %1
+    psrad     xmm1, %1
+    psrad     xmm2, %1
+    psrad     xmm3, %1
+    packssdw  xmm0, xmm1
+    packssdw  xmm2, xmm3
+    STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7
+    STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15
+%endmacro
+%macro IDCT_DC_DEQUANT 2
+cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
+    movq        m3, [r1+24]
+    movq        m2, [r1+16]
+    movq        m1, [r1+ 8]
+    movq        m0, [r1+ 0]
+    WALSH4_1D    0,1,2,3,4
+    TRANSPOSE4x4W 0,1,2,3,4
+    WALSH4_1D    0,1,2,3,4
+; shift, tmp, output, qmul
+%ifdef WIN64
+    DECLARE_REG_TMP 0,3,1,2
+    ; we can't avoid this, because r0 is the shift register (ecx) on win64
+    xchg r0, t2
+%elifdef ARCH_X86_64
+    DECLARE_REG_TMP 3,1,0,2
+%else
+    DECLARE_REG_TMP 1,3,0,2
+%endif
+    cmp t3d, 32767
+    jg .big_qmul
+    add t3d, 128 << 16
+%ifidn %1,mmx
+    DEQUANT_STORE_MMX 8
+%else
+    DEQUANT_STORE_SSE2 8
+%endif
+    RET
+.big_qmul:
+    bsr t0d, t3d
+    add t3d, 128 << 16
+    mov t1d, 7
+    cmp t0d, t1d
+    cmovg t0d, t1d
+    inc t1d
+    shr t3d, t0b
+    sub t1d, t0d
+%ifidn %1,mmx
+    movd m6, t1d
+    DEQUANT_STORE_MMX m6
+%else
+    movd xmm6, t1d
+    DEQUANT_STORE_SSE2 xmm6
+%endif
+    RET
+%endmacro
+INIT_MMX
+IDCT_DC_DEQUANT mmx, 0
+IDCT_DC_DEQUANT sse2, 7
@@ -59,6 +59,8 @@ void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM
                                   int stride, const uint8_t nnzc[6*8]);
 void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTELEM *block,
                                   int stride, const uint8_t nnzc[6*8]);
+void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
+void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
 /***********************************/
 /* deblocking */
@@ -301,6 +303,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
         c->h264_idct8_add4 = ff_h264_idct8_add4_mmx;
         c->h264_idct_add8 = ff_h264_idct_add8_mmx;
         c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
+        c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;
         if (mm_flags & AV_CPU_FLAG_MMX2) {
             c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
@@ -341,6 +344,7 @@ void ff_h264dsp_init_x86(H264DSPContext *c)
         if (mm_flags&AV_CPU_FLAG_SSE2) {
            c->h264_idct8_add = ff_h264_idct8_add_sse2;
            c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
+            c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;