Browse Source

h264: special case dc-only idct. ~1% faster overall

Originally committed as revision 4971 to svn://svn.ffmpeg.org/ffmpeg/trunk
tags/v0.5
Loren Merritt 19 years ago
parent
commit
ef9d1d1575
6 changed files with 173 additions and 38 deletions
  1. +2
    -0
      libavcodec/dsputil.c
  2. +4
    -0
      libavcodec/dsputil.h
  3. +59
    -38
      libavcodec/h264.c
  4. +25
    -0
      libavcodec/h264idct.c
  5. +2
    -0
      libavcodec/i386/dsputil_mmx.c
  6. +81
    -0
      libavcodec/i386/h264dsp_mmx.c

+ 2
- 0
libavcodec/dsputil.c View File

@@ -3851,6 +3851,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)


c->h264_idct_add= ff_h264_idct_add_c; c->h264_idct_add= ff_h264_idct_add_c;
c->h264_idct8_add= ff_h264_idct8_add_c; c->h264_idct8_add= ff_h264_idct8_add_c;
c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;


c->get_pixels = get_pixels_c; c->get_pixels = get_pixels_c;
c->diff_pixels = diff_pixels_c; c->diff_pixels = diff_pixels_c;


+ 4
- 0
libavcodec/dsputil.h View File

@@ -52,6 +52,8 @@ void ff_fdct_sse2(DCTELEM *block);


void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride); void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block); void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block); void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);


@@ -330,6 +332,8 @@ typedef struct DSPContext {


void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride); void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride);
void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride); void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);
void (*h264_idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
void (*h264_idct8_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
} DSPContext; } DSPContext;


void dsputil_static_init(void); void dsputil_static_init(void);


+ 59
- 38
libavcodec/h264.c View File

@@ -3314,6 +3314,7 @@ static void hl_decode_mb(H264Context *h){
const unsigned int bottom = mb_y & 1; const unsigned int bottom = mb_y & 1;
const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass); const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride); void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);


if(!s->decode) if(!s->decode)
return; return;
@@ -3337,9 +3338,16 @@ static void hl_decode_mb(H264Context *h){
// dct_offset = s->linesize * 16; // dct_offset = s->linesize * 16;
} }


idct_add = transform_bypass
? IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4
: IS_8x8DCT(mb_type) ? s->dsp.h264_idct8_add : s->dsp.h264_idct_add;
if(transform_bypass){
idct_dc_add =
idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
}else if(IS_8x8DCT(mb_type)){
idct_dc_add = s->dsp.h264_idct8_dc_add;
idct_add = s->dsp.h264_idct8_add;
}else{
idct_dc_add = s->dsp.h264_idct_dc_add;
idct_add = s->dsp.h264_idct_add;
}


if (IS_INTRA_PCM(mb_type)) { if (IS_INTRA_PCM(mb_type)) {
unsigned int x, y; unsigned int x, y;
@@ -3389,17 +3397,22 @@ static void hl_decode_mb(H264Context *h){
for(i=0; i<16; i+=4){ for(i=0; i<16; i+=4){
uint8_t * const ptr= dest_y + block_offset[i]; uint8_t * const ptr= dest_y + block_offset[i];
const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
const int nnz = h->non_zero_count_cache[ scan8[i] ];
h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000, h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
(h->topright_samples_available<<(i+1))&0x8000, linesize); (h->topright_samples_available<<(i+1))&0x8000, linesize);
if(h->non_zero_count_cache[ scan8[i] ])
idct_add(ptr, h->mb + i*16, linesize);
if(nnz){
if(nnz == 1 && h->mb[i*16])
idct_dc_add(ptr, h->mb + i*16, linesize);
else
idct_add(ptr, h->mb + i*16, linesize);
}
} }
}else }else
for(i=0; i<16; i++){ for(i=0; i<16; i++){
uint8_t * const ptr= dest_y + block_offset[i]; uint8_t * const ptr= dest_y + block_offset[i];
uint8_t *topright; uint8_t *topright;
const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ]; const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
int tr;
int nnz, tr;


if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){ if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
const int topright_avail= (h->topright_samples_available<<i)&0x8000; const int topright_avail= (h->topright_samples_available<<i)&0x8000;
@@ -3413,10 +3426,14 @@ static void hl_decode_mb(H264Context *h){
topright= NULL; topright= NULL;


h->pred4x4[ dir ](ptr, topright, linesize); h->pred4x4[ dir ](ptr, topright, linesize);
if(h->non_zero_count_cache[ scan8[i] ]){
if(s->codec_id == CODEC_ID_H264)
idct_add(ptr, h->mb + i*16, linesize);
else
nnz = h->non_zero_count_cache[ scan8[i] ];
if(nnz){
if(s->codec_id == CODEC_ID_H264){
if(nnz == 1 && h->mb[i*16])
idct_dc_add(ptr, h->mb + i*16, linesize);
else
idct_add(ptr, h->mb + i*16, linesize);
}else
svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0); svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
} }
} }
@@ -3453,11 +3470,23 @@ static void hl_decode_mb(H264Context *h){


if(!IS_INTRA4x4(mb_type)){ if(!IS_INTRA4x4(mb_type)){
if(s->codec_id == CODEC_ID_H264){ if(s->codec_id == CODEC_ID_H264){
const int di = IS_8x8DCT(mb_type) ? 4 : 1;
for(i=0; i<16; i+=di){
if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
uint8_t * const ptr= dest_y + block_offset[i];
idct_add(ptr, h->mb + i*16, linesize);
if(IS_INTRA16x16(mb_type)){
for(i=0; i<16; i++){
if(h->non_zero_count_cache[ scan8[i] ])
idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
else if(h->mb[i*16])
idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
}
}else{
const int di = IS_8x8DCT(mb_type) ? 4 : 1;
for(i=0; i<16; i+=di){
int nnz = h->non_zero_count_cache[ scan8[i] ];
if(nnz){
if(nnz==1 && h->mb[i*16])
idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
else
idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
}
} }
} }
}else{ }else{
@@ -3471,34 +3500,26 @@ static void hl_decode_mb(H264Context *h){
} }


if(!(s->flags&CODEC_FLAG_GRAY)){ if(!(s->flags&CODEC_FLAG_GRAY)){
idct_add = transform_bypass ? s->dsp.add_pixels4 : s->dsp.h264_idct_add;
if(!transform_bypass){
uint8_t *dest[2] = {dest_cb, dest_cr};
if(transform_bypass){
idct_add = idct_dc_add = s->dsp.add_pixels4;
}else{
idct_add = s->dsp.h264_idct_add;
idct_dc_add = s->dsp.h264_idct_dc_add;
chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]); chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]); chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
} }
if(s->codec_id == CODEC_ID_H264){ if(s->codec_id == CODEC_ID_H264){
for(i=16; i<16+4; i++){
if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
uint8_t * const ptr= dest_cb + block_offset[i];
idct_add(ptr, h->mb + i*16, uvlinesize);
}
}
for(i=20; i<20+4; i++){
if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
uint8_t * const ptr= dest_cr + block_offset[i];
idct_add(ptr, h->mb + i*16, uvlinesize);
}
for(i=16; i<16+8; i++){
if(h->non_zero_count_cache[ scan8[i] ])
idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
else if(h->mb[i*16])
idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
} }
}else{ }else{
for(i=16; i<16+4; i++){
if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
uint8_t * const ptr= dest_cb + block_offset[i];
svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
}
}
for(i=20; i<20+4; i++){
for(i=16; i<16+8; i++){
if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
uint8_t * const ptr= dest_cr + block_offset[i];
uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2); svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
} }
} }
@@ -5131,7 +5152,7 @@ decode_intra_mb:
return -1; return -1;
} }
nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ]; nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
nnz[0] |= nnz[1] | nnz[8] | nnz[9];
nnz[0] += nnz[1] + nnz[8] + nnz[9];
}else{ }else{
for(i4x4=0; i4x4<4; i4x4++){ for(i4x4=0; i4x4<4; i4x4++){
const int index= i4x4 + 4*i8x8; const int index= i4x4 + 4*i8x8;
@@ -5690,7 +5711,7 @@ static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n
h->non_zero_count_cache[scan8[16+n]] = coeff_count; h->non_zero_count_cache[scan8[16+n]] = coeff_count;
else { else {
assert( cat == 5 ); assert( cat == 5 );
fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, 1, 1);
fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
} }


for( i = coeff_count - 1; i >= 0; i-- ) { for( i = coeff_count - 1; i >= 0; i-- ) {


+ 25
- 0
libavcodec/h264idct.c View File

@@ -139,3 +139,28 @@ void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride){
dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ]; dst[i + 7*stride] = cm[ dst[i + 7*stride] + ((b0 - b7) >> 6) ];
} }
} }

// assumes all AC coefficients are 0; only block[0] (the DC term) is read
/**
 * DC-only shortcut for the 4x4 inverse transform + add.
 *
 * Only block[0] is consulted; it is rounded and scaled as
 * (block[0] + 32) >> 6 and that single value is added to each of the
 * 16 pixels of the 4x4 area starting at dst. Every sum is mapped through
 * the cropTbl lookup (biased by MAX_NEG_CROP) before being stored.
 *
 * @param dst    top-left pixel of the 4x4 destination area, updated in place
 * @param block  coefficient block; only element 0 is read, nothing is written
 * @param stride distance in bytes between the starts of consecutive rows
 */
void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
    const int dc_offset = (block[0] + 32) >> 6;
    uint8_t *const clip = cropTbl + MAX_NEG_CROP;
    uint8_t *row = dst;
    int rows_left;

    /* One pass per row; the four columns are written out explicitly. */
    for (rows_left = 4; rows_left > 0; rows_left--) {
        row[0] = clip[row[0] + dc_offset];
        row[1] = clip[row[1] + dc_offset];
        row[2] = clip[row[2] + dc_offset];
        row[3] = clip[row[3] + dc_offset];
        row += stride;
    }
}

/**
 * DC-only shortcut for the 8x8 inverse transform + add.
 *
 * Only block[0] is consulted; it is rounded and scaled as
 * (block[0] + 32) >> 6 and that single value is added to each of the
 * 64 pixels of the 8x8 area starting at dst. Every sum is mapped through
 * the cropTbl lookup (biased by MAX_NEG_CROP) before being stored.
 *
 * @param dst    top-left pixel of the 8x8 destination area, updated in place
 * @param block  coefficient block; only element 0 is read, nothing is written
 * @param stride distance in bytes between the starts of consecutive rows
 */
void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){
    const int dc_offset = (block[0] + 32) >> 6;
    uint8_t *const clip = cropTbl + MAX_NEG_CROP;
    uint8_t *row = dst;
    int y;

    for (y = 0; y < 8; y++) {
        uint8_t *px = row;
        uint8_t *const row_end = row + 8;

        /* Walk the eight pixels of this row left to right. */
        while (px < row_end) {
            *px = clip[*px + dc_offset];
            px++;
        }
        row += stride;
    }
}

+ 2
- 0
libavcodec/i386/dsputil_mmx.c View File

@@ -2754,6 +2754,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#endif //CONFIG_ENCODERS #endif //CONFIG_ENCODERS


c->h264_idct_add= ff_h264_idct_add_mmx2; c->h264_idct_add= ff_h264_idct_add_mmx2;
c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2;


if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;


+ 81
- 0
libavcodec/i386/h264dsp_mmx.c View File

@@ -104,6 +104,87 @@ void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride)
); );
} }


/*
 * DC-only 4x4 add using MMX/MMX2 instructions.
 *
 * Reads only block[0], computes dc = (block[0] + 32) >> 6 and adds that value
 * to the 4 bytes at the start of each of the 4 rows at dst, dst+stride,
 * dst+2*stride and dst+3*stride, saturating each byte to 0..255.
 *
 * A signed add is built from two unsigned saturating byte operations:
 * mm0 holds max(dc,0) and mm1 holds max(-dc,0), each packed to bytes; every
 * pixel then gets "+ mm0" (paddusb) followed by "- mm1" (psubusb). One of the
 * two constants is always zero.
 *
 * NOTE(review): mm0/mm1 are produced by the first asm statement and consumed
 * by the second without being listed as operands or clobbers, so this relies
 * on the compiler not touching MMX registers in between.
 * NOTE(review): no emms is issued here; presumably the caller is responsible
 * for restoring FPU state -- confirm.
 */
void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
int dc = (block[0] + 32) >> 6;
/* Build the two per-byte constants: mm0 = +dc part, mm1 = -dc part. */
asm volatile(
"movd %0, %%mm0 \n\t"
"pxor %%mm7, %%mm7 \n\t"
/* broadcast the low 16-bit word of dc to all four words of mm0 */
"pshufw $0, %%mm0, %%mm0 \n\t"
/* mm1 = 0 - mm0 = -dc in every word */
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
/* clamp both at zero (mm7 == 0): mm0 = max(dc,0), mm1 = max(-dc,0) */
"pmaxsw %%mm7, %%mm0 \n\t"
"pmaxsw %%mm7, %%mm1 \n\t"
/* pack words to unsigned bytes with saturation, replicated across the register */
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
/* Apply the constants to four rows of four pixels each. */
asm volatile(
"movd %0, %%mm2 \n\t"
"movd %1, %%mm3 \n\t"
"movd %2, %%mm4 \n\t"
"movd %3, %%mm5 \n\t"
/* unsigned saturating add of the positive part */
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
/* unsigned saturating subtract of the negative part */
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
/* store only the low 32 bits (4 pixels) of each row back */
"movd %%mm2, %0 \n\t"
"movd %%mm3, %1 \n\t"
"movd %%mm4, %2 \n\t"
"movd %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dst+0*stride)),
"+m"(*(uint32_t*)(dst+1*stride)),
"+m"(*(uint32_t*)(dst+2*stride)),
"+m"(*(uint32_t*)(dst+3*stride))
);
}

/*
 * DC-only 8x8 add using MMX/MMX2 instructions.
 *
 * Reads only block[0], computes dc = (block[0] + 32) >> 6 and adds that value
 * to the 8 bytes at the start of each of 8 consecutive rows beginning at dst
 * (rows are stride bytes apart), saturating each byte to 0..255.
 *
 * The signed add is split into an unsigned saturating add of max(dc,0) (mm0)
 * and an unsigned saturating subtract of max(-dc,0) (mm1); one of the two is
 * always zero.
 *
 * NOTE(review): mm0/mm1 are produced by the first asm statement and consumed
 * inside the loop without being listed as operands or clobbers, so this relies
 * on the compiler not touching MMX registers in between.
 * NOTE(review): no emms is issued here; presumably the caller is responsible
 * for restoring FPU state -- confirm.
 */
void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
int dc = (block[0] + 32) >> 6;
int y;
/* Build the two per-byte constants: mm0 = +dc part, mm1 = -dc part. */
asm volatile(
"movd %0, %%mm0 \n\t"
"pxor %%mm7, %%mm7 \n\t"
/* broadcast the low 16-bit word of dc to all four words of mm0 */
"pshufw $0, %%mm0, %%mm0 \n\t"
/* mm1 = 0 - mm0 = -dc in every word */
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
/* clamp both at zero (mm7 == 0): mm0 = max(dc,0), mm1 = max(-dc,0) */
"pmaxsw %%mm7, %%mm0 \n\t"
"pmaxsw %%mm7, %%mm1 \n\t"
/* pack words to unsigned bytes with saturation; all 8 bytes hold the constant */
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
/* Two iterations, each processing 4 rows of 8 pixels, cover the 8x8 area. */
for(y=2; y--; dst += 4*stride){
asm volatile(
"movq %0, %%mm2 \n\t"
"movq %1, %%mm3 \n\t"
"movq %2, %%mm4 \n\t"
"movq %3, %%mm5 \n\t"
/* unsigned saturating add of the positive part */
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
/* unsigned saturating subtract of the negative part */
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
/* write the full 8-byte rows back */
"movq %%mm2, %0 \n\t"
"movq %%mm3, %1 \n\t"
"movq %%mm4, %2 \n\t"
"movq %%mm5, %3 \n\t"
:"+m"(*(uint64_t*)(dst+0*stride)),
"+m"(*(uint64_t*)(dst+1*stride)),
"+m"(*(uint64_t*)(dst+2*stride)),
"+m"(*(uint64_t*)(dst+3*stride))
);
}
}



/***********************************/ /***********************************/
/* deblocking */ /* deblocking */


Loading…
Cancel
Save