thus avoiding the calling overhead. New functions are not yet used. Originally committed as revision 16206 to svn://svn.ffmpeg.org/ffmpeg/trunktags/v0.5
@@ -4281,6 +4281,10 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx) | |||
c->h264_idct8_add= ff_h264_idct8_add_c; | |||
c->h264_idct_dc_add= ff_h264_idct_dc_add_c; | |||
c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c; | |||
c->h264_idct_add16 = ff_h264_idct_add16_c; | |||
c->h264_idct8_add4 = ff_h264_idct8_add4_c; | |||
c->h264_idct_add8 = ff_h264_idct_add8_c; | |||
c->h264_idct_add16intra= ff_h264_idct_add16intra_c; | |||
} | |||
c->get_pixels = get_pixels_c; | |||
@@ -60,6 +60,10 @@ void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); | |||
void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride); | |||
void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block); | |||
void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block); | |||
/* Batched H.264 IDCT-add helpers (C versions): each call handles several
 * coefficient blocks at once. blockoffset supplies a per-block offset that is
 * added to the destination pointer, block holds the coefficients, and nnzc is
 * a 6*8 array of per-block values used to decide which blocks are processed.
 * ff_h264_idct_add8_c takes an array of destination pointers instead of one. */
void ff_h264_idct_add16_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); | |||
void ff_h264_idct_add16intra_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); | |||
void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); | |||
void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]); | |||
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, | |||
const float *src2, int src3, int blocksize, int step); | |||
@@ -441,6 +445,10 @@ typedef struct DSPContext { | |||
void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride); | |||
void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride); | |||
void (*h264_dct)(DCTELEM block[4][4]); | |||
void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]); | |||
/* snow wavelet */ | |||
void (*vertical_compose97i)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width); | |||
@@ -165,3 +165,55 @@ void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride){ | |||
dst += stride; | |||
} | |||
} | |||
//FIXME this table duplicates the one in h264data.h; it will be removed once the h264 tables have been split out
/**
 * Maps a block index (0..23) to an index into the 6*8 nnzc arrays used by the
 * batched IDCT helpers below. Each entry is written as column + row*8.
 * Entries 0..15 are used by the add16/add8x8 helpers, entries 16..23 by the
 * add8 helpers.
 */
#define SCAN8_POS(col, row) ((col) + (row)*8)
static const uint8_t scan8[16 + 2*4]={
    SCAN8_POS(4,1), SCAN8_POS(5,1), SCAN8_POS(4,2), SCAN8_POS(5,2),
    SCAN8_POS(6,1), SCAN8_POS(7,1), SCAN8_POS(6,2), SCAN8_POS(7,2),
    SCAN8_POS(4,3), SCAN8_POS(5,3), SCAN8_POS(4,4), SCAN8_POS(5,4),
    SCAN8_POS(6,3), SCAN8_POS(7,3), SCAN8_POS(6,4), SCAN8_POS(7,4),
    SCAN8_POS(1,1), SCAN8_POS(2,1),
    SCAN8_POS(1,2), SCAN8_POS(2,2),
    SCAN8_POS(1,4), SCAN8_POS(2,4),
    SCAN8_POS(1,5), SCAN8_POS(2,5),
};
#undef SCAN8_POS
/**
 * Runs the 4x4 IDCT-add over blocks 0..15 in one call.
 *
 * For every block whose nnzc entry (looked up through scan8) is non-zero, the
 * residual is added at dst + block_offset[i]. Coefficients of block i start
 * at block + i*16. When the nnzc entry is exactly 1 and the first coefficient
 * is non-zero, the DC-only routine is used; otherwise the full transform.
 *
 * @param dst          base destination pointer
 * @param block_offset per-block offset added to dst
 * @param block        coefficient storage, 16 entries per block
 * @param stride       passed through to the per-block routines
 * @param nnzc         per-block values, indexed via scan8; zero skips a block
 */
void ff_h264_idct_add16_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int blk;

    for(blk = 0; blk < 16; blk++){
        const int count = nnzc[ scan8[blk] ];

        if(count){
            uint8_t *target = dst + block_offset[blk];
            DCTELEM *coeffs = block + blk*16;

            if(count == 1 && coeffs[0]){
                ff_h264_idct_dc_add_c(target, coeffs, stride);
            }else{
                idct_internal(target, coeffs, stride, 4, 6, 1);
            }
        }
    }
}
/**
 * Intra variant of the batched 4x4 IDCT-add over blocks 0..15.
 *
 * A block with a non-zero nnzc entry (looked up through scan8) gets the full
 * transform. A block with a zero nnzc entry but a non-zero first coefficient
 * gets the DC-only routine. Anything else is left untouched.
 *
 * @param dst          base destination pointer
 * @param block_offset per-block offset added to dst
 * @param block        coefficient storage, 16 entries per block
 * @param stride       passed through to the per-block routines
 * @param nnzc         per-block values, indexed via scan8
 */
void ff_h264_idct_add16intra_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int blk;

    for(blk = 0; blk < 16; blk++){
        DCTELEM *coeffs = block + blk*16;

        if(nnzc[ scan8[blk] ]){
            idct_internal(dst + block_offset[blk], coeffs, stride, 4, 6, 1);
            continue;
        }
        if(coeffs[0]){
            ff_h264_idct_dc_add_c(dst + block_offset[blk], coeffs, stride);
        }
    }
}
/**
 * Runs the 8x8 IDCT-add over four blocks in one call.
 *
 * Block indices 0, 4, 8 and 12 are visited. Each uses the nnzc entry found
 * through scan8 at that index and the coefficients starting at
 * block + index*16. A zero nnzc entry skips the block. An entry of exactly 1
 * with a non-zero first coefficient selects the DC-only routine; otherwise
 * the full 8x8 transform is used.
 *
 * @param dst          base destination pointer
 * @param block_offset per-block offset added to dst
 * @param block        coefficient storage
 * @param stride       passed through to the per-block routines
 * @param nnzc         per-block values, indexed via scan8
 */
void ff_h264_idct8_add4_c(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int quad;

    for(quad = 0; quad < 4; quad++){
        const int idx   = 4*quad;
        const int count = nnzc[ scan8[idx] ];

        if(count){
            uint8_t *target = dst + block_offset[idx];
            DCTELEM *coeffs = block + idx*16;

            if(count == 1 && coeffs[0]){
                ff_h264_idct8_dc_add_c(target, coeffs, stride);
            }else{
                ff_h264_idct8_add_c(target, coeffs, stride);
            }
        }
    }
}
/**
 * Runs the 4x4 IDCT-add over blocks 16..23, split across two destinations.
 *
 * Blocks 16..19 are added to dest[0] and blocks 20..23 to dest[1]. A block
 * with a non-zero nnzc entry (looked up through scan8) gets the full
 * transform; one with a zero entry but a non-zero first coefficient gets the
 * DC-only routine.
 *
 * @param dest         array of two destination pointers
 * @param block_offset per-block offset added to the chosen destination
 * @param block        coefficient storage, 16 entries per block
 * @param stride       passed through to the per-block routines
 * @param nnzc         per-block values, indexed via scan8
 */
void ff_h264_idct_add8_c(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int plane, sub;

    for(plane = 0; plane < 2; plane++){
        for(sub = 0; sub < 4; sub++){
            const int idx   = 16 + 4*plane + sub;
            DCTELEM *coeffs = block + idx*16;

            if(nnzc[ scan8[idx] ]){
                ff_h264_idct_add_c(dest[plane] + block_offset[idx], coeffs, stride);
            }else if(coeffs[0]){
                ff_h264_idct_dc_add_c(dest[plane] + block_offset[idx], coeffs, stride);
            }
        }
    }
}
@@ -2629,8 +2629,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
c->h264_idct_add= ff_h264_idct_add_mmx; | |||
c->h264_idct8_dc_add= | |||
c->h264_idct8_add= ff_h264_idct8_add_mmx; | |||
if (mm_flags & FF_MM_SSE2) | |||
c->h264_idct8_add= ff_h264_idct8_add_sse2; | |||
c->h264_idct_add16 = ff_h264_idct_add16_mmx; | |||
c->h264_idct8_add4 = ff_h264_idct8_add4_mmx; | |||
c->h264_idct_add8 = ff_h264_idct_add8_mmx; | |||
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx; | |||
if (mm_flags & FF_MM_MMXEXT) { | |||
c->prefetch = prefetch_mmx2; | |||
@@ -2651,6 +2654,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; | |||
c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; | |||
c->h264_idct_add16 = ff_h264_idct_add16_mmx2; | |||
c->h264_idct8_add4 = ff_h264_idct8_add4_mmx2; | |||
c->h264_idct_add8 = ff_h264_idct_add8_mmx2; | |||
c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2; | |||
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ | |||
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; | |||
@@ -2807,6 +2814,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) | |||
H264_QPEL_FUNCS(0, 0, sse2); | |||
} | |||
if(mm_flags & FF_MM_SSE2){ | |||
c->h264_idct8_add = ff_h264_idct8_add_sse2; | |||
c->h264_idct8_add4= ff_h264_idct8_add4_sse2; | |||
H264_QPEL_FUNCS(0, 1, sse2); | |||
H264_QPEL_FUNCS(0, 2, sse2); | |||
H264_QPEL_FUNCS(0, 3, sse2); | |||
@@ -376,6 +376,101 @@ static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) | |||
} | |||
} | |||
//FIXME this table duplicates the one in h264data.h; it will be removed once the h264 tables have been split out
/**
 * Maps a block index (0..23) to an index into the 6*8 nnzc arrays used by the
 * batched IDCT helpers below. Each entry is written as column + row*8.
 * Entries 0..15 are used by the add16/add8x8 helpers, entries 16..23 by the
 * add8 helpers.
 */
#define SCAN8_POS(col, row) ((col) + (row)*8)
static const uint8_t scan8[16 + 2*4]={
    SCAN8_POS(4,1), SCAN8_POS(5,1), SCAN8_POS(4,2), SCAN8_POS(5,2),
    SCAN8_POS(6,1), SCAN8_POS(7,1), SCAN8_POS(6,2), SCAN8_POS(7,2),
    SCAN8_POS(4,3), SCAN8_POS(5,3), SCAN8_POS(4,4), SCAN8_POS(5,4),
    SCAN8_POS(6,3), SCAN8_POS(7,3), SCAN8_POS(6,4), SCAN8_POS(7,4),
    SCAN8_POS(1,1), SCAN8_POS(2,1),
    SCAN8_POS(1,2), SCAN8_POS(2,2),
    SCAN8_POS(1,4), SCAN8_POS(2,4),
    SCAN8_POS(1,5), SCAN8_POS(2,5),
};
#undef SCAN8_POS
/**
 * MMX batched 4x4 IDCT-add over blocks 0..15.
 *
 * Every block whose nnzc entry (looked up through scan8) is non-zero is passed
 * to ff_h264_idct_add_mmx with destination dst + block_offset[i] and
 * coefficients block + i*16. Blocks with a zero entry are skipped.
 */
static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int blk;

    for(blk = 0; blk < 16; blk++){
        if(!nnzc[ scan8[blk] ])
            continue;

        ff_h264_idct_add_mmx(dst + block_offset[blk], block + blk*16, stride);
    }
}
/**
 * MMX batched 8x8 IDCT-add over four blocks.
 *
 * Block indices 0, 4, 8 and 12 are visited. Each one whose nnzc entry (looked
 * up through scan8) is non-zero is passed to ff_h264_idct8_add_mmx with
 * destination dst + block_offset[index] and coefficients block + index*16.
 */
static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int quad;

    for(quad = 0; quad < 4; quad++){
        const int idx = 4*quad;

        if(!nnzc[ scan8[idx] ])
            continue;

        ff_h264_idct8_add_mmx(dst + block_offset[idx], block + idx*16, stride);
    }
}
/**
 * MMX2 batched 4x4 IDCT-add over blocks 0..15.
 *
 * Blocks with a zero nnzc entry (looked up through scan8) are skipped. An
 * entry of exactly 1 together with a non-zero first coefficient selects the
 * MMX2 DC-only routine; every other non-zero entry uses the full MMX
 * transform.
 */
static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int blk;

    for(blk = 0; blk < 16; blk++){
        const int count = nnzc[ scan8[blk] ];

        if(count){
            uint8_t *target = dst + block_offset[blk];
            DCTELEM *coeffs = block + blk*16;

            if(count == 1 && coeffs[0]){
                ff_h264_idct_dc_add_mmx2(target, coeffs, stride);
            }else{
                ff_h264_idct_add_mmx(target, coeffs, stride);
            }
        }
    }
}
/**
 * MMX intra variant of the batched 4x4 IDCT-add over blocks 0..15.
 *
 * A block is transformed with ff_h264_idct_add_mmx when either its nnzc entry
 * (looked up through scan8) or its first coefficient is non-zero. A block
 * where both are zero is skipped.
 */
static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int blk;

    for(blk = 0; blk < 16; blk++){
        /* nothing to add when both the nnzc entry and the first coefficient are zero */
        if(!nnzc[ scan8[blk] ] && !block[blk*16])
            continue;

        ff_h264_idct_add_mmx(dst + block_offset[blk], block + blk*16, stride);
    }
}
/**
 * MMX2 intra variant of the batched 4x4 IDCT-add over blocks 0..15.
 *
 * A non-zero nnzc entry (looked up through scan8) selects the full MMX
 * transform. A zero entry with a non-zero first coefficient selects the MMX2
 * DC-only routine. Otherwise the block is skipped.
 */
static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int blk;

    for(blk = 0; blk < 16; blk++){
        DCTELEM *coeffs = block + blk*16;

        if(nnzc[ scan8[blk] ]){
            ff_h264_idct_add_mmx(dst + block_offset[blk], coeffs, stride);
            continue;
        }
        if(coeffs[0]){
            ff_h264_idct_dc_add_mmx2(dst + block_offset[blk], coeffs, stride);
        }
    }
}
/**
 * MMX2 batched 8x8 IDCT-add over four blocks.
 *
 * Block indices 0, 4, 8 and 12 are visited. A zero nnzc entry (looked up
 * through scan8) skips the block. An entry of exactly 1 with a non-zero first
 * coefficient selects the MMX2 DC-only routine; otherwise the full MMX 8x8
 * transform is used.
 */
static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int quad;

    for(quad = 0; quad < 4; quad++){
        const int idx   = 4*quad;
        const int count = nnzc[ scan8[idx] ];

        if(count){
            uint8_t *target = dst + block_offset[idx];
            DCTELEM *coeffs = block + idx*16;

            if(count == 1 && coeffs[0]){
                ff_h264_idct8_dc_add_mmx2(target, coeffs, stride);
            }else{
                ff_h264_idct8_add_mmx(target, coeffs, stride);
            }
        }
    }
}
/**
 * SSE2 batched 8x8 IDCT-add over four blocks.
 *
 * Block indices 0, 4, 8 and 12 are visited. A zero nnzc entry (looked up
 * through scan8) skips the block. An entry of exactly 1 with a non-zero first
 * coefficient selects the MMX2 DC-only routine; otherwise the full SSE2 8x8
 * transform is used.
 */
static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int quad;

    for(quad = 0; quad < 4; quad++){
        const int idx   = 4*quad;
        const int count = nnzc[ scan8[idx] ];

        if(count){
            uint8_t *target = dst + block_offset[idx];
            DCTELEM *coeffs = block + idx*16;

            if(count == 1 && coeffs[0]){
                ff_h264_idct8_dc_add_mmx2(target, coeffs, stride);
            }else{
                ff_h264_idct8_add_sse2(target, coeffs, stride);
            }
        }
    }
}
/**
 * MMX batched 4x4 IDCT-add over blocks 16..23, split across two destinations.
 *
 * Blocks 16..19 are added to dest[0] and blocks 20..23 to dest[1]. A block is
 * transformed with ff_h264_idct_add_mmx when either its nnzc entry (looked up
 * through scan8) or its first coefficient is non-zero.
 */
static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int plane, sub;

    for(plane = 0; plane < 2; plane++){
        for(sub = 0; sub < 4; sub++){
            const int idx = 16 + 4*plane + sub;

            /* nothing to add when both the nnzc entry and the first coefficient are zero */
            if(!nnzc[ scan8[idx] ] && !block[idx*16])
                continue;

            ff_h264_idct_add_mmx(dest[plane] + block_offset[idx], block + idx*16, stride);
        }
    }
}
/**
 * MMX2 batched 4x4 IDCT-add over blocks 16..23, split across two destinations.
 *
 * Blocks 16..19 are added to dest[0] and blocks 20..23 to dest[1]. A non-zero
 * nnzc entry (looked up through scan8) selects the full MMX transform; a zero
 * entry with a non-zero first coefficient selects the MMX2 DC-only routine.
 */
static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
    int plane, sub;

    for(plane = 0; plane < 2; plane++){
        for(sub = 0; sub < 4; sub++){
            const int idx   = 16 + 4*plane + sub;
            DCTELEM *coeffs = block + idx*16;

            if(nnzc[ scan8[idx] ]){
                ff_h264_idct_add_mmx(dest[plane] + block_offset[idx], coeffs, stride);
            }else if(coeffs[0]){
                ff_h264_idct_dc_add_mmx2(dest[plane] + block_offset[idx], coeffs, stride);
            }
        }
    }
}
/***********************************/ | |||
/* deblocking */ | |||