Gives an average 13-20% MPEG decoding speedup on x86 systems.
Originally committed as revision 30 to svn://svn.ffmpeg.org/ffmpeg/trunk (tag: v0.5)
@@ -21,6 +21,7 @@ | |||||
#include "avcodec.h" | #include "avcodec.h" | ||||
#include "dsputil.h" | #include "dsputil.h" | ||||
void (*ff_idct)(DCTELEM *block); | |||||
void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size); | void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size); | ||||
void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); | void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); | ||||
void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); | void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); | ||||
@@ -363,6 +364,7 @@ void dsputil_init(void) | |||||
squareTbl[i] = (i - 256) * (i - 256); | squareTbl[i] = (i - 256) * (i - 256); | ||||
} | } | ||||
ff_idct = j_rev_dct; | |||||
get_pixels = get_pixels_c; | get_pixels = get_pixels_c; | ||||
put_pixels_clamped = put_pixels_clamped_c; | put_pixels_clamped = put_pixels_clamped_c; | ||||
add_pixels_clamped = add_pixels_clamped_c; | add_pixels_clamped = add_pixels_clamped_c; | ||||
@@ -25,6 +25,7 @@ void dsputil_init(void); | |||||
/* pixel ops : interface with DCT */ | /* pixel ops : interface with DCT */ | ||||
extern void (*ff_idct)(DCTELEM *block); | |||||
extern void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size); | extern void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size); | ||||
extern void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); | extern void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); | ||||
extern void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); | extern void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size); | ||||
@@ -29,6 +29,16 @@ int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | |||||
int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | ||||
int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); | ||||
#ifdef USE_MMX_IDCT | |||||
/* external functions, defined in libmpeg2 */ | |||||
void mmx_idct(DCTELEM *block); | |||||
void mmxext_idct(DCTELEM *block); | |||||
/* this should be in dsputil.h? -- A'rpi */ | |||||
extern UINT8 ff_alternate_horizontal_scan[64]; | |||||
extern UINT8 ff_alternate_vertical_scan[64]; | |||||
extern UINT8 zigzag_direct[64]; | |||||
#endif | |||||
/* pixel operations */ | /* pixel operations */ | ||||
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; | static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; | ||||
static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; | static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; | ||||
@@ -1039,5 +1049,23 @@ void dsputil_init_mmx(void) | |||||
sub_pixels_tab[1] = sub_pixels_x2_3dnow; | sub_pixels_tab[1] = sub_pixels_x2_3dnow; | ||||
sub_pixels_tab[2] = sub_pixels_y2_3dnow; | sub_pixels_tab[2] = sub_pixels_y2_3dnow; | ||||
} | } | ||||
#ifdef USE_MMX_IDCT | |||||
/* use MMX / MMXEXT iDCT code from libmpeg2 */ | |||||
//printf("LIBAVCODEC: Using MMX%s iDCT code\n",(mm_flags & MM_MMXEXT)?"EXT":""); | |||||
ff_idct = (mm_flags & MM_MMXEXT) ? mmxext_idct : mmx_idct; | |||||
/* the mmx/mmxext idct uses a reordered input, so we patch scan tables */ | |||||
{ int i,j; | |||||
for (i = 0; i < 64; i++) { | |||||
j = zigzag_direct[i]; | |||||
zigzag_direct[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); | |||||
j = ff_alternate_horizontal_scan[i]; | |||||
ff_alternate_horizontal_scan[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); | |||||
j = ff_alternate_vertical_scan[i]; | |||||
ff_alternate_vertical_scan[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2); | |||||
} | |||||
} | |||||
#endif | |||||
} | } | ||||
} | } |
@@ -331,7 +331,8 @@ static const UINT8 mbMotionVectorTable[17][2] = { | |||||
{ 0xc, 10 }, | { 0xc, 10 }, | ||||
}; | }; | ||||
const UINT8 zigzag_direct[64] = { | |||||
//const | |||||
UINT8 zigzag_direct[64] = { | |||||
0, 1, 8, 16, 9, 2, 3, 10, | 0, 1, 8, 16, 9, 2, 3, 10, | ||||
17, 24, 32, 25, 18, 11, 4, 5, | 17, 24, 32, 25, 18, 11, 4, 5, | ||||
12, 19, 26, 33, 40, 48, 41, 34, | 12, 19, 26, 33, 40, 48, 41, 34, | ||||
@@ -634,7 +634,7 @@ static inline void put_dct(MpegEncContext *s, | |||||
{ | { | ||||
if (!s->mpeg2) | if (!s->mpeg2) | ||||
s->dct_unquantize(s, block, i, s->qscale); | s->dct_unquantize(s, block, i, s->qscale); | ||||
j_rev_dct (block); | |||||
ff_idct (block); | |||||
put_pixels_clamped(block, dest, line_size); | put_pixels_clamped(block, dest, line_size); | ||||
} | } | ||||
@@ -645,7 +645,7 @@ static inline void add_dct(MpegEncContext *s, | |||||
if (s->block_last_index[i] >= 0) { | if (s->block_last_index[i] >= 0) { | ||||
if (!s->mpeg2) | if (!s->mpeg2) | ||||
s->dct_unquantize(s, block, i, s->qscale); | s->dct_unquantize(s, block, i, s->qscale); | ||||
j_rev_dct (block); | |||||
ff_idct (block); | |||||
add_pixels_clamped(block, dest, line_size); | add_pixels_clamped(block, dest, line_size); | ||||
} | } | ||||
} | } | ||||
@@ -179,7 +179,8 @@ typedef struct MpegEncContext { | |||||
DCTELEM *block, int n, int qscale); | DCTELEM *block, int n, int qscale); | ||||
} MpegEncContext; | } MpegEncContext; | ||||
extern const UINT8 zigzag_direct[64]; | |||||
//const | |||||
extern UINT8 zigzag_direct[64]; | |||||
int MPV_common_init(MpegEncContext *s); | int MPV_common_init(MpegEncContext *s); | ||||
void MPV_common_end(MpegEncContext *s); | void MPV_common_end(MpegEncContext *s); | ||||