takes the most time, and it allows for more efficient unaligned access and better control over memory latencies. Originally committed as revision 711 to svn://svn.ffmpeg.org/ffmpeg/trunk tags/v0.5
@@ -22,6 +22,8 @@ | |||||
void simple_idct_axp(DCTELEM *block); | void simple_idct_axp(DCTELEM *block); | ||||
void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, | |||||
int line_size, int h); | |||||
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, | void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, | ||||
int line_size); | int line_size); | ||||
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, | void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, | ||||
@@ -232,12 +234,12 @@ static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) | |||||
void dsputil_init_alpha(void) | void dsputil_init_alpha(void) | ||||
{ | { | ||||
put_pixels_tab[0] = put_pixels_axp; | |||||
put_pixels_tab[0] = put_pixels_axp_asm; | |||||
put_pixels_tab[1] = put_pixels_x2_axp; | put_pixels_tab[1] = put_pixels_x2_axp; | ||||
put_pixels_tab[2] = put_pixels_y2_axp; | put_pixels_tab[2] = put_pixels_y2_axp; | ||||
put_pixels_tab[3] = put_pixels_xy2_axp; | put_pixels_tab[3] = put_pixels_xy2_axp; | ||||
put_no_rnd_pixels_tab[0] = put_pixels_axp; | |||||
put_no_rnd_pixels_tab[0] = put_pixels_axp_asm; | |||||
put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp; | put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp; | ||||
put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp; | put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp; | ||||
put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp; | put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp; | ||||
@@ -43,6 +43,123 @@ | |||||
.arch pca56 | .arch pca56 | ||||
.text | .text | ||||
/************************************************************************ | |||||
* void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, | |||||
* int line_size, int h) | |||||
*/ | |||||
/*
 * Copies one quadword (8 bytes) per row from the address in a1 to the
 * address in a0.  a2 is added to both addresses after every row and a3 is
 * the remaining-row counter.  Presumably a0..a3 carry block, pixels,
 * line_size and h in prototype order -- confirm against the calling
 * convention in use.
 *
 * Preconditions visible in the code:
 *  - every store is a plain stq, so each destination row address must be
 *    quadword aligned;
 *  - the alignment of a1 is tested once, on entry; the aligned path then
 *    uses plain ldq on every later row, so a2 has to preserve alignment;
 *  - the unaligned loop handles 4 rows per pass and the aligned loop 8, and
 *    both exit only when a3 reaches exactly zero, so the row count must be
 *    a positive multiple of 4 (unaligned) or 8 (aligned).
 *
 * NOTE(review): ta..te are not defined in this view; presumably they are
 * register aliases supplied by a header included above -- confirm.
 */
.align 6 | |||||
.globl put_pixels_axp_asm | |||||
.ent put_pixels_axp_asm | |||||
put_pixels_axp_asm: | |||||
.frame sp, 0, ra | |||||
.prologue 0 | |||||
/* Profiling hook: only assembled when HAVE_GPROF is defined. */
#ifdef HAVE_GPROF | |||||
lda AT, _mcount | |||||
jsr AT, (AT), _mcount | |||||
#endif | |||||
/* Low three bits of the source address select the aligned or unaligned loop. */
and a1, 7, t0 | |||||
beq t0, $aligned | |||||
.align 4 | |||||
$unaligned: | |||||
/*
 * For each of four rows, load the two quadwords that straddle the
 * unaligned source (offsets 0 and 8), then step a1 to the next row.
 * The nops are presumably issue-slot padding -- confirm before removing.
 */
ldq_u t0, 0(a1) | |||||
ldq_u t1, 8(a1) | |||||
addq a1, a2, a1 | |||||
nop | |||||
ldq_u t2, 0(a1) | |||||
ldq_u t3, 8(a1) | |||||
addq a1, a2, a1 | |||||
nop | |||||
ldq_u t4, 0(a1) | |||||
ldq_u t5, 8(a1) | |||||
addq a1, a2, a1 | |||||
nop | |||||
ldq_u t6, 0(a1) | |||||
ldq_u t7, 8(a1) | |||||
/*
 * extql/extqh take their byte offset from the low bits of a1, which by
 * now has been advanced by whole rows.  That offset matches the row that
 * was loaded only if a2 is a multiple of 8.
 * NOTE(review): confirm callers guarantee this.
 * The destination row addresses (t8, t9, ta) are computed in between the
 * extracts.
 */
extql t0, a1, t0 | |||||
addq a1, a2, a1 | |||||
extqh t1, a1, t1 | |||||
addq a0, a2, t8 | |||||
extql t2, a1, t2 | |||||
addq t8, a2, t9 | |||||
extqh t3, a1, t3 | |||||
addq t9, a2, ta | |||||
extql t4, a1, t4 | |||||
/* OR the low and high parts together to form each row's quadword. */
or t0, t1, t0 | |||||
extqh t5, a1, t5 | |||||
or t2, t3, t2 | |||||
extql t6, a1, t6 | |||||
or t4, t5, t4 | |||||
extqh t7, a1, t7 | |||||
or t6, t7, t6 | |||||
/* Store the four rows, count them off, and point a0 at the next row. */
stq t0, 0(a0) | |||||
stq t2, 0(t8) | |||||
stq t4, 0(t9) | |||||
subq a3, 4, a3 | |||||
stq t6, 0(ta) | |||||
addq ta, a2, a0 | |||||
bne a3, $unaligned | |||||
ret | |||||
.align 4 | |||||
$aligned: | |||||
/* Aligned source: load eight rows with plain ldq, stepping a1 by a2. */
ldq t0, 0(a1) | |||||
addq a1, a2, a1 | |||||
ldq t1, 0(a1) | |||||
addq a1, a2, a1 | |||||
ldq t2, 0(a1) | |||||
addq a1, a2, a1 | |||||
ldq t3, 0(a1) | |||||
addq a1, a2, a1 | |||||
ldq t4, 0(a1) | |||||
addq a1, a2, a1 | |||||
ldq t5, 0(a1) | |||||
addq a1, a2, a1 | |||||
ldq t6, 0(a1) | |||||
addq a1, a2, a1 | |||||
ldq t7, 0(a1) | |||||
addq a1, a2, a1 | |||||
/*
 * Store the eight rows; each destination address is derived from the
 * previous one, and the last addq leaves a0 at the row after the eighth.
 */
addq a0, a2, t8 | |||||
stq t0, 0(a0) | |||||
addq t8, a2, t9 | |||||
stq t1, 0(t8) | |||||
addq t9, a2, ta | |||||
stq t2, 0(t9) | |||||
addq ta, a2, tb | |||||
stq t3, 0(ta) | |||||
addq tb, a2, tc | |||||
stq t4, 0(tb) | |||||
addq tc, a2, td | |||||
stq t5, 0(tc) | |||||
addq td, a2, te | |||||
stq t6, 0(td) | |||||
addq te, a2, a0 | |||||
stq t7, 0(te) | |||||
/* Eight rows done; loop until the counter is exactly zero. */
subq a3, 8, a3 | |||||
bne a3, $aligned | |||||
ret | |||||
.end put_pixels_axp_asm | |||||
/************************************************************************ | /************************************************************************ | ||||
* void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, | * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, | ||||
* int line_size) | * int line_size) | ||||