takes the most time, and it allows for more efficient unaligned access and better control over memory latencies. Originally committed as revision 711 to svn://svn.ffmpeg.org/ffmpeg/trunk (tag: v0.5)
@@ -22,6 +22,8 @@ | |||
void simple_idct_axp(DCTELEM *block); | |||
void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, | |||
int line_size, int h); | |||
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, | |||
int line_size); | |||
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, | |||
@@ -232,12 +234,12 @@ static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4) | |||
void dsputil_init_alpha(void) | |||
{ | |||
put_pixels_tab[0] = put_pixels_axp; | |||
put_pixels_tab[0] = put_pixels_axp_asm; | |||
put_pixels_tab[1] = put_pixels_x2_axp; | |||
put_pixels_tab[2] = put_pixels_y2_axp; | |||
put_pixels_tab[3] = put_pixels_xy2_axp; | |||
put_no_rnd_pixels_tab[0] = put_pixels_axp; | |||
put_no_rnd_pixels_tab[0] = put_pixels_axp_asm; | |||
put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp; | |||
put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp; | |||
put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp; | |||
@@ -43,6 +43,123 @@ | |||
.arch pca56 | |||
.text | |||
/************************************************************************
 * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
 *                         int line_size, int h)
 *
 * Copy h rows of 8 pixels (one quadword per row) from "pixels" to
 * "block", advancing each pointer by line_size bytes per row.
 *
 * Register roles (OSF/1 Alpha calling convention):
 *   a0 = block      destination; written with aligned stq, so it is
 *                   assumed 8-byte aligned — TODO confirm vs. callers
 *   a1 = pixels     source; may be misaligned (checked on entry)
 *   a2 = line_size  row stride in bytes
 *   a3 = h          remaining row count
 *   t0-t7           row data in flight
 *   t8,t9,ta-te     precomputed destination row addresses (ta..te are
 *                   presumably aliases for additional temporaries
 *                   defined elsewhere in this file — TODO confirm)
 *
 * NOTE(review): the unaligned loop retires 4 rows per iteration and the
 * aligned loop 8, so h is assumed to be a multiple of 8 — confirm
 * against callers.
 * NOTE(review): in the unaligned loop the extql/extqh shift counts are
 * taken from a1 AFTER it has already been advanced past the row being
 * merged; that is only correct if line_size is a multiple of 8, so that
 * every row shares the same low three address bits — confirm.
 */
        .align 6
        .globl put_pixels_axp_asm
        .ent put_pixels_axp_asm
put_pixels_axp_asm:
        .frame sp, 0, ra                /* leaf: no stack frame, no saved regs */
        .prologue 0
#ifdef HAVE_GPROF
        lda AT, _mcount                 /* profiling hook when built with gprof */
        jsr AT, (AT), _mcount
#endif
        and a1, 7, t0                   /* t0 = pixels & 7: misaligned source? */
        beq t0, $aligned
        .align 4
$unaligned:                             /* 4 rows per iteration */
        /* Fetch the two quadwords covering each misaligned 8-byte row
           (ldq_u ignores the low three address bits). */
        ldq_u t0, 0(a1)
        ldq_u t1, 8(a1)
        addq a1, a2, a1                 /* pixels += line_size */
        nop                             /* issue-slot filler for dual issue */
        ldq_u t2, 0(a1)
        ldq_u t3, 8(a1)
        addq a1, a2, a1
        nop
        ldq_u t4, 0(a1)
        ldq_u t5, 8(a1)
        addq a1, a2, a1
        nop
        ldq_u t6, 0(a1)
        ldq_u t7, 8(a1)
        /* Merge each low/high pair into one aligned quadword.  The
           extract shift counts come from a1's low bits (see NOTE in the
           header).  Destination row addresses are chained in parallel. */
        extql t0, a1, t0
        addq a1, a2, a1
        extqh t1, a1, t1
        addq a0, a2, t8                 /* t8 = dest row 1 */
        extql t2, a1, t2
        addq t8, a2, t9                 /* t9 = dest row 2 */
        extqh t3, a1, t3
        addq t9, a2, ta                 /* ta = dest row 3 */
        extql t4, a1, t4
        or t0, t1, t0                   /* row 0 assembled */
        extqh t5, a1, t5
        or t2, t3, t2                   /* row 1 assembled */
        extql t6, a1, t6
        or t4, t5, t4                   /* row 2 assembled */
        extqh t7, a1, t7
        or t6, t7, t6                   /* row 3 assembled */
        stq t0, 0(a0)
        stq t2, 0(t8)
        stq t4, 0(t9)
        subq a3, 4, a3                  /* h -= 4 */
        stq t6, 0(ta)
        addq ta, a2, a0                 /* block = dest row 4 (next iteration) */
        bne a3, $unaligned
        ret
        .align 4
$aligned:                               /* 8 rows per iteration */
        /* Source is 8-byte aligned: plain ldq/stq copies.  All eight
           loads are issued first to hide memory latency. */
        ldq t0, 0(a1)
        addq a1, a2, a1                 /* pixels += line_size */
        ldq t1, 0(a1)
        addq a1, a2, a1
        ldq t2, 0(a1)
        addq a1, a2, a1
        ldq t3, 0(a1)
        addq a1, a2, a1
        ldq t4, 0(a1)
        addq a1, a2, a1
        ldq t5, 0(a1)
        addq a1, a2, a1
        ldq t6, 0(a1)
        addq a1, a2, a1
        ldq t7, 0(a1)
        addq a1, a2, a1
        /* Chain the eight destination row addresses, interleaving the
           stores with the address computations. */
        addq a0, a2, t8                 /* t8 = dest row 1 */
        stq t0, 0(a0)
        addq t8, a2, t9
        stq t1, 0(t8)
        addq t9, a2, ta
        stq t2, 0(t9)
        addq ta, a2, tb
        stq t3, 0(ta)
        addq tb, a2, tc
        stq t4, 0(tb)
        addq tc, a2, td
        stq t5, 0(tc)
        addq td, a2, te
        stq t6, 0(td)
        addq te, a2, a0                 /* block = dest row 8 (next iteration) */
        stq t7, 0(te)
        subq a3, 8, a3                  /* h -= 8 */
        bne a3, $aligned
        ret
        .end put_pixels_axp_asm
/************************************************************************ | |||
* void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, | |||
* int line_size) | |||