|
|
@@ -420,6 +420,44 @@ static void clear_blocks_mmx(DCTELEM *blocks) |
|
|
|
); |
|
|
|
} |
|
|
|
|
|
|
|
/**
 * Sum every byte of a 16x16 pixel block using MMX.
 *
 * @param pix        pointer to the first byte of the top row of the block
 * @param line_size  distance in bytes between the starts of consecutive rows;
 *                   must be positive (with a non-positive stride the loop
 *                   falls through after the first row)
 * @return           the sum of the 256 bytes, masked to 16 bits; the largest
 *                   possible sum, 256 * 255 = 65280, fits in that range
 *
 * No emms is issued here, so the MMX state is left active on return.
 * NOTE(review): presumably the caller is responsible for emms -- confirm.
 */
static int pix_sum16_mmx(UINT8 * pix, int line_size){
    const int h=16;
    int sum;
    /*
     * The row offset runs from -line_size*h up to 0 so that the loop can be
     * terminated by the sign flag alone.  It must be as wide as a pointer
     * because it is used as the index register of an address expression;
     * long matches the pointer width on ILP32 and LP64 targets.
     */
    long index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7              \n\t" /* mm7 = 0, zero source for unpacking */
        "pxor %%mm6, %%mm6              \n\t" /* mm6 = four 16-bit running sums */
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t" /* bytes 0..7 of the row (twice) */
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t" /* bytes 8..15 of the row (twice) */
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t" /* zero-extend bytes 0..3 to words */
        "punpckhbw %%mm7, %%mm1         \n\t" /* zero-extend bytes 4..7 to words */
        "punpcklbw %%mm7, %%mm2         \n\t" /* zero-extend bytes 8..11 to words */
        "punpckhbw %%mm7, %%mm3         \n\t" /* zero-extend bytes 12..15 to words */
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t" /* four partial sums for this row */
        /* Each word lane gains at most 4*255 per row, 16320 over 16 rows,
           so the 16-bit lanes cannot overflow. */
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t" /* advance to the next row */
        " js 1b                         \n\t" /* loop while the offset is negative */
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t" /* fold the upper two lanes onto the lower two */
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t" /* lane 0 now holds the total */
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t" /* discard the other lane in bits 16..31 */
        : "=&r" (sum), "+r" (index)
        /* Base is biased by -index so that base + index starts at pix. */
        : "r" (pix - index), "r" ((long)line_size)
        /* The pixel data is read through a pointer that is not an operand. */
        : "memory"
    );

    return sum;
}
|
|
|
|
|
|
|
#if 0 |
|
|
|
/* Does nothing; compiled out by the enclosing #if 0. */
static void just_return()
{
}
|
|
|
#endif |
|
|
@@ -448,6 +486,7 @@ void dsputil_init_mmx(void) |
|
|
|
put_pixels_clamped = put_pixels_clamped_mmx; |
|
|
|
add_pixels_clamped = add_pixels_clamped_mmx; |
|
|
|
clear_blocks= clear_blocks_mmx; |
|
|
|
pix_sum= pix_sum16_mmx; |
|
|
|
|
|
|
|
pix_abs16x16 = pix_abs16x16_mmx; |
|
|
|
pix_abs16x16_x2 = pix_abs16x16_x2_mmx; |
|
|
|