|
|
|
@@ -1306,42 +1306,43 @@ int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t
POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
  int sum;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
  {
    const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
#ifdef CONFIG_DARWIN
    const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
    const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
    const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
    const_vector unsigned char perm1 = (const_vector unsigned char)
    register const_vector signed short vprod1 = (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
    register const_vector signed short vprod2 = (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
    register const_vector signed short vprod3 = (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
    register const_vector unsigned char perm1 = (const_vector unsigned char)
      (0x02, 0x03, 0x00, 0x01,
       0x06, 0x07, 0x04, 0x05,
       0x0A, 0x0B, 0x08, 0x09,
       0x0E, 0x0F, 0x0C, 0x0D);
    const_vector unsigned char perm2 = (const_vector unsigned char)
    register const_vector unsigned char perm2 = (const_vector unsigned char)
      (0x04, 0x05, 0x06, 0x07,
       0x00, 0x01, 0x02, 0x03,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x08, 0x09, 0x0A, 0x0B);
    const_vector unsigned char perm3 = (const_vector unsigned char)
    register const_vector unsigned char perm3 = (const_vector unsigned char)
      (0x08, 0x09, 0x0A, 0x0B,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x00, 0x01, 0x02, 0x03,
       0x04, 0x05, 0x06, 0x07);
#else
    const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
    const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
    const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
    const_vector unsigned char perm1 = (const_vector unsigned char)
    register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
    register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
    register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
    register const_vector unsigned char perm1 = (const_vector unsigned char)
      {0x02, 0x03, 0x00, 0x01,
       0x06, 0x07, 0x04, 0x05,
       0x0A, 0x0B, 0x08, 0x09,
       0x0E, 0x0F, 0x0C, 0x0D};
    const_vector unsigned char perm2 = (const_vector unsigned char)
    register const_vector unsigned char perm2 = (const_vector unsigned char)
      {0x04, 0x05, 0x06, 0x07,
       0x00, 0x01, 0x02, 0x03,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x08, 0x09, 0x0A, 0x0B};
    const_vector unsigned char perm3 = (const_vector unsigned char)
    register const_vector unsigned char perm3 = (const_vector unsigned char)
      {0x08, 0x09, 0x0A, 0x0B,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x00, 0x01, 0x02, 0x03,
@@ -1350,8 +1351,8 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);

#define ONEITERBUTTERFLY(i, res) \
    { \
    vector unsigned char src1, src2, srcO; \
    vector unsigned char dst1, dst2, dstO; \
    register vector unsigned char src1, src2, srcO; \
    register vector unsigned char dst1, dst2, dstO; \
    src1 = vec_ld(stride * i, src); \
    if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
      src2 = vec_ld((stride * i) + 16, src); \
@@ -1362,20 +1363,19 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    /* we're in the 8x8 function, we only care for the first 8 */ \
    vector signed short srcV = \
    register vector signed short srcV = \
      (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
    vector signed short dstV = \
    register vector signed short dstV = \
      (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    vector signed short but0 = vec_sub(srcV, dstV); \
    vector signed short op1 = vec_perm(but0, but0, perm1); \
    vector signed short but1 = vec_mladd(but0, vprod1, op1); \
    vector signed short op2 = vec_perm(but1, but1, perm2); \
    vector signed short but2 = vec_mladd(but1, vprod2, op2); \
    vector signed short op3 = vec_perm(but2, but2, perm3); \
    register vector signed short but0 = vec_sub(srcV, dstV); \
    register vector signed short op1 = vec_perm(but0, but0, perm1); \
    register vector signed short but1 = vec_mladd(but0, vprod1, op1); \
    register vector signed short op2 = vec_perm(but1, but1, perm2); \
    register vector signed short but2 = vec_mladd(but1, vprod2, op2); \
    register vector signed short op3 = vec_perm(but2, but2, perm3); \
    res = vec_mladd(but2, vprod3, op3); \
    }
    vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
@@ -1384,53 +1384,275 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
  }
#undef ONEITERBUTTERFLY
  {
    vector signed int vsum;
    vector signed short line0 = vec_add(temp0, temp1);
    vector signed short line1 = vec_sub(temp0, temp1);
    vector signed short line2 = vec_add(temp2, temp3);
    vector signed short line3 = vec_sub(temp2, temp3);
    vector signed short line4 = vec_add(temp4, temp5);
    vector signed short line5 = vec_sub(temp4, temp5);
    vector signed short line6 = vec_add(temp6, temp7);
    vector signed short line7 = vec_sub(temp6, temp7);
  {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
  }
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
  return sum;
}
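Editor's aside (not part of the patch): for readers who prefer scalar code, the routine above computes an 8x8 Walsh-Hadamard transform of the src-dst difference block and then sums the absolute values of the coefficients. The sketch below is an illustration only; the function name hadamard8_diff8x8_ref is made up, and its butterfly ordering differs from the vectorized one, which only permutes the coefficients and so should not change the final sum.

#include <stdint.h>
#include <stdlib.h>

static int hadamard8_diff8x8_ref(const uint8_t *dst, const uint8_t *src, int stride)
{
    int m[8][8], i, j, k, half, sum = 0;

    /* first butterfly input: the src - dst difference block */
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            m[i][j] = src[i * stride + j] - dst[i * stride + j];

    /* horizontal 8-point Hadamard: three butterfly passes, which is what the
     * vprod1..vprod3 / perm1..perm3 vec_mladd/vec_perm pairs compute per row */
    for (i = 0; i < 8; i++)
        for (half = 1; half < 8; half <<= 1)
            for (j = 0; j < 8; j += 2 * half)
                for (k = j; k < j + half; k++) {
                    int a = m[i][k], b = m[i][k + half];
                    m[i][k]        = a + b;
                    m[i][k + half] = a - b;
                }

    /* vertical passes: the line*, line*B, line*C add/sub chains */
    for (j = 0; j < 8; j++)
        for (half = 1; half < 8; half <<= 1)
            for (i = 0; i < 8; i += 2 * half)
                for (k = i; k < i + half; k++) {
                    int a = m[k][j], b = m[k + half][j];
                    m[k][j]        = a + b;
                    m[k + half][j] = a - b;
                }

    /* sum of absolute values, as done above with vec_abs/vec_sum4s/vec_sums */
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            sum += abs(m[i][j]);
    return sum;
}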
|
|
|
|
|
|
|
/*
16x8 works with 16 elements; it avoids replicating the
loads and gives the compiler more room for scheduling.
It's only used from inside hadamard8_diff16_altivec.

Unfortunately, gcc-3.3 seems to be a bit dumb here: the
compiled code has a LOT of spill code, since gcc (unlike xlc)
cannot keep everything in registers by itself. The following
code therefore includes hand-made register allocation. It's
not clean, but on a 7450 the resulting code is much faster
(the best case falls from 700+ cycles to 550).

xlc doesn't add spill code, but it doesn't know how to
schedule for the 7450, and its code isn't much faster than
gcc-3.3's on the 7450 (though it uses 25% fewer instructions...)

On the 970, the hand-made RA is still a win (around 690
cycles vs. around 780), but xlc gets down to around 660 on the
regular C code...
*/
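Editor's aside (not part of the patch): the hand-made allocation described above uses GCC's local register variable syntax, register ... asm("vN"), to pin each vector temporary to a named AltiVec register. A minimal, hypothetical sketch of the construct; the v0/v31 bindings actually used are the ones declared in the function below.

#include <altivec.h>

/* illustrative only: pin a vector temporary to a fixed AltiVec register,
 * the same "register ... asm" construct used throughout the code below */
static vector signed short pinned_add(vector signed short a, vector signed short b)
{
    register vector signed short t asm ("v0") = vec_add(a, b);
    return t;
}

gcc treats this mainly as an allocation hint (the binding is only guaranteed for asm operands), but per the numbers quoted in the comment it is enough to keep gcc-3.3 from spilling on the 7450.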
|
|
|
|
|
|
|
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
  int sum;
  register vector signed short
    temp0 asm ("v0"),
    temp1 asm ("v1"),
    temp2 asm ("v2"),
    temp3 asm ("v3"),
    temp4 asm ("v4"),
    temp5 asm ("v5"),
    temp6 asm ("v6"),
    temp7 asm ("v7");
  register vector signed short
    temp0S asm ("v8"),
    temp1S asm ("v9"),
    temp2S asm ("v10"),
    temp3S asm ("v11"),
    temp4S asm ("v12"),
    temp5S asm ("v13"),
    temp6S asm ("v14"),
    temp7S asm ("v15");
  register const_vector unsigned char vzero asm ("v31")= (const_vector unsigned char)vec_splat_u8(0);
  {
#ifdef CONFIG_DARWIN
    register const_vector signed short vprod1 asm ("v16")= (const_vector signed short)( 1,-1, 1,-1, 1,-1, 1,-1);
    register const_vector signed short vprod2 asm ("v17")= (const_vector signed short)( 1, 1,-1,-1, 1, 1,-1,-1);
    register const_vector signed short vprod3 asm ("v18")= (const_vector signed short)( 1, 1, 1, 1,-1,-1,-1,-1);
    register const_vector unsigned char perm1 asm ("v19")= (const_vector unsigned char)
      (0x02, 0x03, 0x00, 0x01,
       0x06, 0x07, 0x04, 0x05,
       0x0A, 0x0B, 0x08, 0x09,
       0x0E, 0x0F, 0x0C, 0x0D);
    register const_vector unsigned char perm2 asm ("v20")= (const_vector unsigned char)
      (0x04, 0x05, 0x06, 0x07,
       0x00, 0x01, 0x02, 0x03,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x08, 0x09, 0x0A, 0x0B);
    register const_vector unsigned char perm3 asm ("v21")= (const_vector unsigned char)
      (0x08, 0x09, 0x0A, 0x0B,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x00, 0x01, 0x02, 0x03,
       0x04, 0x05, 0x06, 0x07);
#else
    register const_vector signed short vprod1 = (const_vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1};
    register const_vector signed short vprod2 = (const_vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1};
    register const_vector signed short vprod3 = (const_vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1};
    register const_vector unsigned char perm1 = (const_vector unsigned char)
      {0x02, 0x03, 0x00, 0x01,
       0x06, 0x07, 0x04, 0x05,
       0x0A, 0x0B, 0x08, 0x09,
       0x0E, 0x0F, 0x0C, 0x0D};
    register const_vector unsigned char perm2 = (const_vector unsigned char)
      {0x04, 0x05, 0x06, 0x07,
       0x00, 0x01, 0x02, 0x03,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x08, 0x09, 0x0A, 0x0B};
    register const_vector unsigned char perm3 = (const_vector unsigned char)
      {0x08, 0x09, 0x0A, 0x0B,
       0x0C, 0x0D, 0x0E, 0x0F,
       0x00, 0x01, 0x02, 0x03,
       0x04, 0x05, 0x06, 0x07};
#endif
#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
    register vector unsigned char src1 asm ("v22"), src2 asm ("v23"); \
    register vector unsigned char dst1 asm ("v24"), dst2 asm ("v25"); \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 16, src); \
    register vector unsigned char srcO asm ("v22") = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 16, dst); \
    register vector unsigned char dstO asm ("v23") = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    register vector signed short srcV asm ("v24") = \
      (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)srcO); \
    register vector signed short dstV asm ("v25") = \
      (vector signed short)vec_mergeh((vector signed char)vzero, (vector signed char)dstO); \
    register vector signed short srcW asm ("v26") = \
      (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)srcO); \
    register vector signed short dstW asm ("v27") = \
      (vector signed short)vec_mergel((vector signed char)vzero, (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    register vector signed short but0 asm ("v28") = vec_sub(srcV, dstV); \
    register vector signed short but0S asm ("v29") = vec_sub(srcW, dstW); \
    register vector signed short op1 asm ("v30") = vec_perm(but0, but0, perm1); \
    register vector signed short but1 asm ("v22") = vec_mladd(but0, vprod1, op1); \
    register vector signed short op1S asm ("v23") = vec_perm(but0S, but0S, perm1); \
    register vector signed short but1S asm ("v24") = vec_mladd(but0S, vprod1, op1S); \
    register vector signed short op2 asm ("v25") = vec_perm(but1, but1, perm2); \
    register vector signed short but2 asm ("v26") = vec_mladd(but1, vprod2, op2); \
    register vector signed short op2S asm ("v27") = vec_perm(but1S, but1S, perm2); \
    register vector signed short but2S asm ("v28") = vec_mladd(but1S, vprod2, op2S); \
    register vector signed short op3 asm ("v29") = vec_perm(but2, but2, perm3); \
    res1 = vec_mladd(but2, vprod3, op3); \
    register vector signed short op3S asm ("v30") = vec_perm(but2S, but2S, perm3); \
    res2 = vec_mladd(but2S, vprod3, op3S); \
    }
    ONEITERBUTTERFLY(0, temp0, temp0S);
    ONEITERBUTTERFLY(1, temp1, temp1S);
    ONEITERBUTTERFLY(2, temp2, temp2S);
    ONEITERBUTTERFLY(3, temp3, temp3S);
    ONEITERBUTTERFLY(4, temp4, temp4S);
    ONEITERBUTTERFLY(5, temp5, temp5S);
    ONEITERBUTTERFLY(6, temp6, temp6S);
    ONEITERBUTTERFLY(7, temp7, temp7S);
  }
#undef ONEITERBUTTERFLY
  {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    vector signed short line0B = vec_add(line0, line2);
    vector signed short line2B = vec_sub(line0, line2);
    vector signed short line1B = vec_add(line1, line3);
    vector signed short line3B = vec_sub(line1, line3);
    vector signed short line4B = vec_add(line4, line6);
    vector signed short line6B = vec_sub(line4, line6);
    vector signed short line5B = vec_add(line5, line7);
    vector signed short line7B = vec_sub(line5, line7);
    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    vector signed short line0C = vec_add(line0B, line4B);
    vector signed short line4C = vec_sub(line0B, line4B);
    vector signed short line1C = vec_add(line1B, line5B);
    vector signed short line5C = vec_sub(line1B, line5B);
    vector signed short line2C = vec_add(line2B, line6B);
    vector signed short line6C = vec_sub(line2B, line6B);
    vector signed short line3C = vec_add(line3B, line7B);
    vector signed short line7C = vec_sub(line3B, line7B);
    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
  }
    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);

    register vector signed short line0S = vec_add(temp0S, temp1S);
    register vector signed short line1S = vec_sub(temp0S, temp1S);
    register vector signed short line2S = vec_add(temp2S, temp3S);
    register vector signed short line3S = vec_sub(temp2S, temp3S);
    register vector signed short line4S = vec_add(temp4S, temp5S);
    register vector signed short line5S = vec_sub(temp4S, temp5S);
    register vector signed short line6S = vec_add(temp6S, temp7S);
    register vector signed short line7S = vec_sub(temp6S, temp7S);

    register vector signed short line0BS = vec_add(line0S, line2S);
    register vector signed short line2BS = vec_sub(line0S, line2S);
    register vector signed short line1BS = vec_add(line1S, line3S);
    register vector signed short line3BS = vec_sub(line1S, line3S);
    register vector signed short line4BS = vec_add(line4S, line6S);
    register vector signed short line6BS = vec_sub(line4S, line6S);
    register vector signed short line5BS = vec_add(line5S, line7S);
    register vector signed short line7BS = vec_sub(line5S, line7S);

    register vector signed short line0CS = vec_add(line0BS, line4BS);
    register vector signed short line4CS = vec_sub(line0BS, line4BS);
    register vector signed short line1CS = vec_add(line1BS, line5BS);
    register vector signed short line5CS = vec_sub(line1BS, line5BS);
    register vector signed short line2CS = vec_add(line2BS, line6BS);
    register vector signed short line6CS = vec_sub(line2BS, line6BS);
    register vector signed short line3CS = vec_add(line3BS, line7BS);
    register vector signed short line7CS = vec_sub(line3BS, line7BS);

    vsum = vec_sum4s(vec_abs(line0CS), vsum);
    vsum = vec_sum4s(vec_abs(line1CS), vsum);
    vsum = vec_sum4s(vec_abs(line2CS), vsum);
    vsum = vec_sum4s(vec_abs(line3CS), vsum);
    vsum = vec_sum4s(vec_abs(line4CS), vsum);
    vsum = vec_sum4s(vec_abs(line5CS), vsum);
    vsum = vec_sum4s(vec_abs(line6CS), vsum);
    vsum = vec_sum4s(vec_abs(line7CS), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
  }
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
  return sum;
}
|
|
|
|
int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
  int score;
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
  score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
  if (h==16) {
    dst += 8*stride;
    src += 8*stride;
    score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
  }
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
  return score;
}
|
|
|
|
int has_altivec(void)
{
#ifdef CONFIG_DARWIN