|
|
@@ -3242,6 +3242,69 @@ static inline void RENAME(duplicate)(uint8_t src[], int stride) |
|
|
|
#endif |
|
|
|
} |
|
|
|
|
|
|
|
#if ARCH_X86 && TEMPLATE_PP_MMXEXT |
|
|
|
static inline void RENAME(prefetchnta)(const void *p) |
|
|
|
{ |
|
|
|
__asm__ volatile( "prefetchnta (%0)\n\t" |
|
|
|
: : "r" (p) |
|
|
|
); |
|
|
|
} |
|
|
|
|
|
|
|
static inline void RENAME(prefetcht0)(const void *p) |
|
|
|
{ |
|
|
|
__asm__ volatile( "prefetcht0 (%0)\n\t" |
|
|
|
: : "r" (p) |
|
|
|
); |
|
|
|
} |
|
|
|
|
|
|
|
static inline void RENAME(prefetcht1)(const void *p) |
|
|
|
{ |
|
|
|
__asm__ volatile( "prefetcht1 (%0)\n\t" |
|
|
|
: : "r" (p) |
|
|
|
); |
|
|
|
} |
|
|
|
|
|
|
|
static inline void RENAME(prefetcht2)(const void *p) |
|
|
|
{ |
|
|
|
__asm__ volatile( "prefetcht2 (%0)\n\t" |
|
|
|
: : "r" (p) |
|
|
|
); |
|
|
|
} |
|
|
|
#elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2) |
|
|
|
static inline void RENAME(prefetchnta)(const void *p) |
|
|
|
{ |
|
|
|
__builtin_prefetch(p,0,0); |
|
|
|
} |
|
|
|
static inline void RENAME(prefetcht0)(const void *p) |
|
|
|
{ |
|
|
|
__builtin_prefetch(p,0,1); |
|
|
|
} |
|
|
|
static inline void RENAME(prefetcht1)(const void *p) |
|
|
|
{ |
|
|
|
__builtin_prefetch(p,0,2); |
|
|
|
} |
|
|
|
static inline void RENAME(prefetcht2)(const void *p) |
|
|
|
{ |
|
|
|
__builtin_prefetch(p,0,3); |
|
|
|
} |
|
|
|
#else |
|
|
|
/**
 * Fallback prefetchnta for builds where neither the inline-assembly nor
 * the compiler-builtin variant is selected: does nothing.
 *
 * @param p ignored
 */
static inline void RENAME(prefetchnta)(const void *p)
{
    /* intentionally empty: no prefetch is issued in this configuration */
}
|
|
|
/**
 * Fallback prefetcht0 for builds where neither the inline-assembly nor
 * the compiler-builtin variant is selected: does nothing.
 *
 * @param p ignored
 */
static inline void RENAME(prefetcht0)(const void *p)
{
    /* intentionally empty: no prefetch is issued in this configuration */
}
|
|
|
/**
 * Fallback prefetcht1 for builds where neither the inline-assembly nor
 * the compiler-builtin variant is selected: does nothing.
 *
 * @param p ignored
 */
static inline void RENAME(prefetcht1)(const void *p)
{
    /* intentionally empty: no prefetch is issued in this configuration */
}
|
|
|
/**
 * Fallback prefetcht2 for builds where neither the inline-assembly nor
 * the compiler-builtin variant is selected: does nothing.
 *
 * @param p ignored
 */
static inline void RENAME(prefetcht2)(const void *p)
{
    /* intentionally empty: no prefetch is issued in this configuration */
}
|
|
|
#endif |
|
|
|
/** |
|
|
|
* Filter array of bytes (Y or U or V values) |
|
|
|
*/ |
|
|
@@ -3368,34 +3431,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[ |
|
|
|
// finish 1 block before the next otherwise we might have a problem |
|
|
|
// with the L1 Cache of the P4 ... or only a few blocks at a time or something |
|
|
|
for(x=0; x<width; x+=BLOCK_SIZE){ |
|
|
|
|
|
|
|
#if TEMPLATE_PP_MMXEXT && HAVE_6REGS |
|
|
|
/* |
|
|
|
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
|
|
|
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
|
|
|
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
|
|
|
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
|
|
|
*/ |
|
|
|
|
|
|
|
__asm__( |
|
|
|
"mov %4, %%"REG_a" \n\t" |
|
|
|
"shr $2, %%"REG_a" \n\t" |
|
|
|
"and $6, %%"REG_a" \n\t" |
|
|
|
"add %5, %%"REG_a" \n\t" |
|
|
|
"mov %%"REG_a", %%"REG_d" \n\t" |
|
|
|
"imul %1, %%"REG_a" \n\t" |
|
|
|
"imul %3, %%"REG_d" \n\t" |
|
|
|
"prefetchnta 32(%%"REG_a", %0) \n\t" |
|
|
|
"prefetcht0 32(%%"REG_d", %2) \n\t" |
|
|
|
"add %1, %%"REG_a" \n\t" |
|
|
|
"add %3, %%"REG_d" \n\t" |
|
|
|
"prefetchnta 32(%%"REG_a", %0) \n\t" |
|
|
|
"prefetcht0 32(%%"REG_d", %2) \n\t" |
|
|
|
:: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), |
|
|
|
"g" ((x86_reg)x), "g" ((x86_reg)copyAhead) |
|
|
|
: "%"REG_a, "%"REG_d |
|
|
|
); |
|
|
|
#endif |
|
|
|
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32); |
|
|
|
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32); |
|
|
|
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32); |
|
|
|
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32); |
|
|
|
|
|
|
|
RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
|
|
|
srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
|
|
@@ -3474,33 +3513,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[ |
|
|
|
uint8_t *dstBlockStart = dstBlock; |
|
|
|
const uint8_t *srcBlockStart = srcBlock; |
|
|
|
for(; x < endx; x+=BLOCK_SIZE){ |
|
|
|
#if TEMPLATE_PP_MMXEXT && HAVE_6REGS |
|
|
|
/* |
|
|
|
prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
|
|
|
prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
|
|
|
prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
|
|
|
prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
|
|
|
*/ |
|
|
|
|
|
|
|
__asm__( |
|
|
|
"mov %4, %%"REG_a" \n\t" |
|
|
|
"shr $2, %%"REG_a" \n\t" |
|
|
|
"and $6, %%"REG_a" \n\t" |
|
|
|
"add %5, %%"REG_a" \n\t" |
|
|
|
"mov %%"REG_a", %%"REG_d" \n\t" |
|
|
|
"imul %1, %%"REG_a" \n\t" |
|
|
|
"imul %3, %%"REG_d" \n\t" |
|
|
|
"prefetchnta 32(%%"REG_a", %0) \n\t" |
|
|
|
"prefetcht0 32(%%"REG_d", %2) \n\t" |
|
|
|
"add %1, %%"REG_a" \n\t" |
|
|
|
"add %3, %%"REG_d" \n\t" |
|
|
|
"prefetchnta 32(%%"REG_a", %0) \n\t" |
|
|
|
"prefetcht0 32(%%"REG_d", %2) \n\t" |
|
|
|
:: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), |
|
|
|
"g" ((x86_reg)x), "g" ((x86_reg)copyAhead) |
|
|
|
: "%"REG_a, "%"REG_d |
|
|
|
); |
|
|
|
#endif |
|
|
|
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32); |
|
|
|
RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32); |
|
|
|
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32); |
|
|
|
RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32); |
|
|
|
|
|
|
|
RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
|
|
|
srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
|
|
|