|
|
|
@@ -5,6 +5,11 @@ |
|
|
|
// current version mostly by Michael Niedermayer (michaelni@gmx.at) |
|
|
|
// the parts written by michael are under GNU GPL |
|
|
|
|
|
|
|
/* TODO |
|
|
|
Move static / global vars into a struct so multiple scalers can be used |
|
|
|
write vertical cubic upscale / linear downscale stuff |
|
|
|
*/ |
|
|
|
|
|
|
|
#undef MOVNTQ |
|
|
|
#undef PAVGB |
|
|
|
#undef PREFETCH |
|
|
|
@@ -1154,10 +1159,176 @@ static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *buf1, uint16_t *uv |
|
|
|
#endif |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc) |
|
|
|
// Bilinear / Bicubic scaling |
|
|
|
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, |
|
|
|
int16_t *filter, int16_t *filterPos, int filterSize) |
|
|
|
{ |
|
|
|
#ifdef HAVE_MMX |
|
|
|
if(filterSize==4) // allways true for upscaling, sometimes for down too |
|
|
|
{ |
|
|
|
int counter= -2*dstW; |
|
|
|
filter-= counter*2; |
|
|
|
filterPos-= counter/2; |
|
|
|
dst-= counter/2; |
|
|
|
asm volatile( |
|
|
|
"pxor %%mm7, %%mm7 \n\t" |
|
|
|
"movq w02, %%mm6 \n\t" |
|
|
|
"pushl %%ebp \n\t" // we use 7 regs here ... |
|
|
|
"movl %%eax, %%ebp \n\t" |
|
|
|
".balign 16 \n\t" |
|
|
|
"1: \n\t" |
|
|
|
"movzwl (%2, %%ebp), %%eax \n\t" |
|
|
|
"movzwl 2(%2, %%ebp), %%ebx \n\t" |
|
|
|
"movq (%1, %%ebp, 4), %%mm1 \n\t" |
|
|
|
"movq 8(%1, %%ebp, 4), %%mm3 \n\t" |
|
|
|
"movd (%3, %%eax), %%mm0 \n\t" |
|
|
|
"movd (%3, %%ebx), %%mm2 \n\t" |
|
|
|
"punpcklbw %%mm7, %%mm0 \n\t" |
|
|
|
"punpcklbw %%mm7, %%mm2 \n\t" |
|
|
|
"pmaddwd %%mm1, %%mm0 \n\t" |
|
|
|
"pmaddwd %%mm2, %%mm3 \n\t" |
|
|
|
"psrad $8, %%mm0 \n\t" |
|
|
|
"psrad $8, %%mm3 \n\t" |
|
|
|
"packssdw %%mm3, %%mm0 \n\t" |
|
|
|
"pmaddwd %%mm6, %%mm0 \n\t" |
|
|
|
"packssdw %%mm0, %%mm0 \n\t" |
|
|
|
"movd %%mm0, (%4, %%ebp) \n\t" |
|
|
|
"addl $4, %%ebp \n\t" |
|
|
|
" jnc 1b \n\t" |
|
|
|
|
|
|
|
"popl %%ebp \n\t" |
|
|
|
: "+a" (counter) |
|
|
|
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst) |
|
|
|
: "%ebx" |
|
|
|
); |
|
|
|
} |
|
|
|
else if(filterSize==8) |
|
|
|
{ |
|
|
|
int counter= -2*dstW; |
|
|
|
filter-= counter*4; |
|
|
|
filterPos-= counter/2; |
|
|
|
dst-= counter/2; |
|
|
|
asm volatile( |
|
|
|
"pxor %%mm7, %%mm7 \n\t" |
|
|
|
"movq w02, %%mm6 \n\t" |
|
|
|
"pushl %%ebp \n\t" // we use 7 regs here ... |
|
|
|
"movl %%eax, %%ebp \n\t" |
|
|
|
".balign 16 \n\t" |
|
|
|
"1: \n\t" |
|
|
|
"movzwl (%2, %%ebp), %%eax \n\t" |
|
|
|
"movzwl 2(%2, %%ebp), %%ebx \n\t" |
|
|
|
"movq (%1, %%ebp, 8), %%mm1 \n\t" |
|
|
|
"movq 16(%1, %%ebp, 8), %%mm3 \n\t" |
|
|
|
"movd (%3, %%eax), %%mm0 \n\t" |
|
|
|
"movd (%3, %%ebx), %%mm2 \n\t" |
|
|
|
"punpcklbw %%mm7, %%mm0 \n\t" |
|
|
|
"punpcklbw %%mm7, %%mm2 \n\t" |
|
|
|
"pmaddwd %%mm1, %%mm0 \n\t" |
|
|
|
"pmaddwd %%mm2, %%mm3 \n\t" |
|
|
|
|
|
|
|
"movq 8(%1, %%ebp, 8), %%mm1 \n\t" |
|
|
|
"movq 24(%1, %%ebp, 8), %%mm5 \n\t" |
|
|
|
"movd 4(%3, %%eax), %%mm4 \n\t" |
|
|
|
"movd 4(%3, %%ebx), %%mm2 \n\t" |
|
|
|
"punpcklbw %%mm7, %%mm4 \n\t" |
|
|
|
"punpcklbw %%mm7, %%mm2 \n\t" |
|
|
|
"pmaddwd %%mm1, %%mm4 \n\t" |
|
|
|
"pmaddwd %%mm2, %%mm5 \n\t" |
|
|
|
"paddd %%mm4, %%mm0 \n\t" |
|
|
|
"paddd %%mm5, %%mm3 \n\t" |
|
|
|
|
|
|
|
"psrad $8, %%mm0 \n\t" |
|
|
|
"psrad $8, %%mm3 \n\t" |
|
|
|
"packssdw %%mm3, %%mm0 \n\t" |
|
|
|
"pmaddwd %%mm6, %%mm0 \n\t" |
|
|
|
"packssdw %%mm0, %%mm0 \n\t" |
|
|
|
"movd %%mm0, (%4, %%ebp) \n\t" |
|
|
|
"addl $4, %%ebp \n\t" |
|
|
|
" jnc 1b \n\t" |
|
|
|
|
|
|
|
"popl %%ebp \n\t" |
|
|
|
: "+a" (counter) |
|
|
|
: "c" (filter), "d" (filterPos), "S" (src), "D" (dst) |
|
|
|
: "%ebx" |
|
|
|
); |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
int counter= -2*dstW; |
|
|
|
// filter-= counter*filterSize/2; |
|
|
|
filterPos-= counter/2; |
|
|
|
dst-= counter/2; |
|
|
|
asm volatile( |
|
|
|
"pxor %%mm7, %%mm7 \n\t" |
|
|
|
"movq w02, %%mm6 \n\t" |
|
|
|
".balign 16 \n\t" |
|
|
|
"1: \n\t" |
|
|
|
"movl %2, %%ecx \n\t" |
|
|
|
"movzwl (%%ecx, %0), %%eax \n\t" |
|
|
|
"movzwl 2(%%ecx, %0), %%ebx \n\t" |
|
|
|
"movl %5, %%ecx \n\t" |
|
|
|
"pxor %%mm4, %%mm4 \n\t" |
|
|
|
"pxor %%mm5, %%mm5 \n\t" |
|
|
|
"2: \n\t" |
|
|
|
"movq (%1), %%mm1 \n\t" |
|
|
|
"movq (%1, %6), %%mm3 \n\t" |
|
|
|
"movd (%%ecx, %%eax), %%mm0 \n\t" |
|
|
|
"movd (%%ecx, %%ebx), %%mm2 \n\t" |
|
|
|
"punpcklbw %%mm7, %%mm0 \n\t" |
|
|
|
"punpcklbw %%mm7, %%mm2 \n\t" |
|
|
|
"pmaddwd %%mm1, %%mm0 \n\t" |
|
|
|
"pmaddwd %%mm2, %%mm3 \n\t" |
|
|
|
"paddd %%mm3, %%mm5 \n\t" |
|
|
|
"paddd %%mm0, %%mm4 \n\t" |
|
|
|
"addl $8, %1 \n\t" |
|
|
|
"addl $4, %%ecx \n\t" |
|
|
|
"cmpl %4, %%ecx \n\t" |
|
|
|
" jb 2b \n\t" |
|
|
|
"addl %6, %1 \n\t" |
|
|
|
"psrad $8, %%mm4 \n\t" |
|
|
|
"psrad $8, %%mm5 \n\t" |
|
|
|
"packssdw %%mm5, %%mm4 \n\t" |
|
|
|
"pmaddwd %%mm6, %%mm4 \n\t" |
|
|
|
"packssdw %%mm4, %%mm4 \n\t" |
|
|
|
"movl %3, %%eax \n\t" |
|
|
|
"movd %%mm4, (%%eax, %0) \n\t" |
|
|
|
"addl $4, %0 \n\t" |
|
|
|
" jnc 1b \n\t" |
|
|
|
|
|
|
|
: "+r" (counter) |
|
|
|
: "r" (filter), "m" (filterPos), "m" (dst), "m"(src+filterSize), |
|
|
|
"m" (src), "r" (filterSize*2) |
|
|
|
: "%ebx", "%eax", "%ecx", "%edx" |
|
|
|
); |
|
|
|
} |
|
|
|
#else |
|
|
|
int i; |
|
|
|
for(i=0; i<dstW; i++) |
|
|
|
{ |
|
|
|
int j; |
|
|
|
int srcPos= filterPos[i]; |
|
|
|
int val=0; |
|
|
|
// printf("filterPos: %d\n", hFilterPos[i]); |
|
|
|
for(j=0; j<filterSize; j++) |
|
|
|
{ |
|
|
|
// printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]); |
|
|
|
val += ((int)src[srcPos + j])*filter[filterSize*i + j]; |
|
|
|
} |
|
|
|
// filter += hFilterSize; |
|
|
|
dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ... |
|
|
|
// dst[i] = val>>7; |
|
|
|
} |
|
|
|
#endif |
|
|
|
} |
|
|
|
// *** horizontal scale Y line to temp buffer |
|
|
|
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc) |
|
|
|
{ |
|
|
|
if(sws_flags != SWS_FAST_BILINEAR) |
|
|
|
{ |
|
|
|
RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); |
|
|
|
} |
|
|
|
else // Fast Bilinear upscale / crap downscale |
|
|
|
{ |
|
|
|
#ifdef ARCH_X86 |
|
|
|
#ifdef HAVE_MMX2 |
|
|
|
int i; |
|
|
|
@@ -1267,11 +1438,19 @@ FUNNY_Y_CODE |
|
|
|
xpos+=xInc; |
|
|
|
} |
|
|
|
#endif |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, |
|
|
|
uint8_t *src1, uint8_t *src2, int srcW, int xInc) |
|
|
|
{ |
|
|
|
if(sws_flags != SWS_FAST_BILINEAR) |
|
|
|
{ |
|
|
|
RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); |
|
|
|
RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); |
|
|
|
} |
|
|
|
else // Fast Bilinear upscale / crap downscale |
|
|
|
{ |
|
|
|
#ifdef ARCH_X86 |
|
|
|
#ifdef HAVE_MMX2 |
|
|
|
int i; |
|
|
|
@@ -1402,6 +1581,162 @@ FUNNYUVCODE |
|
|
|
xpos+=xInc; |
|
|
|
} |
|
|
|
#endif |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
static void inline RENAME(initFilter)(int16_t *filter, int16_t *filterPos, int *filterSize, int xInc, |
|
|
|
int srcW, int dstW) |
|
|
|
{ |
|
|
|
int i; |
|
|
|
#ifdef HAVE_MMX |
|
|
|
asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS |
|
|
|
#endif |
|
|
|
|
|
|
|
if(xInc <= (1<<16)) // upscale / cubic interpolate |
|
|
|
{ |
|
|
|
int i; |
|
|
|
int xDstInSrc; |
|
|
|
if(sws_flags==SWS_BICUBIC) *filterSize= 4; |
|
|
|
else *filterSize= 2; |
|
|
|
// printf("%d %d %d\n", filterSize, srcW, dstW); |
|
|
|
#ifdef HAVE_MMX |
|
|
|
*filterSize= (*filterSize +3) & (~3); // -> *filterSize %4 == 0 |
|
|
|
#endif |
|
|
|
xDstInSrc= xInc - 0x8000; |
|
|
|
for(i=0; i<dstW; i++) |
|
|
|
{ |
|
|
|
int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1; |
|
|
|
int j; |
|
|
|
|
|
|
|
filterPos[i]= xx; |
|
|
|
if(sws_flags == SWS_BICUBIC) |
|
|
|
{ |
|
|
|
double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16); |
|
|
|
// int coeff; |
|
|
|
int y1,y2,y3,y4; |
|
|
|
double A= -0.75; |
|
|
|
// Equation is from VirtualDub |
|
|
|
y1 = (int)floor(0.5 + ( + A*d - 2.0*A*d*d + A*d*d*d) * 16384.0); |
|
|
|
y2 = (int)floor(0.5 + (+ 1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d) * 16384.0); |
|
|
|
y3 = (int)floor(0.5 + ( - A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d) * 16384.0); |
|
|
|
y4 = (int)floor(0.5 + ( + A*d*d - A*d*d*d) * 16384.0); |
|
|
|
|
|
|
|
// printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
|
|
|
filter[i*(*filterSize) + 0]= y1; |
|
|
|
filter[i*(*filterSize) + 1]= y2; |
|
|
|
filter[i*(*filterSize) + 2]= y3; |
|
|
|
filter[i*(*filterSize) + 3]= y4; |
|
|
|
// printf("%1.3f %d, %d, %d, %d\n",d , y1, y2, y3, y4); |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
for(j=0; j<*filterSize; j++) |
|
|
|
{ |
|
|
|
double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16); |
|
|
|
int coeff; |
|
|
|
coeff= (int)(0.5 + (1.0 - d)*(1<<14)); |
|
|
|
if(coeff<0) coeff=0; |
|
|
|
// printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
|
|
|
filter[i*(*filterSize) + j]= coeff; |
|
|
|
xx++; |
|
|
|
} |
|
|
|
} |
|
|
|
xDstInSrc+= xInc; |
|
|
|
} |
|
|
|
} |
|
|
|
else // downscale |
|
|
|
{ |
|
|
|
int xDstInSrc; |
|
|
|
if(sws_flags==SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW); |
|
|
|
else *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW); |
|
|
|
// printf("%d %d %d\n", *filterSize, srcW, dstW); |
|
|
|
#ifdef HAVE_MMX |
|
|
|
*filterSize= (*filterSize +3) & (~3); // -> *filterSize %4 == 0 |
|
|
|
#endif |
|
|
|
xDstInSrc= xInc - 0x8000; |
|
|
|
for(i=0; i<dstW; i++) |
|
|
|
{ |
|
|
|
int xx= (int)((double)xDstInSrc/(double)(1<<16) - *filterSize*0.5 + 0.5); |
|
|
|
int j; |
|
|
|
|
|
|
|
filterPos[i]= xx; |
|
|
|
for(j=0; j<*filterSize; j++) |
|
|
|
{ |
|
|
|
double d= ABS((xx<<16) - xDstInSrc)/(double)xInc; |
|
|
|
int coeff; |
|
|
|
if(sws_flags == SWS_BICUBIC) |
|
|
|
{ |
|
|
|
double A= -0.75; |
|
|
|
// d*=2; |
|
|
|
// Equation is from VirtualDub |
|
|
|
if(d<1.0) |
|
|
|
coeff = (int)floor(0.5 + (1.0 - (A+3.0)*d*d |
|
|
|
+ (A+2.0)*d*d*d) * (1<<14)); |
|
|
|
else if(d<2.0) |
|
|
|
coeff = (int)floor(0.5 + (-4.0*A + 8.0*A*d |
|
|
|
- 5.0*A*d*d + A*d*d*d) * (1<<14)); |
|
|
|
else |
|
|
|
coeff=0; |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
coeff= (int)(0.5 + (1.0 - d)*(1<<14)); |
|
|
|
if(coeff<0) coeff=0; |
|
|
|
} |
|
|
|
// printf("%d %d %d \n", coeff, (int)d, xDstInSrc); |
|
|
|
filter[i*(*filterSize) + j]= coeff; |
|
|
|
xx++; |
|
|
|
} |
|
|
|
xDstInSrc+= xInc; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
//fix borders |
|
|
|
for(i=0; i<dstW; i++) |
|
|
|
{ |
|
|
|
int j; |
|
|
|
if(filterPos[i] < 0) |
|
|
|
{ |
|
|
|
// Move filter coeffs left to compensate for filterPos |
|
|
|
for(j=1; j<*filterSize; j++) |
|
|
|
{ |
|
|
|
int left= MAX(j + filterPos[i], 0); |
|
|
|
filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j]; |
|
|
|
filter[i*(*filterSize) + j]=0; |
|
|
|
} |
|
|
|
filterPos[i]= 0; |
|
|
|
} |
|
|
|
|
|
|
|
if(filterPos[i] + *filterSize > srcW) |
|
|
|
{ |
|
|
|
int shift= filterPos[i] + *filterSize - srcW; |
|
|
|
// Move filter coeffs right to compensate for filterPos |
|
|
|
for(j=*filterSize-2; j>=0; j--) |
|
|
|
{ |
|
|
|
int right= MIN(j + shift, *filterSize-1); |
|
|
|
filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j]; |
|
|
|
filter[i*(*filterSize) +j]=0; |
|
|
|
} |
|
|
|
filterPos[i]= srcW - *filterSize; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
//Normalize |
|
|
|
for(i=0; i<dstW; i++) |
|
|
|
{ |
|
|
|
int j; |
|
|
|
double sum=0; |
|
|
|
double scale=1<<14; |
|
|
|
for(j=0; j<*filterSize; j++) |
|
|
|
{ |
|
|
|
sum+= filter[i*(*filterSize) + j]; |
|
|
|
} |
|
|
|
scale/= sum; |
|
|
|
for(j=0; j<*filterSize; j++) |
|
|
|
{ |
|
|
|
filter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
static void RENAME(SwScale_YV12slice)(unsigned char* srcptr[],int stride[], int srcSliceY , |
|
|
|
@@ -1421,11 +1756,8 @@ static int dstY; |
|
|
|
static int lastLumSrcY; |
|
|
|
static int lastChrSrcY; |
|
|
|
|
|
|
|
#ifdef HAVE_MMX2 |
|
|
|
// used to detect a horizontal size change |
|
|
|
static int old_dstW= -1; |
|
|
|
static int old_s_xinc= -1; |
|
|
|
#endif |
|
|
|
static int oldDstW= -1; |
|
|
|
static int oldSrcW= -1; |
|
|
|
|
|
|
|
int dstUVw; |
|
|
|
int i; |
|
|
|
@@ -1469,10 +1801,19 @@ else s_xinc2= s_xinc; |
|
|
|
= pix_buf_uv[0][2048+i/2] = pix_buf_uv[1][2048+i/2] = 128*128; |
|
|
|
pix_buf_y[0][i]= pix_buf_y[1][i]= 0; |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
//precalculate horizontal scaler filter coefficients |
|
|
|
if(oldDstW!=dstW || oldSrcW!=srcW) |
|
|
|
{ |
|
|
|
// int i; |
|
|
|
oldDstW= dstW; oldSrcW= srcW; |
|
|
|
|
|
|
|
RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, s_xinc, srcW, dstW); |
|
|
|
RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, s_xinc2, srcW, dstW); |
|
|
|
|
|
|
|
#ifdef HAVE_MMX2 |
|
|
|
// cant downscale !!! |
|
|
|
if((old_s_xinc != s_xinc || old_dstW!=dstW) && canMMX2BeUsed) |
|
|
|
if(canMMX2BeUsed) |
|
|
|
{ |
|
|
|
uint8_t *fragment; |
|
|
|
int imm8OfPShufW1; |
|
|
|
@@ -1481,9 +1822,6 @@ else s_xinc2= s_xinc; |
|
|
|
|
|
|
|
int xpos, i; |
|
|
|
|
|
|
|
old_s_xinc= s_xinc; |
|
|
|
old_dstW= dstW; |
|
|
|
|
|
|
|
// create an optimized horizontal scaling routine |
|
|
|
|
|
|
|
//code fragment |
|
|
|
@@ -1532,20 +1870,6 @@ else s_xinc2= s_xinc; |
|
|
|
|
|
|
|
xpos= 0; //s_xinc/2 - 0x8000; // difference between pixel centers |
|
|
|
|
|
|
|
/* choose xinc so that all 8 parts fit exactly |
|
|
|
Note: we cannot use just 1 part because it would not fit in the code cache */ |
|
|
|
// s_xinc2_diff= -((((s_xinc2*(dstW/8))&0xFFFF))/(dstW/8))-10; |
|
|
|
// s_xinc_diff= -((((s_xinc*(dstW/8))&0xFFFF))/(dstW/8)); |
|
|
|
#ifdef ALT_ERROR |
|
|
|
// s_xinc2_diff+= ((0x10000/(dstW/8))); |
|
|
|
#endif |
|
|
|
// s_xinc_diff= s_xinc2_diff*2; |
|
|
|
|
|
|
|
// s_xinc2+= s_xinc2_diff; |
|
|
|
// s_xinc+= s_xinc_diff; |
|
|
|
|
|
|
|
// old_s_xinc= s_xinc; |
|
|
|
|
|
|
|
for(i=0; i<dstW/8; i++) |
|
|
|
{ |
|
|
|
int xx=xpos>>16; |
|
|
|
@@ -1602,6 +1926,7 @@ else s_xinc2= s_xinc; |
|
|
|
} |
|
|
|
|
|
|
|
#endif // HAVE_MMX2 |
|
|
|
} // Init stuff |
|
|
|
} // reset counters |
|
|
|
|
|
|
|
while(1){ |
|
|
|
|