moved mmx variables to top to avoid alignment issues
mmx2 code should work fine now if and only if the input width is %16=0 and the output width is %32=0
reordered some code (5% faster with a simple -benchmark)
first line bug fixed (i hope i didnt introduce any new bugs with that ...)
changed a lot of the vertical scale setup code, i hope i fixed something and didnt mess it up :)
a few known bugs left (rightmost line is wrong)
MMX2 code will only be used for upscaling & acceptable widths
16bit dithering can be disabled

Originally committed as revision 2265 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
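
For reference, a minimal C sketch of the width/step check that now gates the MMX2 path (the wrapper function is mine; the flag and the 16.16 setup follow the diff below, and the input-width %16 requirement from the message above is only a documented assumption, not checked by the patch):

    /* sketch, not code from the tree */
    static int can_mmx2_be_used(unsigned int s_xinc, unsigned int dstw)
    {
        s_xinc &= -2;     /* clear last bit or uv and y might be shifted relative to each other */
        s_xinc *= 256;    /* the mmx2 code needs at least 16.16 precision */
        /* mmx2 path only upscales (step <= 1.0 in 16.16) and needs dstw to be a multiple of 32;
           the input width should additionally be a multiple of 16 (see the message above) */
        return (s_xinc <= 0x10000 && (dstw & 31) == 0) ? 1 : 0;
    }
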
| @@ -1,28 +1,29 @@ | |||
| // Software scaling and colorspace conversion routines for MPlayer | |||
| // Orginal C implementation by ? | |||
| // current version mostly by Michael Niedermayer (michaelni@gmx.at) | |||
| #include <inttypes.h> | |||
| #include "../config.h" | |||
| #undef HAVE_MMX2 //code is buggy | |||
| //#undef HAVE_MMX2 | |||
| //#undef HAVE_MMX | |||
| //#undef ARCH_X86 | |||
| #define DITHER16BPP | |||
| #define ALT_ERROR | |||
| #define RET 0xC3 //near return opcode | |||
| /* | |||
| NOTES | |||
| // temporary storage for 4 yuv lines: | |||
| // 16bit for now (mmx likes it more compact) | |||
| static uint16_t pix_buf_y[4][2048]; | |||
| static uint16_t pix_buf_uv[2][2048*2]; | |||
| known BUGS with known cause (no bugreports please!) | |||
| line at the right (c,asm and mmx2) | |||
| code reads 1 sample too much (might cause a sig11) | |||
| // clipping helper table for C implementations: | |||
| static unsigned char clip_table[768]; | |||
| // yuv->rgb conversion tables: | |||
| static int yuvtab_2568[256]; | |||
| static int yuvtab_3343[256]; | |||
| static int yuvtab_0c92[256]; | |||
| static int yuvtab_1a1e[256]; | |||
| static int yuvtab_40cf[256]; | |||
| TODO | |||
| check alignment off everything | |||
| */ | |||
| static uint64_t yCoeff= 0x2568256825682568LL; | |||
| static uint64_t ubCoeff= 0x3343334333433343LL; | |||
| @@ -46,11 +47,27 @@ static uint64_t g16Mask= 0x07E007E007E007E0LL; | |||
| static uint64_t r16Mask= 0xF800F800F800F800LL; | |||
| static uint64_t temp0; | |||
| // temporary storage for 4 yuv lines: | |||
| // 16bit for now (mmx likes it more compact) | |||
| static uint16_t pix_buf_y[4][2048]; | |||
| static uint16_t pix_buf_uv[2][2048*2]; | |||
| // clipping helper table for C implementations: | |||
| static unsigned char clip_table[768]; | |||
| // yuv->rgb conversion tables: | |||
| static int yuvtab_2568[256]; | |||
| static int yuvtab_3343[256]; | |||
| static int yuvtab_0c92[256]; | |||
| static int yuvtab_1a1e[256]; | |||
| static int yuvtab_40cf[256]; | |||
| static uint8_t funnyYCode[10000]; | |||
| static uint8_t funnyUVCode[10000]; | |||
| // *** bilinear scaling and yuv->rgb conversion of yv12 slices: | |||
| // *** Note: it's called multiple times while decoding a frame, first time y==0 | |||
| // *** Designed to upscale, but may work for downscale too. | |||
| @@ -64,27 +81,43 @@ void SwScale_YV12slice_brg24(unsigned char* srcptr[],int stride[], int y, int h, | |||
| //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height; | |||
| //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width; | |||
| unsigned int s_xinc2=s_xinc>>1; | |||
| unsigned int s_xinc2; | |||
| static int s_srcypos; | |||
| static int s_srcypos; // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) | |||
| static int s_ypos; | |||
| // last horzontally interpolated lines, used to avoid unnecessary calculations | |||
| static int s_last_ypos; | |||
| static int s_last_y1pos; | |||
| static int static_dstw; | |||
| #ifdef HAVE_MMX2 | |||
| // used to detect a horizontal size change | |||
| static int old_dstw= -1; | |||
| static int old_s_xinc= -1; | |||
| // difference between the requested xinc and the required one for the mmx2 routine | |||
| static int s_xinc_diff=0; | |||
| static int s_xinc2_diff=0; | |||
| #endif | |||
| int canMMX2BeUsed; | |||
| s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each other | |||
| // we need that precission at least for the mmx2 code | |||
| s_xinc*= 256; | |||
| s_xinc2=s_xinc>>1; | |||
| canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0) ? 1 : 0; | |||
| if(y==0){ | |||
| s_srcypos=-2*s_yinc; | |||
| s_ypos=-2; | |||
| s_last_ypos=-2; | |||
| s_srcypos= s_yinc/2 - 0x8000; | |||
| s_ypos=0; | |||
| // force calculation of the horizontal interpolation of the first line | |||
| s_last_ypos=-99; | |||
| s_last_y1pos=-99; | |||
| #ifdef HAVE_MMX2 | |||
| // cant downscale !!! | |||
| if(old_s_xinc != s_xinc || old_dstw!=dstw) | |||
| if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed) | |||
| { | |||
| uint8_t *fragment; | |||
| int imm8OfPShufW1; | |||
| @@ -102,32 +135,30 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||
| //code fragment | |||
| // fragmentLength=0; | |||
| // printf("%d, %d\n", fragmentLength,imm8OfPShufW1); | |||
| asm volatile( | |||
| "jmp 9f \n\t" | |||
| // Begin | |||
| "0: \n\t" | |||
| "movq (%%esi, %%ebx), %%mm0 \n\t" //FIXME Alignment | |||
| "movq (%%esi), %%mm0 \n\t" //FIXME Alignment | |||
| "movq %%mm0, %%mm1 \n\t" | |||
| "psrlq $8, %%mm0 \n\t" | |||
| "punpcklbw %%mm7, %%mm1 \n\t" | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "punpcklbw %%mm7, %%mm0 \n\t" | |||
| "addw %%bx, %%cx \n\t" //2*xalpha += (4*s_xinc)&0xFFFF | |||
| "pshufw $0xFF, %%mm1, %%mm1 \n\t" | |||
| "1: \n\t" | |||
| "adcl %%edx, %%esi \n\t" //xx+= (4*s_xinc)>>16 + carry | |||
| "pshufw $0xFF, %%mm0, %%mm0 \n\t" | |||
| "2: \n\t" | |||
| "psrlw $9, %%mm3 \n\t" | |||
| "psubw %%mm1, %%mm0 \n\t" | |||
| "psraw $1, %%mm0 \n\t" | |||
| "pmullw %%mm2, %%mm0 \n\t" | |||
| "pmullw %%mm3, %%mm0 \n\t" | |||
| "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF | |||
| "psllw $7, %%mm1 \n\t" | |||
| "paddw %%mm1, %%mm0 \n\t" | |||
| "movq %%mm0, (%%edi, %%eax) \n\t" | |||
| "paddb %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFF | |||
| "addb %%ch, %%cl \n\t" //2*xalpha += (4*s_xinc)&0xFF | |||
| "adcl %%edx, %%ebx \n\t" //xx+= (4*s_xinc)>>8 + carry | |||
| "movq %%mm0, (%%edi, %%eax) \n\t" | |||
| "addl $8, %%eax \n\t" | |||
| // End | |||
| @@ -147,17 +178,28 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||
| ); | |||
| xpos= xx=xalpha= 0; | |||
| //FIXME choose size and or xinc so that they fit exactly | |||
| /* choose xinc so that all 8 parts fit exactly | |||
| Note: we cannot use just 1 part because it would not fit in the code cache */ | |||
| s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))+10; | |||
| // s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8)); | |||
| #ifdef ALT_ERROR | |||
| s_xinc2_diff+= ((0x10000/(dstw/8))); | |||
| #endif | |||
| s_xinc_diff= s_xinc2_diff*2; | |||
| s_xinc2+= s_xinc2_diff; | |||
| s_xinc+= s_xinc_diff; | |||
| for(i=0; i<dstw/8; i++) | |||
| { | |||
| int xx=xpos>>8; | |||
| int xx=xpos>>16; | |||
| if((i&3) == 0) | |||
| { | |||
| int a=0; | |||
| int b=((xpos+s_xinc)>>8) - xx; | |||
| int c=((xpos+s_xinc*2)>>8) - xx; | |||
| int d=((xpos+s_xinc*3)>>8) - xx; | |||
| int b=((xpos+s_xinc)>>16) - xx; | |||
| int c=((xpos+s_xinc*2)>>16) - xx; | |||
| int d=((xpos+s_xinc*3)>>16) - xx; | |||
| memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength); | |||
| @@ -174,14 +216,14 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||
| //FIXME choose size and or xinc so that they fit exactly | |||
| for(i=0; i<dstw/8; i++) | |||
| { | |||
| int xx=xpos>>8; | |||
| int xx=xpos>>16; | |||
| if((i&3) == 0) | |||
| { | |||
| int a=0; | |||
| int b=((xpos+s_xinc2)>>8) - xx; | |||
| int c=((xpos+s_xinc2*2)>>8) - xx; | |||
| int d=((xpos+s_xinc2*3)>>8) - xx; | |||
| int b=((xpos+s_xinc2)>>16) - xx; | |||
| int c=((xpos+s_xinc2*2)>>16) - xx; | |||
| int d=((xpos+s_xinc2*3)>>16) - xx; | |||
| memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength); | |||
| @@ -197,86 +239,117 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||
| } | |||
| #endif | |||
| if(canMMX2BeUsed) | |||
| { | |||
| s_xinc+= s_xinc_diff; | |||
| s_xinc2+= s_xinc2_diff; | |||
| } | |||
| #endif // HAVE_MMX2 | |||
| } // reset counters | |||
| while(1){ | |||
| unsigned char *dest=dstptr+dststride*s_ypos; | |||
| int y0=2+(s_srcypos>>16); | |||
| int y1=1+(s_srcypos>>17); | |||
| int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line | |||
| // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) | |||
| int srcuvpos= s_srcypos + s_yinc/2 - 0x8000; | |||
| int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line | |||
| int yalpha=(s_srcypos&0xFFFF)>>7; | |||
| int yalpha1=yalpha^511; | |||
| int uvalpha=((s_srcypos>>1)&0xFFFF)>>7; | |||
| int uvalpha=(srcuvpos&0x1FFFF)>>8; | |||
| int uvalpha1=uvalpha^511; | |||
| uint16_t *buf0=pix_buf_y[y0&3]; | |||
| uint16_t *buf1=pix_buf_y[((y0+1)&3)]; | |||
| uint16_t *uvbuf0=pix_buf_uv[y1&1]; | |||
| uint16_t *uvbuf1=pix_buf_uv[(y1&1)^1]; | |||
| uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice | |||
| uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice | |||
| uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice | |||
| uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice | |||
| int i; | |||
| if(y0>=y+h) break; | |||
| // if this is before the first line than use only the first src line | |||
| if(y0==0) buf0= buf1; | |||
| if(y1==0) uvbuf0= uvbuf1; // yes we do have to check this, its not the same as y0==0 | |||
| if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are dupliactes anyway | |||
| // if this is after the last line than use only the last src line | |||
| if(y0>=y+h) | |||
| { | |||
| buf1= buf0; | |||
| s_last_ypos=y0; | |||
| } | |||
| if(y1>=(y+h)/2) | |||
| { | |||
| uvbuf1= uvbuf0; | |||
| s_last_y1pos=y1; | |||
| } | |||
| s_ypos++; s_srcypos+=s_yinc; | |||
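
A plain-C restatement of the new vertical setup above (my reading of the changed lines, not code from the tree; variable names follow the diff):

    /* s_srcypos is the dst line's center in the source, 16.16 fixed point;
       0 is the center of pixel (0,0) in the source */
    s_srcypos = s_yinc/2 - 0x8000;               /* initialisation at y==0 */

    /* per output line: */
    y0       = (s_srcypos + 0xFFFF) >> 16;       /* first luma src line below the dst line   */
    srcuvpos = s_srcypos + s_yinc/2 - 0x8000;    /* chroma position (half-height planes)     */
    y1       = (srcuvpos + 0x1FFFF) >> 17;       /* first chroma src line below the dst line */
    yalpha   = (s_srcypos & 0xFFFF) >> 7;        /* 9-bit blend weight between the two lines */
    uvalpha  = (srcuvpos  & 0x1FFFF) >> 8;
    s_ypos++;
    s_srcypos += s_yinc;
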
| //only interpolate the src line horizontally if we didnt do it allready | |||
| if(s_last_ypos!=y0){ | |||
| unsigned char *src=srcptr[0]+(y0-y)*stride[0]; | |||
| unsigned int xpos=0; | |||
| s_last_ypos=y0; | |||
| // *** horizontal scale Y line to temp buffer | |||
| // this loop should be rewritten in MMX assembly!!!! | |||
| #ifdef HAVE_MMX2 | |||
| asm volatile( | |||
| "pxor %%mm7, %%mm7 \n\t" | |||
| "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |||
| "movd %5, %%mm6 \n\t" // s_xinc&0xFF | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "movq %%mm6, %%mm2 \n\t" | |||
| "psllq $16, %%mm2 \n\t" | |||
| "paddb %%mm6, %%mm2 \n\t" | |||
| "psllq $16, %%mm2 \n\t" | |||
| "paddb %%mm6, %%mm2 \n\t" | |||
| "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF | |||
| "movq %%mm2, temp0 \n\t" | |||
| "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "xorl %%eax, %%eax \n\t" // i | |||
| "xorl %%ebx, %%ebx \n\t" // xx | |||
| "movl %0, %%esi \n\t" // src | |||
| "movl %1, %%edi \n\t" // buf1 | |||
| "movl %3, %%edx \n\t" // (s_xinc*4)>>8 | |||
| "xorl %%ecx, %%ecx \n\t" | |||
| "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF | |||
| // "int $3\n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyYCode \n\t" | |||
| :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>8), | |||
| "m" ((s_xinc*4)&0xFF), "m" (s_xinc&0xFF) | |||
| : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |||
| ); | |||
| #ifdef ARCH_X86 | |||
| #elif defined (ARCH_X86) | |||
| #ifdef HAVE_MMX2 | |||
| if(canMMX2BeUsed) | |||
| { | |||
| asm volatile( | |||
| "pxor %%mm7, %%mm7 \n\t" | |||
| "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |||
| "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "movq %%mm6, %%mm2 \n\t" | |||
| "psllq $16, %%mm2 \n\t" | |||
| "paddw %%mm6, %%mm2 \n\t" | |||
| "psllq $16, %%mm2 \n\t" | |||
| "paddw %%mm6, %%mm2 \n\t" | |||
| "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF | |||
| "movq %%mm2, temp0 \n\t" | |||
| "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "xorl %%eax, %%eax \n\t" // i | |||
| "movl %0, %%esi \n\t" // src | |||
| "movl %1, %%edi \n\t" // buf1 | |||
| "movl %3, %%edx \n\t" // (s_xinc*4)>>16 | |||
| "xorl %%ecx, %%ecx \n\t" | |||
| "xorl %%ebx, %%ebx \n\t" | |||
| "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF | |||
| // "int $3\n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorl %%ecx, %%ecx \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorl %%ecx, %%ecx \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorl %%ecx, %%ecx \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorl %%ecx, %%ecx \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorl %%ecx, %%ecx \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorl %%ecx, %%ecx \n\t" | |||
| "call funnyYCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorl %%ecx, %%ecx \n\t" | |||
| "call funnyYCode \n\t" | |||
| :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16), | |||
| "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF) | |||
| : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |||
| ); | |||
| } | |||
| else | |||
| { | |||
| #endif | |||
| //NO MMX just normal asm ... FIXME try/write funny MMX2 variant | |||
| //FIXME add prefetch | |||
| asm volatile( | |||
| @@ -288,24 +361,24 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||
| "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | |||
| "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |||
| "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |||
| "shll $8, %%edi \n\t" | |||
| "shll $16, %%edi \n\t" | |||
| "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |||
| "movl %1, %%edi \n\t" | |||
| "shrl $1, %%esi \n\t" | |||
| "shrl $9, %%esi \n\t" | |||
| "movw %%si, (%%edi, %%eax, 2) \n\t" | |||
| "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF | |||
| "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF | |||
| "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | |||
| "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | |||
| "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | |||
| "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |||
| "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |||
| "shll $8, %%edi \n\t" | |||
| "shll $16, %%edi \n\t" | |||
| "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |||
| "movl %1, %%edi \n\t" | |||
| "shrl $1, %%esi \n\t" | |||
| "shrl $9, %%esi \n\t" | |||
| "movw %%si, 2(%%edi, %%eax, 2) \n\t" | |||
| "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF | |||
| "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF | |||
| "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | |||
| @@ -314,106 +387,96 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||
| " jb 1b \n\t" | |||
| :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>8), "m" (s_xinc&0xFF) | |||
| :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>16), "m" (s_xinc&0xFFFF) | |||
| : "%eax", "%ebx", "%ecx", "%edi", "%esi" | |||
| ); | |||
| #ifdef HAVE_MMX2 | |||
| } //if MMX2 cant be used | |||
| #endif | |||
| #else | |||
| for(i=0;i<dstw;i++){ | |||
| register unsigned int xx=xpos>>8; | |||
| register unsigned int xalpha=(xpos&0xFF)>>1; | |||
| register unsigned int xx=xpos>>16; | |||
| register unsigned int xalpha=(xpos&0xFFFF)>>9; | |||
| buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha); | |||
| xpos+=s_xinc; | |||
| } | |||
| #endif | |||
| } | |||
| // *** horizontal scale U and V lines to temp buffer | |||
| if(!(y0&1)){ | |||
| if(s_last_y1pos!=y1){ | |||
| unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1]; | |||
| unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2]; | |||
| xpos=0; | |||
| // this loop should be rewritten in MMX assembly!!!! | |||
| int xpos=0; | |||
| s_last_y1pos= y1; | |||
| #ifdef ARCH_X86 | |||
| #ifdef HAVE_MMX2 | |||
| asm volatile( | |||
| if(canMMX2BeUsed) | |||
| { | |||
| asm volatile( | |||
| "pxor %%mm7, %%mm7 \n\t" | |||
| "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |||
| "movd %5, %%mm6 \n\t" // s_xinc&0xFF | |||
| "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "movq %%mm6, %%mm2 \n\t" | |||
| "psllq $16, %%mm2 \n\t" | |||
| "paddb %%mm6, %%mm2 \n\t" | |||
| "paddw %%mm6, %%mm2 \n\t" | |||
| "psllq $16, %%mm2 \n\t" | |||
| "paddb %%mm6, %%mm2 \n\t" | |||
| "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF | |||
| "paddw %%mm6, %%mm2 \n\t" | |||
| "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFFFF | |||
| "movq %%mm2, temp0 \n\t" | |||
| "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF | |||
| "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||
| "xorl %%eax, %%eax \n\t" // i | |||
| "xorl %%ebx, %%ebx \n\t" // xx | |||
| "movl %0, %%esi \n\t" // src | |||
| "movl %1, %%edi \n\t" // buf1 | |||
| "movl %3, %%edx \n\t" // (s_xinc*4)>>8 | |||
| "movl %3, %%edx \n\t" // (s_xinc*4)>>16 | |||
| "xorl %%ecx, %%ecx \n\t" | |||
| "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF | |||
| "xorl %%ebx, %%ebx \n\t" | |||
| "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF | |||
| // "int $3\n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| #define FUNNYUVCODE \ | |||
| "call funnyUVCode \n\t"\ | |||
| "movq temp0, %%mm2 \n\t"\ | |||
| "xorl %%ecx, %%ecx \n\t" | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| "xorl %%eax, %%eax \n\t" // i | |||
| "xorl %%ebx, %%ebx \n\t" // xx | |||
| "movl %6, %%esi \n\t" // src | |||
| "movl %1, %%edi \n\t" // buf1 | |||
| "addl $4096, %%edi \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| "movq temp0, %%mm2 \n\t" | |||
| "xorb %%cl, %%cl \n\t" | |||
| "call funnyUVCode \n\t" | |||
| :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>8), | |||
| "m" ((s_xinc2*4)&0xFF), "m" (s_xinc2&0xFF), "m" (src2) | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| FUNNYUVCODE | |||
| :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16), | |||
| "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2) | |||
| : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |||
| ); | |||
| #elif defined (ARCH_X86) | |||
| } | |||
| else | |||
| { | |||
| #endif | |||
| asm volatile( | |||
| "xorl %%eax, %%eax \n\t" // i | |||
| "xorl %%ebx, %%ebx \n\t" // xx | |||
| @@ -424,46 +487,48 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||
| "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] | |||
| "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |||
| "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |||
| "shll $8, %%edi \n\t" | |||
| "shll $16, %%edi \n\t" | |||
| "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |||
| "movl %1, %%edi \n\t" | |||
| "shrl $1, %%esi \n\t" | |||
| "shrl $9, %%esi \n\t" | |||
| "movw %%si, (%%edi, %%eax, 2) \n\t" | |||
| "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] | |||
| "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] | |||
| "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | |||
| "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | |||
| "shll $8, %%edi \n\t" | |||
| "shll $16, %%edi \n\t" | |||
| "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | |||
| "movl %1, %%edi \n\t" | |||
| "shrl $1, %%esi \n\t" | |||
| "shrl $9, %%esi \n\t" | |||
| "movw %%si, 4096(%%edi, %%eax, 2)\n\t" | |||
| "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF | |||
| "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF | |||
| "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | |||
| "addl $1, %%eax \n\t" | |||
| "cmpl %2, %%eax \n\t" | |||
| " jb 1b \n\t" | |||
| :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>8), "m" (s_xinc2&0xFF), | |||
| :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF), | |||
| "r" (src2) | |||
| : "%eax", "%ebx", "%ecx", "%edi", "%esi" | |||
| ); | |||
| #ifdef HAVE_MMX2 | |||
| } //if MMX2 cant be used | |||
| #endif | |||
| #else | |||
| for(i=0;i<dstw;i++){ | |||
| register unsigned int xx=xpos>>8; | |||
| register unsigned int xalpha=(xpos&0xFF)>>1; | |||
| for(i=0;i<dstw;i++){ | |||
| register unsigned int xx=xpos>>16; | |||
| register unsigned int xalpha=(xpos&0xFFFF)>>9; | |||
| uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | |||
| uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | |||
| xpos+=s_xinc2; | |||
| } | |||
| #endif | |||
| } | |||
| if(!y0) continue; | |||
| #endif | |||
| } | |||
| // Note1: this code can be resticted to n*8 (or n*16) width lines to simplify optimization... | |||
| // Re: Note1: ok n*4 for now | |||
| // Note2: instead of using lookup tabs, mmx version could do the multiply... | |||
| @@ -489,47 +554,47 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||
| "1: \n\t"\ | |||
| "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ | |||
| "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ | |||
| "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |||
| "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |||
| "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ | |||
| "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |||
| "psubw w10, %%mm1 \n\t" /* Y-16*/\ | |||
| "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ | |||
| "pmulhw yCoeff, %%mm1 \n\t"\ | |||
| \ | |||
| "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | |||
| "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | |||
| "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |||
| "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | |||
| "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |||
| "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | |||
| "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ | |||
| "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |||
| "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\ | |||
| "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |||
| "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ | |||
| "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | |||
| "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |||
| "psubw w10, %%mm1 \n\t" /* Y-16*/\ | |||
| "psubw w80, %%mm3 \n\t" /* (U-128)*/\ | |||
| "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ | |||
| "psllw $3, %%mm3 \n\t" /*(U-128)8*/\ | |||
| "pmulhw yCoeff, %%mm1 \n\t"\ | |||
| \ | |||
| \ | |||
| "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |||
| "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ | |||
| "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |||
| "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | |||
| "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |||
| "pmulhw ubCoeff, %%mm3 \n\t"\ | |||
| "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\ | |||
| "pmulhw ugCoeff, %%mm2 \n\t"\ | |||
| "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | |||
| "psubw w80, %%mm0 \n\t" /* (V-128)*/\ | |||
| "psllw $3, %%mm0 \n\t" /* (V-128)8*/\ | |||
| \ | |||
| "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |||
| "pmulhw ubCoeff, %%mm3 \n\t"\ | |||
| "paddw %%mm1, %%mm3 \n\t" /* B*/\ | |||
| \ | |||
| "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ | |||
| "pmulhw vrCoeff, %%mm0 \n\t"\ | |||
| "pmulhw vgCoeff, %%mm4 \n\t"\ | |||
| "paddw %%mm1, %%mm3 \n\t" /* B*/\ | |||
| "paddw %%mm1, %%mm0 \n\t" /* R*/\ | |||
| "packuswb %%mm3, %%mm3 \n\t"\ | |||
| \ | |||
| "pmulhw ugCoeff, %%mm2 \n\t"\ | |||
| "pmulhw vgCoeff, %%mm4 \n\t"\ | |||
| "packuswb %%mm0, %%mm0 \n\t"\ | |||
| "paddw %%mm4, %%mm2 \n\t"\ | |||
| "paddw %%mm2, %%mm1 \n\t" /* G*/\ | |||
| \ | |||
| "packuswb %%mm3, %%mm3 \n\t"\ | |||
| "packuswb %%mm0, %%mm0 \n\t"\ | |||
| "packuswb %%mm1, %%mm1 \n\t" | |||
| YSCALEYUV2RGB | |||
| @@ -610,9 +675,11 @@ YSCALEYUV2RGB | |||
| asm volatile( | |||
| YSCALEYUV2RGB | |||
| #ifdef DITHER16BPP | |||
| "paddusb g16Dither, %%mm1 \n\t" | |||
| "paddusb b16Dither, %%mm0 \n\t" | |||
| "paddusb b16Dither, %%mm3 \n\t" | |||
| #endif | |||
| "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | |||
| "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | |||
| "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | |||
| @@ -699,8 +766,6 @@ YSCALEYUV2RGB | |||
| #elif defined (HAVE_MMX) | |||
| asm volatile("emms"); | |||
| #endif | |||
| } | |||
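
For completeness, a self-contained C version of the non-assembly horizontal scaler after the 8.8 -> 16.16 precision change (a sketch matching the C fallback loops in the diff; the function wrapper and names are mine):

    #include <stdint.h>

    /* bilinearly scale one 8-bit source line to dstw 16-bit samples;
       s_xinc is the 16.16 fixed-point step, weights are 7 bit as in the diff */
    static void h_scale_line_c(uint16_t *dst, const unsigned char *src,
                               int dstw, unsigned int s_xinc)
    {
        unsigned int xpos = 0;
        int i;
        for (i = 0; i < dstw; i++) {
            unsigned int xx     = xpos >> 16;            /* integer source position */
            unsigned int xalpha = (xpos & 0xFFFF) >> 9;  /* 7-bit fractional weight  */
            /* note: like the original, this reads src[xx+1] one sample past the end
               for the last pixel (the known rightmost-line bug mentioned above) */
            dst[i] = src[xx] * (xalpha ^ 127) + src[xx + 1] * xalpha;
            xpos += s_xinc;
        }
    }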