32, 24, 16, 15 bit support in C (only 32 & 16 tested)
32, 16 bit support in MMX (16 with dithering)
ranges of some variables changed so MMX likes them more

Originally committed as revision 2233 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
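The 15/16 bpp paths pack each pixel with shifts and masks, and the MMX 16 bpp path adds a small alternating bias (with saturation) before the low bits are truncated, which gives a cheap ordered dither. As an illustration only, not code from this commit, a minimal C sketch of that packing step:

#include <stdint.h>

/* Pack one RGB pixel into RGB565; bias_rb/bias_g are small per-line dither
 * offsets added with clamping before truncation, the same idea as the
 * paddusb b16Dither/g16Dither steps in the MMX path below. */
static inline uint16_t pack_rgb565(int r, int g, int b, int bias_rb, int bias_g)
{
    r += bias_rb; if (r > 255) r = 255;
    g += bias_g;  if (g > 255) g = 255;
    b += bias_rb; if (b > 255) b = 255;
    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}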
@@ -1,9 +1,18 @@
// Software scaling and colorspace conversion routines for MPlayer
#include <inttypes.h>
#include "../config.h"
#undef HAVE_MMX2 //code is buggy
//#undef HAVE_MMX
#define RET 0xC3 //near return opcode
// temporary storage for 4 yuv lines:
static unsigned int pix_buf_y[4][2048];
static unsigned int pix_buf_uv[2][2048*2];
// 16bit for now (mmx likes it more compact)
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];
// clipping helper table for C implementations:
static unsigned char clip_table[768];
@@ -15,6 +24,31 @@ static int yuvtab_0c92[256];
static int yuvtab_1a1e[256];
static int yuvtab_40cf[256];
static uint64_t yCoeff= 0x2568256825682568LL;
static uint64_t ubCoeff= 0x3343334333433343LL;
static uint64_t vrCoeff= 0x40cf40cf40cf40cfLL;
static uint64_t ugCoeff= 0xE5E2E5E2E5E2E5E2LL;
static uint64_t vgCoeff= 0xF36EF36EF36EF36ELL;
static uint64_t w80= 0x0080008000800080LL;
static uint64_t w10= 0x0010001000100010LL;
static uint64_t b16Dither= 0x0004000400040004LL;
static uint64_t b16Dither1=0x0004000400040004LL;
static uint64_t b16Dither2=0x0602060206020602LL;
static uint64_t g16Dither= 0x0002000200020002LL;
static uint64_t g16Dither1=0x0002000200020002LL;
static uint64_t g16Dither2=0x0301030103010301LL;
static uint64_t b16Mask= 0x001F001F001F001FLL;
static uint64_t g16Mask= 0x07E007E007E007E0LL;
static uint64_t r16Mask= 0xF800F800F800F800LL;
static uint64_t temp0;
static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
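The packed 16-bit constants above (yCoeff, ubCoeff, ...) are used with pmulhw, which keeps only the high 16 bits of each signed 16-bit product, so each constant acts as a fixed-point multiplier of value/65536 per lane. A minimal single-lane C sketch of that operation, illustration only and not code from this commit:

#include <stdint.h>

/* pmulhw on one 16-bit lane: signed multiply, keep the high word. */
static inline int16_t pmulhw_lane(int16_t a, int16_t coeff)
{
    return (int16_t)(((int32_t)a * (int32_t)coeff) >> 16);
}

/* Example: the luma term in YSCALEYUV2RGB below is built as ((Y-16)<<3)
 * scaled by yCoeff/65536, i.e. roughly (Y-16)*0x2568/8192 per pixel. */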
// *** bilinear scaling and yuv->rgb conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
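Only the function body appears in this excerpt, so the entry point's signature is not visible here. Purely as an illustration of the slice protocol the note above describes, with all names hypothetical, a caller drives it roughly like this:

/* Hypothetical sketch of the per-slice calling pattern: the scaler is called
 * once per decoded YV12 slice, top to bottom, starting with y == 0 so it can
 * reset its counters on the first call. */
static void scale_frame_in_slices(int src_h, int slice_h)
{
    int y;
    for (y = 0; y < src_h; y += slice_h) {
        int h = (y + slice_h <= src_h) ? slice_h : (src_h - y);
        /* scale_yv12_slice(srcptr, stride, y, h, dstptr, dststride, dstw, dstbpp);
         * (the real call and its signature are outside this excerpt) */
        (void)h;
    }
}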
@@ -33,25 +67,149 @@ unsigned int s_xinc2=s_xinc>>1;
static int s_srcypos;
static int s_ypos;
static int s_last_ypos;
static int static_dstw;
#ifdef HAVE_MMX2
static int old_dstw= -1;
static int old_s_xinc= -1;
#endif
s_xinc&= -2; //clear last bit, or uv and y might be shifted relative to each other
if(y==0){
s_srcypos=-2*s_yinc;
s_ypos=-2;
s_last_ypos=-2;
#ifdef HAVE_MMX2
// can't downscale !!!
if(old_s_xinc != s_xinc || old_dstw!=dstw)
{
uint8_t *fragment;
int imm8OfPShufW1;
int imm8OfPShufW2;
int fragmentLength;
int xpos, xx, xalpha, i;
old_s_xinc= s_xinc;
old_dstw= dstw;
static_dstw= dstw;
// create an optimized horizontal scaling routine
//code fragment
// fragmentLength=0;
// printf("%d, %d\n", fragmentLength,imm8OfPShufW1);
asm volatile(
"jmp 9f \n\t"
// Begin
"0: \n\t"
"movq (%%esi, %%ebx), %%mm0 \n\t" //FIXME Alignment
"movq %%mm0, %%mm1 \n\t"
"psrlq $8, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm1, %%mm1 \n\t"
"1: \n\t"
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
"psubw %%mm1, %%mm0 \n\t"
"psraw $1, %%mm0 \n\t"
"pmullw %%mm2, %%mm0 \n\t"
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"movq %%mm0, (%%edi, %%eax) \n\t"
"paddb %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFF
"addb %%ch, %%cl \n\t" //2*xalpha += (4*s_xinc)&0xFF
"adcl %%edx, %%ebx \n\t" //xx+= (4*s_xinc)>>8 + carry
"addl $8, %%eax \n\t"
// End
"9: \n\t"
// "int $3\n\t"
"leal 0b, %0 \n\t"
"leal 1b, %1 \n\t"
"leal 2b, %2 \n\t"
"decl %1 \n\t"
"decl %2 \n\t"
"subl %0, %1 \n\t"
"subl %0, %2 \n\t"
"leal 9b, %3 \n\t"
"subl %0, %3 \n\t"
:"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
"=r" (fragmentLength)
);
xpos= xx=xalpha= 0;
//FIXME choose size and/or xinc so that they fit exactly
for(i=0; i<dstw/8; i++)
{
int xx=xpos>>8;
if((i&3) == 0)
{
int a=0;
int b=((xpos+s_xinc)>>8) - xx;
int c=((xpos+s_xinc*2)>>8) - xx;
int d=((xpos+s_xinc*3)>>8) - xx;
memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);
funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
a | (b<<2) | (c<<4) | (d<<6);
funnyYCode[fragmentLength*(i+4)/4]= RET;
}
xpos+=s_xinc;
}
xpos= xx=xalpha= 0;
//FIXME choose size and/or xinc so that they fit exactly
for(i=0; i<dstw/8; i++)
{
int xx=xpos>>8;
if((i&3) == 0)
{
int a=0;
int b=((xpos+s_xinc2)>>8) - xx;
int c=((xpos+s_xinc2*2)>>8) - xx;
int d=((xpos+s_xinc2*3)>>8) - xx;
memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);
funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
a | (b<<2) | (c<<4) | (d<<6);
funnyUVCode[fragmentLength*(i+4)/4]= RET;
}
xpos+=s_xinc2;
}
// funnyCode[0]= RET;
}
#endif
} // reset counters
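The block above builds the horizontal scaler at run time: the template asm fragment is copied once per group of four output pixels, and the pshufw immediate inside each copy is patched so the shuffle picks the right source words; the four per-pixel source offsets a..d are encoded two bits each. A minimal C sketch of that patch step, illustration only with a made-up helper name:

#include <stdint.h>
#include <string.h>

/* Copy the template fragment and overwrite the two recorded pshufw immediate
 * bytes with the selector built from the four source-word offsets (2 bits each). */
static void patch_fragment(uint8_t *dst, const uint8_t *fragment, int fragmentLength,
                           int imm8OfPShufW1, int imm8OfPShufW2,
                           int a, int b, int c, int d)
{
    memcpy(dst, fragment, (size_t)fragmentLength);
    dst[imm8OfPShufW1] = dst[imm8OfPShufW2] =
        (uint8_t)(a | (b << 2) | (c << 4) | (d << 6));
}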
while(1){
unsigned char *dest=dstptr+dststride*s_ypos;
int y0=2+(s_srcypos>>16);
int y1=1+(s_srcypos>>17);
int yalpha=(s_srcypos&0xFFFF)>>8;
int yalpha1=yalpha^255;
int uvalpha=((s_srcypos>>1)&0xFFFF)>>8;
int uvalpha1=uvalpha^255;
unsigned int *buf0=pix_buf_y[y0&3];
unsigned int *buf1=pix_buf_y[((y0+1)&3)];
unsigned int *uvbuf0=pix_buf_uv[y1&1];
unsigned int *uvbuf1=pix_buf_uv[(y1&1)^1];
int yalpha=(s_srcypos&0xFFFF)>>7;
int yalpha1=yalpha^511;
int uvalpha=((s_srcypos>>1)&0xFFFF)>>7;
int uvalpha1=uvalpha^511;
uint16_t *buf0=pix_buf_y[y0&3];
uint16_t *buf1=pix_buf_y[((y0+1)&3)];
uint16_t *uvbuf0=pix_buf_uv[y1&1];
uint16_t *uvbuf1=pix_buf_uv[(y1&1)^1];
int i;
if(y0>=y+h) break;
@@ -64,54 +222,423 @@ static int s_last_ypos;
s_last_ypos=y0;
// *** horizontal scale Y line to temp buffer
// this loop should be rewritten in MMX assembly!!!!
#ifdef HAVE_MMX2
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"pxor %%mm2, %%mm2 \n\t" // 2*xalpha
"movd %5, %%mm6 \n\t" // s_xinc&0xFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"movq %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddb %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddb %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF
"movq %%mm2, temp0 \n\t"
"movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
"movl %0, %%esi \n\t" // src
"movl %1, %%edi \n\t" // buf1
"movl %3, %%edx \n\t" // (s_xinc*4)>>8
"xorl %%ecx, %%ecx \n\t"
"movb %4, %%ch \n\t" // (s_xinc*4)&0xFF
// "int $3\n\t"
"call funnyYCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyYCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyYCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyYCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyYCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyYCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyYCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyYCode \n\t"
:: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>8),
"m" ((s_xinc*4)&0xFF), "m" (s_xinc&0xFF)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
#elif defined (ARCH_X86)
//NO MMX just normal asm ... FIXME try/write funny MMX2 variant
//FIXME add prefetch
asm volatile(
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
"1: \n\t"
"movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $8, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"shrl $1, %%esi \n\t"
"movw %%si, (%1, %%eax, 2) \n\t"
"addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF
"adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry
"movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $8, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"shrl $1, %%esi \n\t"
"movw %%si, 2(%1, %%eax, 2) \n\t"
"addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF
"adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry
"addl $2, %%eax \n\t"
"cmpl %2, %%eax \n\t"
" jb 1b \n\t"
:: "r" (src), "r" (buf1), "m" (dstw), "m" (s_xinc>>8), "m" (s_xinc&0xFF)
: "%eax", "%ebx", "%ecx", "%edi", "%esi"
);
#else
for(i=0;i<dstw;i++){
register unsigned int xx=xpos>>8;
register unsigned int xalpha=xpos&0xFF;
buf1[i]=(src[xx]*(xalpha^255)+src[xx+1]*xalpha);
register unsigned int xalpha=(xpos&0xFF)>>1;
buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha);
xpos+=s_xinc;
}
#endif
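In the asm above the 24.8 fixed-point source position is kept split: the fractional byte lives in cl and the integer byte offset in ebx, and the addb/adcl pair propagates the carry from the fraction into the offset. The same stepping written in plain C, illustration only with a made-up helper name:

#include <stdint.h>

/* Bilinear horizontal scale of one line; xinc is the 24.8 fixed-point step. */
static void hscale_line_c(uint16_t *dst, const unsigned char *src, int dstw, unsigned xinc)
{
    unsigned xx = 0, xalpha = 0;   /* integer byte offset + 8-bit fraction */
    int i;
    for (i = 0; i < dstw; i++) {
        int v = src[xx]*(256 - (int)xalpha) + src[xx+1]*(int)xalpha;
        dst[i] = (uint16_t)(v >> 1);              /* 15-bit value, as the asm stores */
        xalpha += xinc & 0xFF;                    /* addb: step the fraction */
        xx     += (xinc >> 8) + (xalpha >> 8);    /* adcl: integer step plus carry */
        xalpha &= 0xFF;
    }
}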
// *** horizontal scale U and V lines to temp buffer
if(!(y0&1)){
unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1];
unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2];
xpos=0;
// this loop should be rewritten in MMX assembly!!!!
#ifdef HAVE_MMX2
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"pxor %%mm2, %%mm2 \n\t" // 2*xalpha
"movd %5, %%mm6 \n\t" // s_xinc&0xFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"movq %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddb %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddb %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF
"movq %%mm2, temp0 \n\t"
"movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
"movl %0, %%esi \n\t" // src
"movl %1, %%edi \n\t" // buf1
"movl %3, %%edx \n\t" // (s_xinc*4)>>8
"xorl %%ecx, %%ecx \n\t"
"movb %4, %%ch \n\t" // (s_xinc*4)&0xFF
// "int $3\n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
"movl %6, %%esi \n\t" // src
"movl %1, %%edi \n\t" // buf1
"addl $4096, %%edi \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
:: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>8),
"m" ((s_xinc2*4)&0xFF), "m" (s_xinc2&0xFF), "m" (src2)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
#elif defined (ARCH_X86)
//NO MMX just normal asm ... FIXME try/write funny MMX2 variant
asm volatile(
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
"1: \n\t"
"movzbl (%0, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $8, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $1, %%esi \n\t"
"movw %%si, (%%edi, %%eax, 2) \n\t"
"movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $8, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $1, %%esi \n\t"
"movw %%si, 4096(%%edi, %%eax, 2)\n\t"
"addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF
"adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry
"addl $1, %%eax \n\t"
"cmpl %2, %%eax \n\t"
" jb 1b \n\t"
:: "r" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>8), "m" (s_xinc2&0xFF),
"r" (src2)
: "%eax", "%ebx", "%ecx", "%edi", "%esi"
);
#else
for(i=0;i<dstw;i++){
register unsigned int xx=xpos>>8;
register unsigned int xalpha=xpos&0xFF;
uvbuf1[i]=(src1[xx]*(xalpha^255)+src1[xx+1]*xalpha);
uvbuf1[i+2048]=(src2[xx]*(xalpha^255)+src2[xx+1]*xalpha);
register unsigned int xalpha=(xpos&0xFF)>>1;
uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
xpos+=s_xinc2;
}
#endif
}
if(!y0) continue;
}
// this loop should be rewritten in MMX assembly!!!!
// Note1: this code can be restricted to n*8 (or n*16) width lines to simplify optimization...
// Re: Note1: ok n*4 for now
// Note2: instead of using lookup tabs, mmx version could do the multiply...
// Re: Note2: yep
// Note3: maybe we should make separate 15/16, 24 and 32bpp versions of this:
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
#if 1
// 24/32 bpp
dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
// Re: done (32 & 16) and 16 has dithering :) but 16 is untested
#ifdef HAVE_MMX
//FIXME write lq version with less uv ...
//FIXME reorder / optimize
if(dstbpp == 4)
{
asm volatile(
#define YSCALEYUV2RGB \
"pxor %%mm7, %%mm7 \n\t"\
"movd %6, %%mm6 \n\t" /*yalpha1*/\
"punpcklwd %%mm6, %%mm6 \n\t"\
"punpcklwd %%mm6, %%mm6 \n\t"\
"movd %7, %%mm5 \n\t" /*uvalpha1*/\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"xorl %%eax, %%eax \n\t"\
"1: \n\t"\
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"psubw w10, %%mm1 \n\t" /* Y-16*/\
"psllw $3, %%mm1 \n\t" /* (y-16)*8*/\
"pmulhw yCoeff, %%mm1 \n\t"\
\
"movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"psubw w80, %%mm3 \n\t" /* (U-128)*/\
"psllw $3, %%mm3 \n\t" /*(U-128)8*/\
\
"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw w80, %%mm0 \n\t" /* (V-128)*/\
"psllw $3, %%mm0 \n\t" /* (V-128)8*/\
\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw ubCoeff, %%mm3 \n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
\
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
"pmulhw vrCoeff, %%mm0 \n\t"\
"paddw %%mm1, %%mm0 \n\t" /* R*/\
\
"pmulhw ugCoeff, %%mm2 \n\t"\
"pmulhw vgCoeff, %%mm4 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
"packuswb %%mm3, %%mm3 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"
YSCALEYUV2RGB
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0
"movq %%mm3, %%mm1 \n\t"
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0
#ifdef HAVE_MMX2
"movntq %%mm3, (%4, %%eax, 4) \n\t"
"movntq %%mm1, 8(%4, %%eax, 4) \n\t"
#else
unsigned short *d=dest;
unsigned int b=clip_table[((Y + yuvtab_3343[U]) >>13)];
unsigned int g=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
unsigned int r=clip_table[((Y + yuvtab_40cf[V]) >>13)];
// d[0]=((r>>3)<<10)|((g>>3)<<5)|((b>>3)); // 15bpp
d[0]=((r>>3)<<11)|((g>>2)<<5)|((b>>3)); // 16bpp
"movq %%mm3, (%4, %%eax, 4) \n\t"
"movq %%mm1, 8(%4, %%eax, 4) \n\t"
#endif
dest+=dstbpp;
}
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
}
else if(dstbpp==2)
{
asm volatile(
YSCALEYUV2RGB
"paddusb g16Dither, %%mm1 \n\t"
"paddusb b16Dither, %%mm0 \n\t"
"paddusb b16Dither, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
"psrlw $3, %%mm3 \n\t"
"psllw $3, %%mm1 \n\t"
"psllw $8, %%mm0 \n\t"
"pand g16Mask, %%mm1 \n\t"
"pand r16Mask, %%mm0 \n\t"
"por %%mm3, %%mm1 \n\t"
"por %%mm1, %%mm0 \n\t"
#ifdef HAVE_MMX2
"movntq %%mm0, (%4, %%eax, 2) \n\t"
#else
"movq %%mm0, (%4, %%eax, 2) \n\t"
#endif
"addl $4, %%eax \n\t"
"cmpl %5, %%eax \n\t"
" jb 1b \n\t"
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
"m" (yalpha1), "m" (uvalpha1)
: "%eax"
);
}
#else
if(dstbpp==4 || dstbpp==3)
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
dest+=dstbpp;
}
}
else if(dstbpp==2) //16bit
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
((uint16_t*)dest)[0] =
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
dest+=dstbpp;
}
}
else if(dstbpp==2) //15bit FIXME how do I figure out if it's 15 or 16?
{
for(i=0;i<dstw;i++){
// vertical linear interpolation && yuv2rgb in a single step:
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
((uint16_t*)dest)[0] =
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
dest+=dstbpp;
}
}
#endif
b16Dither= b16Dither1;
b16Dither1= b16Dither2;
b16Dither2= b16Dither;
g16Dither= g16Dither1;
g16Dither1= g16Dither2;
g16Dither2= g16Dither;
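The three-way swap above alternates the packed dither words between two patterns from one output line to the next, so together they act as a small ordered-dither matrix. Reading the byte lanes of the constants declared at the top of the file, the per-pixel bias added before the 5/6/5 truncation is roughly (illustration only):

#include <stdint.h>

/* Bias added per pixel before truncation, alternating by output line:
 * B and R keep 5 bits, G keeps 6, so G gets half the bias. */
static const uint8_t dither_rb[2][4] = { { 4, 0, 4, 0 }, { 2, 6, 2, 6 } };
static const uint8_t dither_g [2][4] = { { 2, 0, 2, 0 }, { 1, 3, 1, 3 } };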
}
}