moved mmx variables to the top to avoid alignment issues
mmx2 code should work fine now, if and only if the input width is a multiple of 16 and the output width is a multiple of 32
reordered some code (about 5% faster with a simple -benchmark)
first-line bug fixed (I hope I didn't introduce any new bugs with that ...)
changed a lot of the vertical scale setup code; I hope I fixed something and didn't mess it up :)
a few known bugs left (the rightmost line is wrong)
MMX2 code will only be used for upscaling and acceptable widths
16-bit dithering can be disabled

Originally committed as revision 2265 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc
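Before the diff itself, a minimal C sketch of the gating the message describes, assuming the variable names from the diff (s_xinc, dstw, canMMX2BeUsed); the wrapper function itself is hypothetical and not part of the commit. The horizontal step arrives in 8.8 fixed point, is widened to 16.16, and the MMX2 scaler is only taken when upscaling with a destination width that is a multiple of 32; the input-width-multiple-of-16 requirement from the message is a precondition that this check does not verify.

/* Hedged sketch only -- mirrors the setup visible in the diff below;
   the wrapper function is hypothetical. */
static int can_use_mmx2(unsigned int s_xinc, int dstw)
{
    s_xinc &= -2;   /* clear the low bit so luma and chroma stay in phase */
    s_xinc *= 256;  /* widen 8.8 -> 16.16; the MMX2 code needs this precision */
    /* step <= 1.0 in 16.16 means upscaling; dstw must be a multiple of 32 */
    return s_xinc <= 0x10000 && (dstw & 31) == 0;
}

In the diff this check becomes the new canMMX2BeUsed flag; when it is zero the code falls back to the plain x86 assembly or C horizontal scaler.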
| @@ -1,28 +1,29 @@ | |||||
| // Software scaling and colorspace conversion routines for MPlayer | // Software scaling and colorspace conversion routines for MPlayer | ||||
| // Orginal C implementation by ? | |||||
| // current version mostly by Michael Niedermayer (michaelni@gmx.at) | |||||
| #include <inttypes.h> | #include <inttypes.h> | ||||
| #include "../config.h" | #include "../config.h" | ||||
| #undef HAVE_MMX2 //code is buggy | |||||
| //#undef HAVE_MMX2 | |||||
| //#undef HAVE_MMX | //#undef HAVE_MMX | ||||
| //#undef ARCH_X86 | |||||
| #define DITHER16BPP | |||||
| #define ALT_ERROR | |||||
| #define RET 0xC3 //near return opcode | #define RET 0xC3 //near return opcode | ||||
| /* | |||||
| NOTES | |||||
| // temporary storage for 4 yuv lines: | |||||
| // 16bit for now (mmx likes it more compact) | |||||
| static uint16_t pix_buf_y[4][2048]; | |||||
| static uint16_t pix_buf_uv[2][2048*2]; | |||||
| known BUGS with known cause (no bugreports please!) | |||||
| line at the right (c,asm and mmx2) | |||||
| code reads 1 sample too much (might cause a sig11) | |||||
| // clipping helper table for C implementations: | |||||
| static unsigned char clip_table[768]; | |||||
| // yuv->rgb conversion tables: | |||||
| static int yuvtab_2568[256]; | |||||
| static int yuvtab_3343[256]; | |||||
| static int yuvtab_0c92[256]; | |||||
| static int yuvtab_1a1e[256]; | |||||
| static int yuvtab_40cf[256]; | |||||
| TODO | |||||
| check alignment off everything | |||||
| */ | |||||
| static uint64_t yCoeff= 0x2568256825682568LL; | static uint64_t yCoeff= 0x2568256825682568LL; | ||||
| static uint64_t ubCoeff= 0x3343334333433343LL; | static uint64_t ubCoeff= 0x3343334333433343LL; | ||||
| @@ -46,11 +47,27 @@ static uint64_t g16Mask= 0x07E007E007E007E0LL; | |||||
| static uint64_t r16Mask= 0xF800F800F800F800LL; | static uint64_t r16Mask= 0xF800F800F800F800LL; | ||||
| static uint64_t temp0; | static uint64_t temp0; | ||||
| // temporary storage for 4 yuv lines: | |||||
| // 16bit for now (mmx likes it more compact) | |||||
| static uint16_t pix_buf_y[4][2048]; | |||||
| static uint16_t pix_buf_uv[2][2048*2]; | |||||
| // clipping helper table for C implementations: | |||||
| static unsigned char clip_table[768]; | |||||
| // yuv->rgb conversion tables: | |||||
| static int yuvtab_2568[256]; | |||||
| static int yuvtab_3343[256]; | |||||
| static int yuvtab_0c92[256]; | |||||
| static int yuvtab_1a1e[256]; | |||||
| static int yuvtab_40cf[256]; | |||||
| static uint8_t funnyYCode[10000]; | static uint8_t funnyYCode[10000]; | ||||
| static uint8_t funnyUVCode[10000]; | static uint8_t funnyUVCode[10000]; | ||||
| // *** bilinear scaling and yuv->rgb conversion of yv12 slices: | // *** bilinear scaling and yuv->rgb conversion of yv12 slices: | ||||
| // *** Note: it's called multiple times while decoding a frame, first time y==0 | // *** Note: it's called multiple times while decoding a frame, first time y==0 | ||||
| // *** Designed to upscale, but may work for downscale too. | // *** Designed to upscale, but may work for downscale too. | ||||
| @@ -64,27 +81,43 @@ void SwScale_YV12slice_brg24(unsigned char* srcptr[],int stride[], int y, int h, | |||||
| //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height; | //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height; | ||||
| //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width; | //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width; | ||||
| unsigned int s_xinc2=s_xinc>>1; | |||||
| unsigned int s_xinc2; | |||||
| static int s_srcypos; | |||||
| static int s_srcypos; // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) | |||||
| static int s_ypos; | static int s_ypos; | ||||
| // last horzontally interpolated lines, used to avoid unnecessary calculations | |||||
| static int s_last_ypos; | static int s_last_ypos; | ||||
| static int s_last_y1pos; | |||||
| static int static_dstw; | static int static_dstw; | ||||
| #ifdef HAVE_MMX2 | #ifdef HAVE_MMX2 | ||||
| // used to detect a horizontal size change | |||||
| static int old_dstw= -1; | static int old_dstw= -1; | ||||
| static int old_s_xinc= -1; | static int old_s_xinc= -1; | ||||
| // difference between the requested xinc and the required one for the mmx2 routine | |||||
| static int s_xinc_diff=0; | |||||
| static int s_xinc2_diff=0; | |||||
| #endif | #endif | ||||
| int canMMX2BeUsed; | |||||
| s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each other | |||||
| // we need that precission at least for the mmx2 code | |||||
| s_xinc*= 256; | |||||
| s_xinc2=s_xinc>>1; | |||||
| canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0) ? 1 : 0; | |||||
| if(y==0){ | if(y==0){ | ||||
| s_srcypos=-2*s_yinc; | |||||
| s_ypos=-2; | |||||
| s_last_ypos=-2; | |||||
| s_srcypos= s_yinc/2 - 0x8000; | |||||
| s_ypos=0; | |||||
| // force calculation of the horizontal interpolation of the first line | |||||
| s_last_ypos=-99; | |||||
| s_last_y1pos=-99; | |||||
| #ifdef HAVE_MMX2 | #ifdef HAVE_MMX2 | ||||
| // cant downscale !!! | // cant downscale !!! | ||||
| if(old_s_xinc != s_xinc || old_dstw!=dstw) | |||||
| if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed) | |||||
| { | { | ||||
| uint8_t *fragment; | uint8_t *fragment; | ||||
| int imm8OfPShufW1; | int imm8OfPShufW1; | ||||
| @@ -102,32 +135,30 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| //code fragment | //code fragment | ||||
| // fragmentLength=0; | |||||
| // printf("%d, %d\n", fragmentLength,imm8OfPShufW1); | |||||
| asm volatile( | asm volatile( | ||||
| "jmp 9f \n\t" | "jmp 9f \n\t" | ||||
| // Begin | // Begin | ||||
| "0: \n\t" | "0: \n\t" | ||||
| "movq (%%esi, %%ebx), %%mm0 \n\t" //FIXME Alignment | |||||
| "movq (%%esi), %%mm0 \n\t" //FIXME Alignment | |||||
| "movq %%mm0, %%mm1 \n\t" | "movq %%mm0, %%mm1 \n\t" | ||||
| "psrlq $8, %%mm0 \n\t" | "psrlq $8, %%mm0 \n\t" | ||||
| "punpcklbw %%mm7, %%mm1 \n\t" | "punpcklbw %%mm7, %%mm1 \n\t" | ||||
| "movq %%mm2, %%mm3 \n\t" | |||||
| "punpcklbw %%mm7, %%mm0 \n\t" | "punpcklbw %%mm7, %%mm0 \n\t" | ||||
| "addw %%bx, %%cx \n\t" //2*xalpha += (4*s_xinc)&0xFFFF | |||||
| "pshufw $0xFF, %%mm1, %%mm1 \n\t" | "pshufw $0xFF, %%mm1, %%mm1 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| "adcl %%edx, %%esi \n\t" //xx+= (4*s_xinc)>>16 + carry | |||||
| "pshufw $0xFF, %%mm0, %%mm0 \n\t" | "pshufw $0xFF, %%mm0, %%mm0 \n\t" | ||||
| "2: \n\t" | "2: \n\t" | ||||
| "psrlw $9, %%mm3 \n\t" | |||||
| "psubw %%mm1, %%mm0 \n\t" | "psubw %%mm1, %%mm0 \n\t" | ||||
| "psraw $1, %%mm0 \n\t" | |||||
| "pmullw %%mm2, %%mm0 \n\t" | |||||
| "pmullw %%mm3, %%mm0 \n\t" | |||||
| "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF | |||||
| "psllw $7, %%mm1 \n\t" | "psllw $7, %%mm1 \n\t" | ||||
| "paddw %%mm1, %%mm0 \n\t" | "paddw %%mm1, %%mm0 \n\t" | ||||
| "movq %%mm0, (%%edi, %%eax) \n\t" | |||||
| "paddb %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFF | |||||
| "addb %%ch, %%cl \n\t" //2*xalpha += (4*s_xinc)&0xFF | |||||
| "adcl %%edx, %%ebx \n\t" //xx+= (4*s_xinc)>>8 + carry | |||||
| "movq %%mm0, (%%edi, %%eax) \n\t" | |||||
| "addl $8, %%eax \n\t" | "addl $8, %%eax \n\t" | ||||
| // End | // End | ||||
| @@ -147,17 +178,28 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| ); | ); | ||||
| xpos= xx=xalpha= 0; | xpos= xx=xalpha= 0; | ||||
| //FIXME choose size and or xinc so that they fit exactly | |||||
| /* choose xinc so that all 8 parts fit exactly | |||||
| Note: we cannot use just 1 part because it would not fit in the code cache */ | |||||
| s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))+10; | |||||
| // s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8)); | |||||
| #ifdef ALT_ERROR | |||||
| s_xinc2_diff+= ((0x10000/(dstw/8))); | |||||
| #endif | |||||
| s_xinc_diff= s_xinc2_diff*2; | |||||
| s_xinc2+= s_xinc2_diff; | |||||
| s_xinc+= s_xinc_diff; | |||||
| for(i=0; i<dstw/8; i++) | for(i=0; i<dstw/8; i++) | ||||
| { | { | ||||
| int xx=xpos>>8; | |||||
| int xx=xpos>>16; | |||||
| if((i&3) == 0) | if((i&3) == 0) | ||||
| { | { | ||||
| int a=0; | int a=0; | ||||
| int b=((xpos+s_xinc)>>8) - xx; | |||||
| int c=((xpos+s_xinc*2)>>8) - xx; | |||||
| int d=((xpos+s_xinc*3)>>8) - xx; | |||||
| int b=((xpos+s_xinc)>>16) - xx; | |||||
| int c=((xpos+s_xinc*2)>>16) - xx; | |||||
| int d=((xpos+s_xinc*3)>>16) - xx; | |||||
| memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength); | memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength); | ||||
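The xinc correction a few lines above ("choose xinc so that all 8 parts fit exactly") is easier to see in isolation. The following is a hedged restatement of that adjustment, not the committed code; the reading of its intent (spreading the accumulated fractional error over the dstw/8-pixel parts so the generated code covers the line without drifting) is mine, and the +10 / ALT_ERROR bias is simply kept as it appears in the diff.

#define ALT_ERROR

/* Hedged restatement of the error-spreading adjustment from the hunk above.
   parts = dstw/8 is the number of generated code parts; the step is nudged
   so the fractional error accumulated over one part roughly cancels. */
static void adjust_xinc(unsigned int *s_xinc, unsigned int *s_xinc2, int dstw)
{
    int parts = dstw / 8;
    int diff2 = -(int)(((*s_xinc2 * parts) & 0xFFFF) / parts) + 10;
#ifdef ALT_ERROR
    diff2 += 0x10000 / parts;   /* alternative error distribution, as in the diff */
#endif
    *s_xinc2 += diff2;          /* chroma step */
    *s_xinc  += diff2 * 2;      /* luma step is twice the chroma step */
}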
| @@ -174,14 +216,14 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| //FIXME choose size and or xinc so that they fit exactly | //FIXME choose size and or xinc so that they fit exactly | ||||
| for(i=0; i<dstw/8; i++) | for(i=0; i<dstw/8; i++) | ||||
| { | { | ||||
| int xx=xpos>>8; | |||||
| int xx=xpos>>16; | |||||
| if((i&3) == 0) | if((i&3) == 0) | ||||
| { | { | ||||
| int a=0; | int a=0; | ||||
| int b=((xpos+s_xinc2)>>8) - xx; | |||||
| int c=((xpos+s_xinc2*2)>>8) - xx; | |||||
| int d=((xpos+s_xinc2*3)>>8) - xx; | |||||
| int b=((xpos+s_xinc2)>>16) - xx; | |||||
| int c=((xpos+s_xinc2*2)>>16) - xx; | |||||
| int d=((xpos+s_xinc2*3)>>16) - xx; | |||||
| memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength); | memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength); | ||||
| @@ -197,86 +239,117 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| } | } | ||||
| #endif | |||||
| if(canMMX2BeUsed) | |||||
| { | |||||
| s_xinc+= s_xinc_diff; | |||||
| s_xinc2+= s_xinc2_diff; | |||||
| } | |||||
| #endif // HAVE_MMX2 | |||||
| } // reset counters | } // reset counters | ||||
| while(1){ | while(1){ | ||||
| unsigned char *dest=dstptr+dststride*s_ypos; | unsigned char *dest=dstptr+dststride*s_ypos; | ||||
| int y0=2+(s_srcypos>>16); | |||||
| int y1=1+(s_srcypos>>17); | |||||
| int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line | |||||
| // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) | |||||
| int srcuvpos= s_srcypos + s_yinc/2 - 0x8000; | |||||
| int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line | |||||
| int yalpha=(s_srcypos&0xFFFF)>>7; | int yalpha=(s_srcypos&0xFFFF)>>7; | ||||
| int yalpha1=yalpha^511; | int yalpha1=yalpha^511; | ||||
| int uvalpha=((s_srcypos>>1)&0xFFFF)>>7; | |||||
| int uvalpha=(srcuvpos&0x1FFFF)>>8; | |||||
| int uvalpha1=uvalpha^511; | int uvalpha1=uvalpha^511; | ||||
| uint16_t *buf0=pix_buf_y[y0&3]; | |||||
| uint16_t *buf1=pix_buf_y[((y0+1)&3)]; | |||||
| uint16_t *uvbuf0=pix_buf_uv[y1&1]; | |||||
| uint16_t *uvbuf1=pix_buf_uv[(y1&1)^1]; | |||||
| uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice | |||||
| uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice | |||||
| uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice | |||||
| uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice | |||||
| int i; | int i; | ||||
| if(y0>=y+h) break; | |||||
| // if this is before the first line than use only the first src line | |||||
| if(y0==0) buf0= buf1; | |||||
| if(y1==0) uvbuf0= uvbuf1; // yes we do have to check this, its not the same as y0==0 | |||||
| if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are dupliactes anyway | |||||
| // if this is after the last line than use only the last src line | |||||
| if(y0>=y+h) | |||||
| { | |||||
| buf1= buf0; | |||||
| s_last_ypos=y0; | |||||
| } | |||||
| if(y1>=(y+h)/2) | |||||
| { | |||||
| uvbuf1= uvbuf0; | |||||
| s_last_y1pos=y1; | |||||
| } | |||||
| s_ypos++; s_srcypos+=s_yinc; | s_ypos++; s_srcypos+=s_yinc; | ||||
| //only interpolate the src line horizontally if we didnt do it allready | |||||
| if(s_last_ypos!=y0){ | if(s_last_ypos!=y0){ | ||||
| unsigned char *src=srcptr[0]+(y0-y)*stride[0]; | unsigned char *src=srcptr[0]+(y0-y)*stride[0]; | ||||
| unsigned int xpos=0; | unsigned int xpos=0; | ||||
| s_last_ypos=y0; | s_last_ypos=y0; | ||||
| // *** horizontal scale Y line to temp buffer | // *** horizontal scale Y line to temp buffer | ||||
| // this loop should be rewritten in MMX assembly!!!! | |||||
| #ifdef HAVE_MMX2 | |||||
| asm volatile( | |||||
| "pxor %%mm7, %%mm7 \n\t" | |||||
| "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |||||
| "movd %5, %%mm6 \n\t" // s_xinc&0xFF | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "movq %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" | |||||
| "paddb %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" | |||||
| "paddb %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF | |||||
| "movq %%mm2, temp0 \n\t" | |||||
| "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "xorl %%eax, %%eax \n\t" // i | |||||
| "xorl %%ebx, %%ebx \n\t" // xx | |||||
| "movl %0, %%esi \n\t" // src | |||||
| "movl %1, %%edi \n\t" // buf1 | |||||
| "movl %3, %%edx \n\t" // (s_xinc*4)>>8 | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF | |||||
| // "int $3\n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>8), | |||||
| "m" ((s_xinc*4)&0xFF), "m" (s_xinc&0xFF) | |||||
| : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |||||
| ); | |||||
| #ifdef ARCH_X86 | |||||
| #elif defined (ARCH_X86) | |||||
| #ifdef HAVE_MMX2 | |||||
| if(canMMX2BeUsed) | |||||
| { | |||||
| asm volatile( | |||||
| "pxor %%mm7, %%mm7 \n\t" | |||||
| "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |||||
| "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "movq %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" | |||||
| "paddw %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" | |||||
| "paddw %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF | |||||
| "movq %%mm2, temp0 \n\t" | |||||
| "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "xorl %%eax, %%eax \n\t" // i | |||||
| "movl %0, %%esi \n\t" // src | |||||
| "movl %1, %%edi \n\t" // buf1 | |||||
| "movl %3, %%edx \n\t" // (s_xinc*4)>>16 | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "xorl %%ebx, %%ebx \n\t" | |||||
| "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF | |||||
| // "int $3\n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16), | |||||
| "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF) | |||||
| : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |||||
| ); | |||||
| } | |||||
| else | |||||
| { | |||||
| #endif | |||||
| //NO MMX just normal asm ... FIXME try/write funny MMX2 variant | //NO MMX just normal asm ... FIXME try/write funny MMX2 variant | ||||
| //FIXME add prefetch | //FIXME add prefetch | ||||
| asm volatile( | asm volatile( | ||||
| @@ -288,24 +361,24 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | ||||
| "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | ||||
| "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | ||||
| "shll $8, %%edi \n\t" | |||||
| "shll $16, %%edi \n\t" | |||||
| "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | ||||
| "movl %1, %%edi \n\t" | "movl %1, %%edi \n\t" | ||||
| "shrl $1, %%esi \n\t" | |||||
| "shrl $9, %%esi \n\t" | |||||
| "movw %%si, (%%edi, %%eax, 2) \n\t" | "movw %%si, (%%edi, %%eax, 2) \n\t" | ||||
| "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF | |||||
| "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF | |||||
| "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | ||||
| "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | ||||
| "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | ||||
| "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | ||||
| "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | ||||
| "shll $8, %%edi \n\t" | |||||
| "shll $16, %%edi \n\t" | |||||
| "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | ||||
| "movl %1, %%edi \n\t" | "movl %1, %%edi \n\t" | ||||
| "shrl $1, %%esi \n\t" | |||||
| "shrl $9, %%esi \n\t" | |||||
| "movw %%si, 2(%%edi, %%eax, 2) \n\t" | "movw %%si, 2(%%edi, %%eax, 2) \n\t" | ||||
| "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF | |||||
| "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF | |||||
| "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | ||||
| @@ -314,106 +387,96 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| " jb 1b \n\t" | " jb 1b \n\t" | ||||
| :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>8), "m" (s_xinc&0xFF) | |||||
| :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>16), "m" (s_xinc&0xFFFF) | |||||
| : "%eax", "%ebx", "%ecx", "%edi", "%esi" | : "%eax", "%ebx", "%ecx", "%edi", "%esi" | ||||
| ); | ); | ||||
| #ifdef HAVE_MMX2 | |||||
| } //if MMX2 cant be used | |||||
| #endif | |||||
| #else | #else | ||||
| for(i=0;i<dstw;i++){ | for(i=0;i<dstw;i++){ | ||||
| register unsigned int xx=xpos>>8; | |||||
| register unsigned int xalpha=(xpos&0xFF)>>1; | |||||
| register unsigned int xx=xpos>>16; | |||||
| register unsigned int xalpha=(xpos&0xFFFF)>>9; | |||||
| buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha); | buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha); | ||||
| xpos+=s_xinc; | xpos+=s_xinc; | ||||
| } | } | ||||
| #endif | #endif | ||||
| } | |||||
| // *** horizontal scale U and V lines to temp buffer | // *** horizontal scale U and V lines to temp buffer | ||||
| if(!(y0&1)){ | |||||
| if(s_last_y1pos!=y1){ | |||||
| unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1]; | unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1]; | ||||
| unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2]; | unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2]; | ||||
| xpos=0; | |||||
| // this loop should be rewritten in MMX assembly!!!! | |||||
| int xpos=0; | |||||
| s_last_y1pos= y1; | |||||
| #ifdef ARCH_X86 | |||||
| #ifdef HAVE_MMX2 | #ifdef HAVE_MMX2 | ||||
| asm volatile( | |||||
| if(canMMX2BeUsed) | |||||
| { | |||||
| asm volatile( | |||||
| "pxor %%mm7, %%mm7 \n\t" | "pxor %%mm7, %%mm7 \n\t" | ||||
| "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | ||||
| "movd %5, %%mm6 \n\t" // s_xinc&0xFF | |||||
| "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | "punpcklwd %%mm6, %%mm6 \n\t" | ||||
| "punpcklwd %%mm6, %%mm6 \n\t" | "punpcklwd %%mm6, %%mm6 \n\t" | ||||
| "movq %%mm6, %%mm2 \n\t" | "movq %%mm6, %%mm2 \n\t" | ||||
| "psllq $16, %%mm2 \n\t" | "psllq $16, %%mm2 \n\t" | ||||
| "paddb %%mm6, %%mm2 \n\t" | |||||
| "paddw %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" | "psllq $16, %%mm2 \n\t" | ||||
| "paddb %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF | |||||
| "paddw %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFFFF | |||||
| "movq %%mm2, temp0 \n\t" | "movq %%mm2, temp0 \n\t" | ||||
| "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF | |||||
| "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | "punpcklwd %%mm6, %%mm6 \n\t" | ||||
| "punpcklwd %%mm6, %%mm6 \n\t" | "punpcklwd %%mm6, %%mm6 \n\t" | ||||
| "xorl %%eax, %%eax \n\t" // i | "xorl %%eax, %%eax \n\t" // i | ||||
| "xorl %%ebx, %%ebx \n\t" // xx | |||||
| "movl %0, %%esi \n\t" // src | "movl %0, %%esi \n\t" // src | ||||
| "movl %1, %%edi \n\t" // buf1 | "movl %1, %%edi \n\t" // buf1 | ||||
| "movl %3, %%edx \n\t" // (s_xinc*4)>>8 | |||||
| "movl %3, %%edx \n\t" // (s_xinc*4)>>16 | |||||
| "xorl %%ecx, %%ecx \n\t" | "xorl %%ecx, %%ecx \n\t" | ||||
| "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF | |||||
| "xorl %%ebx, %%ebx \n\t" | |||||
| "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF | |||||
| // "int $3\n\t" | // "int $3\n\t" | ||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| #define FUNNYUVCODE \ | |||||
| "call funnyUVCode \n\t"\ | |||||
| "movq temp0, %%mm2 \n\t"\ | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| "xorl %%eax, %%eax \n\t" // i | "xorl %%eax, %%eax \n\t" // i | ||||
| "xorl %%ebx, %%ebx \n\t" // xx | |||||
| "movl %6, %%esi \n\t" // src | "movl %6, %%esi \n\t" // src | ||||
| "movl %1, %%edi \n\t" // buf1 | "movl %1, %%edi \n\t" // buf1 | ||||
| "addl $4096, %%edi \n\t" | "addl $4096, %%edi \n\t" | ||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyUVCode \n\t" | |||||
| :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>8), | |||||
| "m" ((s_xinc2*4)&0xFF), "m" (s_xinc2&0xFF), "m" (src2) | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| FUNNYUVCODE | |||||
| :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16), | |||||
| "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2) | |||||
| : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | ||||
| ); | ); | ||||
| #elif defined (ARCH_X86) | |||||
| } | |||||
| else | |||||
| { | |||||
| #endif | |||||
| asm volatile( | asm volatile( | ||||
| "xorl %%eax, %%eax \n\t" // i | "xorl %%eax, %%eax \n\t" // i | ||||
| "xorl %%ebx, %%ebx \n\t" // xx | "xorl %%ebx, %%ebx \n\t" // xx | ||||
| @@ -424,46 +487,48 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] | "movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1] | ||||
| "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | ||||
| "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | ||||
| "shll $8, %%edi \n\t" | |||||
| "shll $16, %%edi \n\t" | |||||
| "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | ||||
| "movl %1, %%edi \n\t" | "movl %1, %%edi \n\t" | ||||
| "shrl $1, %%esi \n\t" | |||||
| "shrl $9, %%esi \n\t" | |||||
| "movw %%si, (%%edi, %%eax, 2) \n\t" | "movw %%si, (%%edi, %%eax, 2) \n\t" | ||||
| "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] | "movzbl (%5, %%ebx), %%edi \n\t" //src[xx] | ||||
| "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] | "movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1] | ||||
| "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | ||||
| "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | ||||
| "shll $8, %%edi \n\t" | |||||
| "shll $16, %%edi \n\t" | |||||
| "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | ||||
| "movl %1, %%edi \n\t" | "movl %1, %%edi \n\t" | ||||
| "shrl $1, %%esi \n\t" | |||||
| "shrl $9, %%esi \n\t" | |||||
| "movw %%si, 4096(%%edi, %%eax, 2)\n\t" | "movw %%si, 4096(%%edi, %%eax, 2)\n\t" | ||||
| "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF | |||||
| "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF | |||||
| "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | ||||
| "addl $1, %%eax \n\t" | "addl $1, %%eax \n\t" | ||||
| "cmpl %2, %%eax \n\t" | "cmpl %2, %%eax \n\t" | ||||
| " jb 1b \n\t" | " jb 1b \n\t" | ||||
| :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>8), "m" (s_xinc2&0xFF), | |||||
| :: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF), | |||||
| "r" (src2) | "r" (src2) | ||||
| : "%eax", "%ebx", "%ecx", "%edi", "%esi" | : "%eax", "%ebx", "%ecx", "%edi", "%esi" | ||||
| ); | ); | ||||
| #ifdef HAVE_MMX2 | |||||
| } //if MMX2 cant be used | |||||
| #endif | |||||
| #else | #else | ||||
| for(i=0;i<dstw;i++){ | |||||
| register unsigned int xx=xpos>>8; | |||||
| register unsigned int xalpha=(xpos&0xFF)>>1; | |||||
| for(i=0;i<dstw;i++){ | |||||
| register unsigned int xx=xpos>>16; | |||||
| register unsigned int xalpha=(xpos&0xFFFF)>>9; | |||||
| uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); | ||||
| uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha); | ||||
| xpos+=s_xinc2; | xpos+=s_xinc2; | ||||
| } | |||||
| #endif | |||||
| } | } | ||||
| if(!y0) continue; | |||||
| #endif | |||||
| } | } | ||||
| // Note1: this code can be resticted to n*8 (or n*16) width lines to simplify optimization... | // Note1: this code can be resticted to n*8 (or n*16) width lines to simplify optimization... | ||||
| // Re: Note1: ok n*4 for now | // Re: Note1: ok n*4 for now | ||||
| // Note2: instead of using lookup tabs, mmx version could do the multiply... | // Note2: instead of using lookup tabs, mmx version could do the multiply... | ||||
| @@ -489,47 +554,47 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| "1: \n\t"\ | "1: \n\t"\ | ||||
| "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ | "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ | ||||
| "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ | "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ | ||||
| "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |||||
| "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |||||
| "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ | |||||
| "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |||||
| "psubw w10, %%mm1 \n\t" /* Y-16*/\ | |||||
| "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ | |||||
| "pmulhw yCoeff, %%mm1 \n\t"\ | |||||
| \ | |||||
| "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ | ||||
| "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ | ||||
| "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ | |||||
| "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\ | ||||
| "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ | |||||
| "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\ | ||||
| "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\ | |||||
| "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |||||
| "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\ | "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\ | ||||
| "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ | |||||
| "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ | |||||
| "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\ | ||||
| "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |||||
| "psubw w10, %%mm1 \n\t" /* Y-16*/\ | |||||
| "psubw w80, %%mm3 \n\t" /* (U-128)*/\ | "psubw w80, %%mm3 \n\t" /* (U-128)*/\ | ||||
| "psllw $3, %%mm1 \n\t" /* (y-16)*8*/\ | |||||
| "psllw $3, %%mm3 \n\t" /*(U-128)8*/\ | "psllw $3, %%mm3 \n\t" /*(U-128)8*/\ | ||||
| "pmulhw yCoeff, %%mm1 \n\t"\ | |||||
| \ | |||||
| \ | \ | ||||
| "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ | |||||
| "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ | |||||
| "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\ | |||||
| "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\ | ||||
| "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |||||
| "pmulhw ubCoeff, %%mm3 \n\t"\ | |||||
| "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\ | "psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\ | ||||
| "pmulhw ugCoeff, %%mm2 \n\t"\ | |||||
| "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\ | ||||
| "psubw w80, %%mm0 \n\t" /* (V-128)*/\ | "psubw w80, %%mm0 \n\t" /* (V-128)*/\ | ||||
| "psllw $3, %%mm0 \n\t" /* (V-128)8*/\ | "psllw $3, %%mm0 \n\t" /* (V-128)8*/\ | ||||
| \ | \ | ||||
| "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\ | |||||
| "pmulhw ubCoeff, %%mm3 \n\t"\ | |||||
| "paddw %%mm1, %%mm3 \n\t" /* B*/\ | |||||
| \ | \ | ||||
| "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ | "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\ | ||||
| "pmulhw vrCoeff, %%mm0 \n\t"\ | "pmulhw vrCoeff, %%mm0 \n\t"\ | ||||
| "pmulhw vgCoeff, %%mm4 \n\t"\ | |||||
| "paddw %%mm1, %%mm3 \n\t" /* B*/\ | |||||
| "paddw %%mm1, %%mm0 \n\t" /* R*/\ | "paddw %%mm1, %%mm0 \n\t" /* R*/\ | ||||
| "packuswb %%mm3, %%mm3 \n\t"\ | |||||
| \ | \ | ||||
| "pmulhw ugCoeff, %%mm2 \n\t"\ | |||||
| "pmulhw vgCoeff, %%mm4 \n\t"\ | |||||
| "packuswb %%mm0, %%mm0 \n\t"\ | |||||
| "paddw %%mm4, %%mm2 \n\t"\ | "paddw %%mm4, %%mm2 \n\t"\ | ||||
| "paddw %%mm2, %%mm1 \n\t" /* G*/\ | "paddw %%mm2, %%mm1 \n\t" /* G*/\ | ||||
| \ | \ | ||||
| "packuswb %%mm3, %%mm3 \n\t"\ | |||||
| "packuswb %%mm0, %%mm0 \n\t"\ | |||||
| "packuswb %%mm1, %%mm1 \n\t" | "packuswb %%mm1, %%mm1 \n\t" | ||||
| YSCALEYUV2RGB | YSCALEYUV2RGB | ||||
| @@ -610,9 +675,11 @@ YSCALEYUV2RGB | |||||
| asm volatile( | asm volatile( | ||||
| YSCALEYUV2RGB | YSCALEYUV2RGB | ||||
| #ifdef DITHER16BPP | |||||
| "paddusb g16Dither, %%mm1 \n\t" | "paddusb g16Dither, %%mm1 \n\t" | ||||
| "paddusb b16Dither, %%mm0 \n\t" | "paddusb b16Dither, %%mm0 \n\t" | ||||
| "paddusb b16Dither, %%mm3 \n\t" | "paddusb b16Dither, %%mm3 \n\t" | ||||
| #endif | |||||
| "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | "punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G | ||||
| "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | "punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B | ||||
| "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | "punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R | ||||
| @@ -699,8 +766,6 @@ YSCALEYUV2RGB | |||||
| #elif defined (HAVE_MMX) | #elif defined (HAVE_MMX) | ||||
| asm volatile("emms"); | asm volatile("emms"); | ||||
| #endif | #endif | ||||
| } | } | ||||
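Since the commit message says the vertical scale setup was reworked, here is a hedged C restatement of the new per-output-line index math from the diff above (16.16 fixed point, "center of pixel 0,0" convention). The standalone struct and function are illustrative only; the names s_srcypos, s_yinc, y0, y1, yalpha and uvalpha follow the diff.

/* Hedged restatement of the vertical setup from the diff above; the struct
   and function are illustrative, the arithmetic mirrors the new code. */
struct vline_setup {
    int y0;       /* first luma source line below the destination line   */
    int y1;       /* first chroma source line below the destination line */
    int yalpha;   /* fractional luma position, scaled to 0..511          */
    int uvalpha;  /* fractional chroma position, scaled to 0..511        */
};

static struct vline_setup vline_setup_for(int s_srcypos, int s_yinc)
{
    struct vline_setup v;
    /* chroma position derived from the luma position, as in the diff */
    int srcuvpos = s_srcypos + s_yinc / 2 - 0x8000;
    v.y0      = (s_srcypos + 0xFFFF) >> 16;   /* round up to the next source line  */
    v.y1      = (srcuvpos + 0x1FFFF) >> 17;   /* same, at half vertical resolution */
    v.yalpha  = (s_srcypos & 0xFFFF) >> 7;    /* blend weight between buffered lines */
    v.uvalpha = (srcuvpos & 0x1FFFF) >> 8;
    return v;
}

On the first slice (y==0) the diff initialises s_srcypos to s_yinc/2 - 0x8000, which, as far as I can tell, centre-aligns the destination sampling grid on the source grid.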
| @@ -1,28 +1,29 @@ | |||||
| // Software scaling and colorspace conversion routines for MPlayer | // Software scaling and colorspace conversion routines for MPlayer | ||||
| // Orginal C implementation by ? | |||||
| // current version mostly by Michael Niedermayer (michaelni@gmx.at) | |||||
| #include <inttypes.h> | #include <inttypes.h> | ||||
| #include "../config.h" | #include "../config.h" | ||||
| #undef HAVE_MMX2 //code is buggy | |||||
| //#undef HAVE_MMX2 | |||||
| //#undef HAVE_MMX | //#undef HAVE_MMX | ||||
| //#undef ARCH_X86 | |||||
| #define DITHER16BPP | |||||
| #define ALT_ERROR | |||||
| #define RET 0xC3 //near return opcode | #define RET 0xC3 //near return opcode | ||||
| /* | |||||
| NOTES | |||||
| // temporary storage for 4 yuv lines: | |||||
| // 16bit for now (mmx likes it more compact) | |||||
| static uint16_t pix_buf_y[4][2048]; | |||||
| static uint16_t pix_buf_uv[2][2048*2]; | |||||
| known BUGS with known cause (no bugreports please!) | |||||
| line at the right (c,asm and mmx2) | |||||
| code reads 1 sample too much (might cause a sig11) | |||||
| // clipping helper table for C implementations: | |||||
| static unsigned char clip_table[768]; | |||||
| // yuv->rgb conversion tables: | |||||
| static int yuvtab_2568[256]; | |||||
| static int yuvtab_3343[256]; | |||||
| static int yuvtab_0c92[256]; | |||||
| static int yuvtab_1a1e[256]; | |||||
| static int yuvtab_40cf[256]; | |||||
| TODO | |||||
| check alignment off everything | |||||
| */ | |||||
| static uint64_t yCoeff= 0x2568256825682568LL; | static uint64_t yCoeff= 0x2568256825682568LL; | ||||
| static uint64_t ubCoeff= 0x3343334333433343LL; | static uint64_t ubCoeff= 0x3343334333433343LL; | ||||
| @@ -46,11 +47,27 @@ static uint64_t g16Mask= 0x07E007E007E007E0LL; | |||||
| static uint64_t r16Mask= 0xF800F800F800F800LL; | static uint64_t r16Mask= 0xF800F800F800F800LL; | ||||
| static uint64_t temp0; | static uint64_t temp0; | ||||
| // temporary storage for 4 yuv lines: | |||||
| // 16bit for now (mmx likes it more compact) | |||||
| static uint16_t pix_buf_y[4][2048]; | |||||
| static uint16_t pix_buf_uv[2][2048*2]; | |||||
| // clipping helper table for C implementations: | |||||
| static unsigned char clip_table[768]; | |||||
| // yuv->rgb conversion tables: | |||||
| static int yuvtab_2568[256]; | |||||
| static int yuvtab_3343[256]; | |||||
| static int yuvtab_0c92[256]; | |||||
| static int yuvtab_1a1e[256]; | |||||
| static int yuvtab_40cf[256]; | |||||
| static uint8_t funnyYCode[10000]; | static uint8_t funnyYCode[10000]; | ||||
| static uint8_t funnyUVCode[10000]; | static uint8_t funnyUVCode[10000]; | ||||
| // *** bilinear scaling and yuv->rgb conversion of yv12 slices: | // *** bilinear scaling and yuv->rgb conversion of yv12 slices: | ||||
| // *** Note: it's called multiple times while decoding a frame, first time y==0 | // *** Note: it's called multiple times while decoding a frame, first time y==0 | ||||
| // *** Designed to upscale, but may work for downscale too. | // *** Designed to upscale, but may work for downscale too. | ||||
| @@ -64,27 +81,43 @@ void SwScale_YV12slice_brg24(unsigned char* srcptr[],int stride[], int y, int h, | |||||
| //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height; | //static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height; | ||||
| //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width; | //static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width; | ||||
| unsigned int s_xinc2=s_xinc>>1; | |||||
| unsigned int s_xinc2; | |||||
| static int s_srcypos; | |||||
| static int s_srcypos; // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) | |||||
| static int s_ypos; | static int s_ypos; | ||||
| // last horzontally interpolated lines, used to avoid unnecessary calculations | |||||
| static int s_last_ypos; | static int s_last_ypos; | ||||
| static int s_last_y1pos; | |||||
| static int static_dstw; | static int static_dstw; | ||||
| #ifdef HAVE_MMX2 | #ifdef HAVE_MMX2 | ||||
| // used to detect a horizontal size change | |||||
| static int old_dstw= -1; | static int old_dstw= -1; | ||||
| static int old_s_xinc= -1; | static int old_s_xinc= -1; | ||||
| // difference between the requested xinc and the required one for the mmx2 routine | |||||
| static int s_xinc_diff=0; | |||||
| static int s_xinc2_diff=0; | |||||
| #endif | #endif | ||||
| int canMMX2BeUsed; | |||||
| s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each other | |||||
| // we need that precission at least for the mmx2 code | |||||
| s_xinc*= 256; | |||||
| s_xinc2=s_xinc>>1; | |||||
| canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0) ? 1 : 0; | |||||
| if(y==0){ | if(y==0){ | ||||
| s_srcypos=-2*s_yinc; | |||||
| s_ypos=-2; | |||||
| s_last_ypos=-2; | |||||
| s_srcypos= s_yinc/2 - 0x8000; | |||||
| s_ypos=0; | |||||
| // force calculation of the horizontal interpolation of the first line | |||||
| s_last_ypos=-99; | |||||
| s_last_y1pos=-99; | |||||
| #ifdef HAVE_MMX2 | #ifdef HAVE_MMX2 | ||||
| // cant downscale !!! | // cant downscale !!! | ||||
| if(old_s_xinc != s_xinc || old_dstw!=dstw) | |||||
| if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed) | |||||
| { | { | ||||
| uint8_t *fragment; | uint8_t *fragment; | ||||
| int imm8OfPShufW1; | int imm8OfPShufW1; | ||||
| @@ -102,32 +135,30 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| //code fragment | //code fragment | ||||
| // fragmentLength=0; | |||||
| // printf("%d, %d\n", fragmentLength,imm8OfPShufW1); | |||||
| asm volatile( | asm volatile( | ||||
| "jmp 9f \n\t" | "jmp 9f \n\t" | ||||
| // Begin | // Begin | ||||
| "0: \n\t" | "0: \n\t" | ||||
| "movq (%%esi, %%ebx), %%mm0 \n\t" //FIXME Alignment | |||||
| "movq (%%esi), %%mm0 \n\t" //FIXME Alignment | |||||
| "movq %%mm0, %%mm1 \n\t" | "movq %%mm0, %%mm1 \n\t" | ||||
| "psrlq $8, %%mm0 \n\t" | "psrlq $8, %%mm0 \n\t" | ||||
| "punpcklbw %%mm7, %%mm1 \n\t" | "punpcklbw %%mm7, %%mm1 \n\t" | ||||
| "movq %%mm2, %%mm3 \n\t" | |||||
| "punpcklbw %%mm7, %%mm0 \n\t" | "punpcklbw %%mm7, %%mm0 \n\t" | ||||
| "addw %%bx, %%cx \n\t" //2*xalpha += (4*s_xinc)&0xFFFF | |||||
| "pshufw $0xFF, %%mm1, %%mm1 \n\t" | "pshufw $0xFF, %%mm1, %%mm1 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| "adcl %%edx, %%esi \n\t" //xx+= (4*s_xinc)>>16 + carry | |||||
| "pshufw $0xFF, %%mm0, %%mm0 \n\t" | "pshufw $0xFF, %%mm0, %%mm0 \n\t" | ||||
| "2: \n\t" | "2: \n\t" | ||||
| "psrlw $9, %%mm3 \n\t" | |||||
| "psubw %%mm1, %%mm0 \n\t" | "psubw %%mm1, %%mm0 \n\t" | ||||
| "psraw $1, %%mm0 \n\t" | |||||
| "pmullw %%mm2, %%mm0 \n\t" | |||||
| "pmullw %%mm3, %%mm0 \n\t" | |||||
| "paddw %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFFFF | |||||
| "psllw $7, %%mm1 \n\t" | "psllw $7, %%mm1 \n\t" | ||||
| "paddw %%mm1, %%mm0 \n\t" | "paddw %%mm1, %%mm0 \n\t" | ||||
| "movq %%mm0, (%%edi, %%eax) \n\t" | |||||
| "paddb %%mm6, %%mm2 \n\t" // 2*alpha += xpos&0xFF | |||||
| "addb %%ch, %%cl \n\t" //2*xalpha += (4*s_xinc)&0xFF | |||||
| "adcl %%edx, %%ebx \n\t" //xx+= (4*s_xinc)>>8 + carry | |||||
| "movq %%mm0, (%%edi, %%eax) \n\t" | |||||
| "addl $8, %%eax \n\t" | "addl $8, %%eax \n\t" | ||||
| // End | // End | ||||
| @@ -147,17 +178,28 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| ); | ); | ||||
| xpos= xx=xalpha= 0; | xpos= xx=xalpha= 0; | ||||
| //FIXME choose size and or xinc so that they fit exactly | |||||
| /* choose xinc so that all 8 parts fit exactly | |||||
| Note: we cannot use just 1 part because it would not fit in the code cache */ | |||||
| s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))+10; | |||||
| // s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8)); | |||||
| #ifdef ALT_ERROR | |||||
| s_xinc2_diff+= ((0x10000/(dstw/8))); | |||||
| #endif | |||||
| s_xinc_diff= s_xinc2_diff*2; | |||||
| s_xinc2+= s_xinc2_diff; | |||||
| s_xinc+= s_xinc_diff; | |||||
| for(i=0; i<dstw/8; i++) | for(i=0; i<dstw/8; i++) | ||||
| { | { | ||||
| int xx=xpos>>8; | |||||
| int xx=xpos>>16; | |||||
| if((i&3) == 0) | if((i&3) == 0) | ||||
| { | { | ||||
| int a=0; | int a=0; | ||||
| int b=((xpos+s_xinc)>>8) - xx; | |||||
| int c=((xpos+s_xinc*2)>>8) - xx; | |||||
| int d=((xpos+s_xinc*3)>>8) - xx; | |||||
| int b=((xpos+s_xinc)>>16) - xx; | |||||
| int c=((xpos+s_xinc*2)>>16) - xx; | |||||
| int d=((xpos+s_xinc*3)>>16) - xx; | |||||
| memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength); | memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength); | ||||
| @@ -174,14 +216,14 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| //FIXME choose size and or xinc so that they fit exactly | //FIXME choose size and or xinc so that they fit exactly | ||||
| for(i=0; i<dstw/8; i++) | for(i=0; i<dstw/8; i++) | ||||
| { | { | ||||
| int xx=xpos>>8; | |||||
| int xx=xpos>>16; | |||||
| if((i&3) == 0) | if((i&3) == 0) | ||||
| { | { | ||||
| int a=0; | int a=0; | ||||
| int b=((xpos+s_xinc2)>>8) - xx; | |||||
| int c=((xpos+s_xinc2*2)>>8) - xx; | |||||
| int d=((xpos+s_xinc2*3)>>8) - xx; | |||||
| int b=((xpos+s_xinc2)>>16) - xx; | |||||
| int c=((xpos+s_xinc2*2)>>16) - xx; | |||||
| int d=((xpos+s_xinc2*3)>>16) - xx; | |||||
| memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength); | memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength); | ||||
| @@ -197,86 +239,117 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| } | } | ||||
| #endif | |||||
| if(canMMX2BeUsed) | |||||
| { | |||||
| s_xinc+= s_xinc_diff; | |||||
| s_xinc2+= s_xinc2_diff; | |||||
| } | |||||
| #endif // HAVE_MMX2 | |||||
| } // reset counters | } // reset counters | ||||
| while(1){ | while(1){ | ||||
| unsigned char *dest=dstptr+dststride*s_ypos; | unsigned char *dest=dstptr+dststride*s_ypos; | ||||
| int y0=2+(s_srcypos>>16); | |||||
| int y1=1+(s_srcypos>>17); | |||||
| int y0=(s_srcypos + 0xFFFF)>>16; // first luminance source line number below the dst line | |||||
| // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) | |||||
| int srcuvpos= s_srcypos + s_yinc/2 - 0x8000; | |||||
| int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line | |||||
| int yalpha=(s_srcypos&0xFFFF)>>7; | int yalpha=(s_srcypos&0xFFFF)>>7; | ||||
| int yalpha1=yalpha^511; | int yalpha1=yalpha^511; | ||||
| int uvalpha=((s_srcypos>>1)&0xFFFF)>>7; | |||||
| int uvalpha=(srcuvpos&0x1FFFF)>>8; | |||||
| int uvalpha1=uvalpha^511; | int uvalpha1=uvalpha^511; | ||||
| uint16_t *buf0=pix_buf_y[y0&3]; | |||||
| uint16_t *buf1=pix_buf_y[((y0+1)&3)]; | |||||
| uint16_t *uvbuf0=pix_buf_uv[y1&1]; | |||||
| uint16_t *uvbuf1=pix_buf_uv[(y1&1)^1]; | |||||
| uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice | |||||
| uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice | |||||
| uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice | |||||
| uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1]; // bottom line of the interpolated slice | |||||
| int i; | int i; | ||||
| if(y0>=y+h) break; | |||||
| // if this is before the first line than use only the first src line | |||||
| if(y0==0) buf0= buf1; | |||||
| if(y1==0) uvbuf0= uvbuf1; // yes we do have to check this, its not the same as y0==0 | |||||
| if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are dupliactes anyway | |||||
| // if this is after the last line than use only the last src line | |||||
| if(y0>=y+h) | |||||
| { | |||||
| buf1= buf0; | |||||
| s_last_ypos=y0; | |||||
| } | |||||
| if(y1>=(y+h)/2) | |||||
| { | |||||
| uvbuf1= uvbuf0; | |||||
| s_last_y1pos=y1; | |||||
| } | |||||
| s_ypos++; s_srcypos+=s_yinc; | s_ypos++; s_srcypos+=s_yinc; | ||||
| //only interpolate the src line horizontally if we didnt do it allready | |||||
| if(s_last_ypos!=y0){ | if(s_last_ypos!=y0){ | ||||
| unsigned char *src=srcptr[0]+(y0-y)*stride[0]; | unsigned char *src=srcptr[0]+(y0-y)*stride[0]; | ||||
| unsigned int xpos=0; | unsigned int xpos=0; | ||||
| s_last_ypos=y0; | s_last_ypos=y0; | ||||
| // *** horizontal scale Y line to temp buffer | // *** horizontal scale Y line to temp buffer | ||||
| // this loop should be rewritten in MMX assembly!!!! | |||||
| #ifdef HAVE_MMX2 | |||||
| asm volatile( | |||||
| "pxor %%mm7, %%mm7 \n\t" | |||||
| "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |||||
| "movd %5, %%mm6 \n\t" // s_xinc&0xFF | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "movq %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" | |||||
| "paddb %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" | |||||
| "paddb %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF | |||||
| "movq %%mm2, temp0 \n\t" | |||||
| "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "xorl %%eax, %%eax \n\t" // i | |||||
| "xorl %%ebx, %%ebx \n\t" // xx | |||||
| "movl %0, %%esi \n\t" // src | |||||
| "movl %1, %%edi \n\t" // buf1 | |||||
| "movl %3, %%edx \n\t" // (s_xinc*4)>>8 | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "movb %4, %%ch \n\t" // (s_xinc*4)&0xFF | |||||
| // "int $3\n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorb %%cl, %%cl \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>8), | |||||
| "m" ((s_xinc*4)&0xFF), "m" (s_xinc&0xFF) | |||||
| : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |||||
| ); | |||||
| #ifdef ARCH_X86 | |||||
| #elif defined (ARCH_X86) | |||||
| #ifdef HAVE_MMX2 | |||||
| if(canMMX2BeUsed) | |||||
| { | |||||
| asm volatile( | |||||
| "pxor %%mm7, %%mm7 \n\t" | |||||
| "pxor %%mm2, %%mm2 \n\t" // 2*xalpha | |||||
| "movd %5, %%mm6 \n\t" // s_xinc&0xFFFF | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "movq %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" | |||||
| "paddw %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" | |||||
| "paddw %%mm6, %%mm2 \n\t" | |||||
| "psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF | |||||
| "movq %%mm2, temp0 \n\t" | |||||
| "movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "punpcklwd %%mm6, %%mm6 \n\t" | |||||
| "xorl %%eax, %%eax \n\t" // i | |||||
| "movl %0, %%esi \n\t" // src | |||||
| "movl %1, %%edi \n\t" // buf1 | |||||
| "movl %3, %%edx \n\t" // (s_xinc*4)>>16 | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "xorl %%ebx, %%ebx \n\t" | |||||
| "movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF | |||||
| // "int $3\n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| "movq temp0, %%mm2 \n\t" | |||||
| "xorl %%ecx, %%ecx \n\t" | |||||
| "call funnyYCode \n\t" | |||||
| :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16), | |||||
| "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF) | |||||
| : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" | |||||
| ); | |||||
| } | |||||
| else | |||||
| { | |||||
| #endif | |||||
| //NO MMX just normal asm ... FIXME try/write funny MMX2 variant | //NO MMX just normal asm ... FIXME try/write funny MMX2 variant | ||||
| //FIXME add prefetch | //FIXME add prefetch | ||||
| asm volatile( | asm volatile( | ||||
| @@ -288,24 +361,24 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | ||||
| "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | ||||
| "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | ||||
| "shll $8, %%edi \n\t" | |||||
| "shll $16, %%edi \n\t" | |||||
| "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | ||||
| "movl %1, %%edi \n\t" | "movl %1, %%edi \n\t" | ||||
| "shrl $1, %%esi \n\t" | |||||
| "shrl $9, %%esi \n\t" | |||||
| "movw %%si, (%%edi, %%eax, 2) \n\t" | "movw %%si, (%%edi, %%eax, 2) \n\t" | ||||
| "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF | |||||
| "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF | |||||
| "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | ||||
| "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | "movzbl (%0, %%ebx), %%edi \n\t" //src[xx] | ||||
| "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | "movzbl 1(%0, %%ebx), %%esi \n\t" //src[xx+1] | ||||
| "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | "subl %%edi, %%esi \n\t" //src[xx+1] - src[xx] | ||||
| "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | "imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha | ||||
| "shll $8, %%edi \n\t" | |||||
| "shll $16, %%edi \n\t" | |||||
| "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | "addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha) | ||||
| "movl %1, %%edi \n\t" | "movl %1, %%edi \n\t" | ||||
| "shrl $1, %%esi \n\t" | |||||
| "shrl $9, %%esi \n\t" | |||||
| "movw %%si, 2(%%edi, %%eax, 2) \n\t" | "movw %%si, 2(%%edi, %%eax, 2) \n\t" | ||||
| "addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF | |||||
| "addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFF | |||||
| "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | "adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry | ||||
| @@ -314,106 +387,96 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe | |||||
| " jb 1b \n\t" | " jb 1b \n\t" | ||||
| :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>8), "m" (s_xinc&0xFF) | |||||
| :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>16), "m" (s_xinc&0xFFFF) | |||||
| : "%eax", "%ebx", "%ecx", "%edi", "%esi" | : "%eax", "%ebx", "%ecx", "%edi", "%esi" | ||||
| ); | ); | ||||
| #ifdef HAVE_MMX2 | |||||
| } //if MMX2 cant be used | |||||
| #endif | |||||
| #else | #else | ||||
| for(i=0;i<dstw;i++){ | for(i=0;i<dstw;i++){ | ||||
| register unsigned int xx=xpos>>8; | |||||
| register unsigned int xalpha=(xpos&0xFF)>>1; | |||||
| register unsigned int xx=xpos>>16; | |||||
| register unsigned int xalpha=(xpos&0xFFFF)>>9; | |||||
| buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha); | buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha); | ||||
| xpos+=s_xinc; | xpos+=s_xinc; | ||||
| } | } | ||||
| #endif | #endif | ||||
| } | |||||
| // *** horizontal scale U and V lines to temp buffer | // *** horizontal scale U and V lines to temp buffer | ||||
| if(!(y0&1)){ | |||||
| if(s_last_y1pos!=y1){ | |||||
| unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1]; | unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1]; | ||||
| unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2]; | unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2]; | ||||
| xpos=0; | |||||
| // this loop should be rewritten in MMX assembly!!!! | |||||
| int xpos=0; | |||||
| s_last_y1pos= y1; | |||||
| #ifdef ARCH_X86 | |||||
#ifdef HAVE_MMX2
asm volatile(
if(canMMX2BeUsed)
{
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"pxor %%mm2, %%mm2 \n\t" // 2*xalpha
"movd %5, %%mm6 \n\t" // s_xinc&0xFF
"movd %5, %%mm6 \n\t" // s_xinc&0xFFFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"movq %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddb %%mm6, %%mm2 \n\t"
"paddw %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t"
"paddb %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFF
"paddw %%mm6, %%mm2 \n\t"
"psllq $16, %%mm2 \n\t" //0,t,2t,3t t=s_xinc&0xFFFF
"movq %%mm2, temp0 \n\t"
"movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFF
"movd %4, %%mm6 \n\t" //(s_xinc*4)&0xFFFF
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
"movl %0, %%esi \n\t" // src
"movl %1, %%edi \n\t" // buf1
"movl %3, %%edx \n\t" // (s_xinc*4)>>8
"movl %3, %%edx \n\t" // (s_xinc*4)>>16
"xorl %%ecx, %%ecx \n\t"
"movb %4, %%ch \n\t" // (s_xinc*4)&0xFF
"xorl %%ebx, %%ebx \n\t"
"movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF
// "int $3\n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
#define FUNNYUVCODE \
"call funnyUVCode \n\t"\
"movq temp0, %%mm2 \n\t"\
"xorl %%ecx, %%ecx \n\t"
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
"movl %6, %%esi \n\t" // src
"movl %1, %%edi \n\t" // buf1
"addl $4096, %%edi \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
"movq temp0, %%mm2 \n\t"
"xorb %%cl, %%cl \n\t"
"call funnyUVCode \n\t"
:: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>8),
"m" ((s_xinc2*4)&0xFF), "m" (s_xinc2&0xFF), "m" (src2)
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
:: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16),
"m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2)
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
);
#elif defined (ARCH_X86)
}
else
{
#endif
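// generic per-pixel x86 path; with HAVE_MMX2 this is the else-branch of the canMMX2BeUsed check above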
asm volatile(
"xorl %%eax, %%eax \n\t" // i
"xorl %%ebx, %%ebx \n\t" // xx
@@ -424,46 +487,48 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe
"movzbl 1(%%esi, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $8, %%edi \n\t"
"shll $16, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $1, %%esi \n\t"
"shrl $9, %%esi \n\t"
"movw %%si, (%%edi, %%eax, 2) \n\t"
"movzbl (%5, %%ebx), %%edi \n\t" //src[xx]
"movzbl 1(%5, %%ebx), %%esi \n\t" //src[xx+1]
"subl %%edi, %%esi \n\t" //src[xx+1] - src[xx]
"imull %%ecx, %%esi \n\t" //(src[xx+1] - src[xx])*2*xalpha
"shll $8, %%edi \n\t"
"shll $16, %%edi \n\t"
"addl %%edi, %%esi \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
"movl %1, %%edi \n\t"
"shrl $1, %%esi \n\t"
"shrl $9, %%esi \n\t"
"movw %%si, 4096(%%edi, %%eax, 2)\n\t"
"addb %4, %%cl \n\t" //2*xalpha += s_xinc&0xFF
"addw %4, %%cx \n\t" //2*xalpha += s_xinc&0xFFFF
"adcl %3, %%ebx \n\t" //xx+= s_xinc>>8 + carry
"addl $1, %%eax \n\t"
"cmpl %2, %%eax \n\t"
" jb 1b \n\t"
:: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>8), "m" (s_xinc2&0xFF),
:: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF),
"r" (src2)
: "%eax", "%ebx", "%ecx", "%edi", "%esi"
);
#ifdef HAVE_MMX2
} //if MMX2 can't be used
#endif
#else
for(i=0;i<dstw;i++){
register unsigned int xx=xpos>>8;
register unsigned int xalpha=(xpos&0xFF)>>1;
for(i=0;i<dstw;i++){
register unsigned int xx=xpos>>16;
register unsigned int xalpha=(xpos&0xFFFF)>>9;
uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
xpos+=s_xinc2;
}
#endif
}
if(!y0) continue;
#endif
}
// Note1: this code can be restricted to n*8 (or n*16) width lines to simplify optimization...
// Re: Note1: ok n*4 for now
// Note2: instead of using lookup tables, mmx version could do the multiply...
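To make Note2 concrete, here is a rough scalar rendering (an editorial sketch, not code from this commit) of the per-pixel colour arithmetic that the YSCALEYUV2RGB macro below performs with pmulhw; cy, cub, cug, cvg and cvr stand for the signed 16-bit constants yCoeff, ubCoeff, ugCoeff, vgCoeff and vrCoeff, and the final clipping corresponds to packuswb:

static int clip8(int x){ return x<0 ? 0 : (x>255 ? 255 : x); }

static void yuv2rgb_pixel(int Y, int U, int V,
                          short cy, short cub, short cug, short cvg, short cvr,
                          unsigned char *r, unsigned char *g, unsigned char *b)
{
    int y= ((Y- 16)*8*cy) >> 16;   // "(Y-16)*8", then pmulhw yCoeff
    int u=  (U-128)*8;             // "(U-128)8"
    int v=  (V-128)*8;             // "(V-128)8"
    *b= clip8(y + ((u*cub)>>16));                 // pmulhw ubCoeff          -> B
    *r= clip8(y + ((v*cvr)>>16));                 // pmulhw vrCoeff          -> R
    *g= clip8(y + ((u*cug)>>16) + ((v*cvg)>>16)); // pmulhw ugCoeff, vgCoeff -> G
}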
@@ -489,47 +554,47 @@ s_xinc&= -2; //clear last bit or uv and y might be shifted relative to each othe
"1: \n\t"\
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"psubw w10, %%mm1 \n\t" /* Y-16*/\
"psllw $3, %%mm1 \n\t" /* (y-16)*8*/\
"pmulhw yCoeff, %%mm1 \n\t"\
\
"movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"psubw w10, %%mm1 \n\t" /* Y-16*/\
"psubw w80, %%mm3 \n\t" /* (U-128)*/\
"psllw $3, %%mm1 \n\t" /* (y-16)*8*/\
"psllw $3, %%mm3 \n\t" /*(U-128)8*/\
"pmulhw yCoeff, %%mm1 \n\t"\
\
\
"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw ubCoeff, %%mm3 \n\t"\
"psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
"pmulhw ugCoeff, %%mm2 \n\t"\
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw w80, %%mm0 \n\t" /* (V-128)*/\
"psllw $3, %%mm0 \n\t" /* (V-128)8*/\
\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"pmulhw ubCoeff, %%mm3 \n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
\
"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
"pmulhw vrCoeff, %%mm0 \n\t"\
"pmulhw vgCoeff, %%mm4 \n\t"\
"paddw %%mm1, %%mm3 \n\t" /* B*/\
"paddw %%mm1, %%mm0 \n\t" /* R*/\
"packuswb %%mm3, %%mm3 \n\t"\
\
"pmulhw ugCoeff, %%mm2 \n\t"\
"pmulhw vgCoeff, %%mm4 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
"packuswb %%mm3, %%mm3 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"
YSCALEYUV2RGB
@@ -610,9 +675,11 @@ YSCALEYUV2RGB
asm volatile(
YSCALEYUV2RGB
#ifdef DITHER16BPP
"paddusb g16Dither, %%mm1 \n\t"
"paddusb b16Dither, %%mm0 \n\t"
"paddusb b16Dither, %%mm3 \n\t"
#endif
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R
@@ -699,8 +766,6 @@ YSCALEYUV2RGB
#elif defined (HAVE_MMX)
asm volatile("emms");
#endif
}