brightness_debug (draws luminance histogram & autodetected white/black level). Originally committed as revision 3014 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc (tags/v0.5)
| @@ -21,11 +21,11 @@ | |||
| isVertDC Ec Ec | |||
| isVertMinMaxOk Ec Ec | |||
| doVertLowPass E e e | |||
| doVertDefFilter Ec Ec Ec | |||
| doVertDefFilter Ec Ec e e | |||
| isHorizDC Ec Ec | |||
| isHorizMinMaxOk a E | |||
| doHorizLowPass E e e | |||
| doHorizDefFilter Ec Ec Ec | |||
| doHorizDefFilter Ec Ec e e | |||
| deRing E e e* | |||
| Vertical RKAlgo1 E a a | |||
| Horizontal RKAlgo1 a a | |||
| @@ -63,8 +63,6 @@ optimize c versions | |||
| try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks | |||
| smart blur | |||
| ... | |||
| Notes: | |||
| */ | |||
| //Changelog: use the CVS log | |||
| @@ -80,6 +78,7 @@ Notes: | |||
| //#undef HAVE_MMX2 | |||
| //#define HAVE_3DNOW | |||
| //#undef HAVE_MMX | |||
| //#define DEBUG_BRIGHTNESS | |||
| #include "postprocess.h" | |||
| #define MIN(a,b) ((a) > (b) ? (b) : (a)) | |||
| @@ -1067,10 +1066,299 @@ HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2)) | |||
| static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | |||
| { | |||
| #ifdef HAVE_MMX | |||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||
| /* | |||
| uint8_t tmp[16]; | |||
| const int l1= stride; | |||
| const int l2= stride + l1; | |||
| const int l3= stride + l2; | |||
| const int l4= (int)tmp - (int)src - stride*3; | |||
| const int l5= (int)tmp - (int)src - stride*3 + 8; | |||
| const int l6= stride*3 + l3; | |||
| const int l7= stride + l6; | |||
| const int l8= stride + l7; | |||
| memcpy(tmp, src+stride*7, 8); | |||
| memcpy(tmp+8, src+stride*8, 8); | |||
| */ | |||
| src+= stride*4; | |||
| asm volatile( | |||
| #if 0 //slightly more accurate and slightly slower | |||
| "pxor %%mm7, %%mm7 \n\t" // 0 | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 | |||
| // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 | |||
| "movq (%0, %1, 2), %%mm0 \n\t" // l2 | |||
| "movq (%0), %%mm1 \n\t" // l0 | |||
| "movq %%mm0, %%mm2 \n\t" // l2 | |||
| PAVGB(%%mm7, %%mm0) // ~l2/2 | |||
| PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 | |||
| PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 | |||
| "movq (%%eax), %%mm1 \n\t" // l1 | |||
| "movq (%%eax, %1, 2), %%mm3 \n\t" // l3 | |||
| "movq %%mm1, %%mm4 \n\t" // l1 | |||
| PAVGB(%%mm7, %%mm1) // ~l1/2 | |||
| PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 | |||
| PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 | |||
| "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 | |||
| "psubusb %%mm1, %%mm0 \n\t" | |||
| "psubusb %%mm4, %%mm1 \n\t" | |||
| "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 | |||
| // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 | |||
| "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |||
| "movq %%mm0, %%mm4 \n\t" // l4 | |||
| PAVGB(%%mm7, %%mm0) // ~l4/2 | |||
| PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 | |||
| PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 | |||
| "movq (%%ebx), %%mm2 \n\t" // l5 | |||
| "movq %%mm3, %%mm5 \n\t" // l3 | |||
| PAVGB(%%mm7, %%mm3) // ~l3/2 | |||
| PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 | |||
| PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 | |||
| "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 | |||
| "psubusb %%mm3, %%mm0 \n\t" | |||
| "psubusb %%mm6, %%mm3 \n\t" | |||
| "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 | |||
| "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) | |||
| // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 | |||
| "movq (%%ebx, %1), %%mm6 \n\t" // l6 | |||
| "movq %%mm6, %%mm5 \n\t" // l6 | |||
| PAVGB(%%mm7, %%mm6) // ~l6/2 | |||
| PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 | |||
| PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 | |||
| "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7 | |||
| "movq %%mm2, %%mm4 \n\t" // l5 | |||
| PAVGB(%%mm7, %%mm2) // ~l5/2 | |||
| PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 | |||
| PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 | |||
| "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 | |||
| "psubusb %%mm2, %%mm6 \n\t" | |||
| "psubusb %%mm4, %%mm2 \n\t" | |||
| "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | |||
| // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 | |||
| PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 | |||
| "movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ? | |||
| "paddusb b01, %%mm4 \n\t" | |||
| "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP | |||
| "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | |||
| "pand %%mm4, %%mm3 \n\t" | |||
| "movq %%mm3, %%mm1 \n\t" | |||
| // "psubusb b01, %%mm3 \n\t" | |||
| PAVGB(%%mm7, %%mm3) | |||
| PAVGB(%%mm7, %%mm3) | |||
| "paddusb %%mm1, %%mm3 \n\t" | |||
| // "paddusb b01, %%mm3 \n\t" | |||
| "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 | |||
| "movq (%0, %1, 4), %%mm5 \n\t" //l4 | |||
| "movq (%0, %1, 4), %%mm4 \n\t" //l4 | |||
| "psubusb %%mm6, %%mm5 \n\t" | |||
| "psubusb %%mm4, %%mm6 \n\t" | |||
| "por %%mm6, %%mm5 \n\t" // |l3-l4| | |||
| "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) | |||
| "pxor %%mm6, %%mm0 \n\t" | |||
| "pand %%mm0, %%mm3 \n\t" | |||
| PMINUB(%%mm5, %%mm3, %%mm0) | |||
| "psubusb b01, %%mm3 \n\t" | |||
| PAVGB(%%mm7, %%mm3) | |||
| "movq (%%eax, %1, 2), %%mm0 \n\t" | |||
| "movq (%0, %1, 4), %%mm2 \n\t" | |||
| "pxor %%mm6, %%mm0 \n\t" | |||
| "pxor %%mm6, %%mm2 \n\t" | |||
| "psubb %%mm3, %%mm0 \n\t" | |||
| "paddb %%mm3, %%mm2 \n\t" | |||
| "pxor %%mm6, %%mm0 \n\t" | |||
| "pxor %%mm6, %%mm2 \n\t" | |||
| "movq %%mm0, (%%eax, %1, 2) \n\t" | |||
| "movq %%mm2, (%0, %1, 4) \n\t" | |||
| #endif | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "pcmpeqb %%mm6, %%mm6 \n\t" // -1 | |||
| // 0 1 2 3 4 5 6 7 | |||
| // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 | |||
| "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 | |||
| "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |||
| "pxor %%mm6, %%mm1 \n\t" // -l3-1 | |||
| PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 | |||
| // mm1=-l3-1, mm0=128-q | |||
| "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 | |||
| "movq (%%eax, %1), %%mm3 \n\t" // l2 | |||
| "pxor %%mm6, %%mm2 \n\t" // -l5-1 | |||
| "movq %%mm2, %%mm5 \n\t" // -l5-1 | |||
| "movq b80, %%mm4 \n\t" // 128 | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 | |||
| PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | |||
| PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | |||
| PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | |||
| // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 | |||
| "movq (%%eax), %%mm2 \n\t" // l1 | |||
| "pxor %%mm6, %%mm2 \n\t" // -l1-1 | |||
| PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 | |||
| PAVGB((%0), %%mm1) // (l0-l3+256)/2 | |||
| "movq b80, %%mm3 \n\t" // 128 | |||
| PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 | |||
| PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | |||
| PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | |||
| // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 | |||
| PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2 | |||
| "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7 | |||
| "pxor %%mm6, %%mm1 \n\t" // -l7-1 | |||
| PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | |||
| "movq b80, %%mm2 \n\t" // 128 | |||
| PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 | |||
| PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | |||
| PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 | |||
| // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 | |||
| "movq b00, %%mm1 \n\t" // 0 | |||
| "movq b00, %%mm5 \n\t" // 0 | |||
| "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 | |||
| "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 | |||
| PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| | |||
| PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| | |||
| PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | |||
| // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | |||
| "movq b00, %%mm7 \n\t" // 0 | |||
| "movq pQPb, %%mm2 \n\t" // QP | |||
| PAVGB(%%mm6, %%mm2) // 128 + QP/2 | |||
| "psubb %%mm6, %%mm2 \n\t" | |||
| "movq %%mm4, %%mm1 \n\t" | |||
| "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | |||
| "pxor %%mm1, %%mm4 \n\t" | |||
| "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 | |||
| "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 | |||
| "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 | |||
| // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 | |||
| "movq %%mm4, %%mm3 \n\t" // d | |||
| "psubusb b01, %%mm4 \n\t" | |||
| PAVGB(%%mm7, %%mm4) // d/32 | |||
| PAVGB(%%mm7, %%mm4) // (d + 32)/64 | |||
| "paddb %%mm3, %%mm4 \n\t" // 5d/64 | |||
| "pand %%mm2, %%mm4 \n\t" | |||
| "movq b80, %%mm5 \n\t" // 128 | |||
| "psubb %%mm0, %%mm5 \n\t" // q | |||
| "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding | |||
| "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) | |||
| "pxor %%mm7, %%mm5 \n\t" | |||
| PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) | |||
| "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) | |||
| "pand %%mm7, %%mm4 \n\t" | |||
| "movq (%%eax, %1, 2), %%mm0 \n\t" | |||
| "movq (%0, %1, 4), %%mm2 \n\t" | |||
| "pxor %%mm1, %%mm0 \n\t" | |||
| "pxor %%mm1, %%mm2 \n\t" | |||
| "paddb %%mm4, %%mm0 \n\t" | |||
| "psubb %%mm4, %%mm2 \n\t" | |||
| "pxor %%mm1, %%mm0 \n\t" | |||
| "pxor %%mm1, %%mm2 \n\t" | |||
| "movq %%mm0, (%%eax, %1, 2) \n\t" | |||
| "movq %%mm2, (%0, %1, 4) \n\t" | |||
| : | |||
| : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| /* | |||
| { | |||
| int x; | |||
| src-= stride; | |||
| for(x=0; x<BLOCK_SIZE; x++) | |||
| { | |||
| const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |||
| if(ABS(middleEnergy)< 8*QP) | |||
| { | |||
| const int q=(src[l4] - src[l5])/2; | |||
| const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | |||
| const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | |||
| int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |||
| d= MAX(d, 0); | |||
| d= (5*d + 32) >> 6; | |||
| d*= SIGN(-middleEnergy); | |||
| if(q>0) | |||
| { | |||
| d= d<0 ? 0 : d; | |||
| d= d>q ? q : d; | |||
| } | |||
| else | |||
| { | |||
| d= d>0 ? 0 : d; | |||
| d= d<q ? q : d; | |||
| } | |||
| src[l4]-= d; | |||
| src[l5]+= d; | |||
| } | |||
| src++; | |||
| } | |||
| src-=8; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| int y; | |||
| for(y=4; y<6; y++) | |||
| { | |||
| int d= src[x+y*stride] - tmp[x+(y-4)*8]; | |||
| int ad= ABS(d); | |||
| static int max=0; | |||
| static int sum=0; | |||
| static int num=0; | |||
| static int bias=0; | |||
| if(max<ad) max=ad; | |||
| sum+= ad>3 ? 1 : 0; | |||
| if(ad>3) | |||
| { | |||
| src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; | |||
| } | |||
| if(y==4) bias+=d; | |||
| num++; | |||
| if(num%1000000 == 0) | |||
| { | |||
| printf(" %d %d %d %d\n", num, sum, max, bias); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| */ | |||
| #elif defined (HAVE_MMX) | |||
| src+= stride*4; | |||
| //FIXME try pmul for *5 stuff | |||
| // src[0]=0; | |||
| asm volatile( | |||
| "pxor %%mm7, %%mm7 \n\t" | |||
| "leal (%0, %1), %%eax \n\t" | |||
| @@ -3961,7 +4249,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| uint8_t *dstBlock= &(dst[y*dstStride]); | |||
| memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); | |||
| } | |||
| } | |||
| /* | |||
| for(x=0; x<width; x+=32) | |||
| { | |||
| int i; | |||
| i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] | |||
| + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | |||
| + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride] | |||
| + dstBlock[x +13*dstStride] + dstBlock[x +14*dstStride] | |||
| + dstBlock[x +15*dstStride]; | |||
| } | |||
| */ } | |||
| #ifdef HAVE_3DNOW | |||
| asm volatile("femms"); | |||
| #elif defined (HAVE_MMX) | |||
| @@ -3977,4 +4275,31 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) | |||
| , black, white); | |||
| #endif | |||
| #ifdef DEBUG_BRIGHTNESS | |||
| if(!isColor) | |||
| { | |||
| int max=1; | |||
| int i; | |||
| for(i=0; i<256; i++) | |||
| if(yHistogram[i] > max) max=yHistogram[i]; | |||
| for(i=1; i<256; i++) | |||
| { | |||
| int x; | |||
| int start=yHistogram[i-1]/(max/256+1); | |||
| int end=yHistogram[i]/(max/256+1); | |||
| int inc= end > start ? 1 : -1; | |||
| for(x=start; x!=end+inc; x+=inc) | |||
| dst[ i*dstStride + x]+=128; | |||
| } | |||
| for(i=0; i<100; i+=2) | |||
| { | |||
| dst[ (white)*dstStride + i]+=128; | |||
| dst[ (black)*dstStride + i]+=128; | |||
| } | |||
| } | |||
| #endif | |||
| } | |||
| @@ -21,11 +21,11 @@ | |||
| isVertDC Ec Ec | |||
| isVertMinMaxOk Ec Ec | |||
| doVertLowPass E e e | |||
| doVertDefFilter Ec Ec Ec | |||
| doVertDefFilter Ec Ec e e | |||
| isHorizDC Ec Ec | |||
| isHorizMinMaxOk a E | |||
| doHorizLowPass E e e | |||
| doHorizDefFilter Ec Ec Ec | |||
| doHorizDefFilter Ec Ec e e | |||
| deRing E e e* | |||
| Vertical RKAlgo1 E a a | |||
| Horizontal RKAlgo1 a a | |||
| @@ -63,8 +63,6 @@ optimize c versions | |||
| try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks | |||
| smart blur | |||
| ... | |||
| Notes: | |||
| */ | |||
| //Changelog: use the CVS log | |||
| @@ -80,6 +78,7 @@ Notes: | |||
| //#undef HAVE_MMX2 | |||
| //#define HAVE_3DNOW | |||
| //#undef HAVE_MMX | |||
| //#define DEBUG_BRIGHTNESS | |||
| #include "postprocess.h" | |||
| #define MIN(a,b) ((a) > (b) ? (b) : (a)) | |||
| @@ -1067,10 +1066,299 @@ HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2)) | |||
| static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | |||
| { | |||
| #ifdef HAVE_MMX | |||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||
| /* | |||
| uint8_t tmp[16]; | |||
| const int l1= stride; | |||
| const int l2= stride + l1; | |||
| const int l3= stride + l2; | |||
| const int l4= (int)tmp - (int)src - stride*3; | |||
| const int l5= (int)tmp - (int)src - stride*3 + 8; | |||
| const int l6= stride*3 + l3; | |||
| const int l7= stride + l6; | |||
| const int l8= stride + l7; | |||
| memcpy(tmp, src+stride*7, 8); | |||
| memcpy(tmp+8, src+stride*8, 8); | |||
| */ | |||
| src+= stride*4; | |||
| asm volatile( | |||
| #if 0 //slightly more accurate and slightly slower | |||
| "pxor %%mm7, %%mm7 \n\t" // 0 | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 | |||
| // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 | |||
| "movq (%0, %1, 2), %%mm0 \n\t" // l2 | |||
| "movq (%0), %%mm1 \n\t" // l0 | |||
| "movq %%mm0, %%mm2 \n\t" // l2 | |||
| PAVGB(%%mm7, %%mm0) // ~l2/2 | |||
| PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 | |||
| PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 | |||
| "movq (%%eax), %%mm1 \n\t" // l1 | |||
| "movq (%%eax, %1, 2), %%mm3 \n\t" // l3 | |||
| "movq %%mm1, %%mm4 \n\t" // l1 | |||
| PAVGB(%%mm7, %%mm1) // ~l1/2 | |||
| PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 | |||
| PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 | |||
| "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 | |||
| "psubusb %%mm1, %%mm0 \n\t" | |||
| "psubusb %%mm4, %%mm1 \n\t" | |||
| "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 | |||
| // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 | |||
| "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |||
| "movq %%mm0, %%mm4 \n\t" // l4 | |||
| PAVGB(%%mm7, %%mm0) // ~l4/2 | |||
| PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 | |||
| PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 | |||
| "movq (%%ebx), %%mm2 \n\t" // l5 | |||
| "movq %%mm3, %%mm5 \n\t" // l3 | |||
| PAVGB(%%mm7, %%mm3) // ~l3/2 | |||
| PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 | |||
| PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 | |||
| "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 | |||
| "psubusb %%mm3, %%mm0 \n\t" | |||
| "psubusb %%mm6, %%mm3 \n\t" | |||
| "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 | |||
| "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) | |||
| // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 | |||
| "movq (%%ebx, %1), %%mm6 \n\t" // l6 | |||
| "movq %%mm6, %%mm5 \n\t" // l6 | |||
| PAVGB(%%mm7, %%mm6) // ~l6/2 | |||
| PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 | |||
| PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 | |||
| "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7 | |||
| "movq %%mm2, %%mm4 \n\t" // l5 | |||
| PAVGB(%%mm7, %%mm2) // ~l5/2 | |||
| PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 | |||
| PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 | |||
| "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 | |||
| "psubusb %%mm2, %%mm6 \n\t" | |||
| "psubusb %%mm4, %%mm2 \n\t" | |||
| "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | |||
| // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 | |||
| PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 | |||
| "movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ? | |||
| "paddusb b01, %%mm4 \n\t" | |||
| "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP | |||
| "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | |||
| "pand %%mm4, %%mm3 \n\t" | |||
| "movq %%mm3, %%mm1 \n\t" | |||
| // "psubusb b01, %%mm3 \n\t" | |||
| PAVGB(%%mm7, %%mm3) | |||
| PAVGB(%%mm7, %%mm3) | |||
| "paddusb %%mm1, %%mm3 \n\t" | |||
| // "paddusb b01, %%mm3 \n\t" | |||
| "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 | |||
| "movq (%0, %1, 4), %%mm5 \n\t" //l4 | |||
| "movq (%0, %1, 4), %%mm4 \n\t" //l4 | |||
| "psubusb %%mm6, %%mm5 \n\t" | |||
| "psubusb %%mm4, %%mm6 \n\t" | |||
| "por %%mm6, %%mm5 \n\t" // |l3-l4| | |||
| "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) | |||
| "pxor %%mm6, %%mm0 \n\t" | |||
| "pand %%mm0, %%mm3 \n\t" | |||
| PMINUB(%%mm5, %%mm3, %%mm0) | |||
| "psubusb b01, %%mm3 \n\t" | |||
| PAVGB(%%mm7, %%mm3) | |||
| "movq (%%eax, %1, 2), %%mm0 \n\t" | |||
| "movq (%0, %1, 4), %%mm2 \n\t" | |||
| "pxor %%mm6, %%mm0 \n\t" | |||
| "pxor %%mm6, %%mm2 \n\t" | |||
| "psubb %%mm3, %%mm0 \n\t" | |||
| "paddb %%mm3, %%mm2 \n\t" | |||
| "pxor %%mm6, %%mm0 \n\t" | |||
| "pxor %%mm6, %%mm2 \n\t" | |||
| "movq %%mm0, (%%eax, %1, 2) \n\t" | |||
| "movq %%mm2, (%0, %1, 4) \n\t" | |||
| #endif | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "pcmpeqb %%mm6, %%mm6 \n\t" // -1 | |||
| // 0 1 2 3 4 5 6 7 | |||
| // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 | |||
| "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 | |||
| "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |||
| "pxor %%mm6, %%mm1 \n\t" // -l3-1 | |||
| PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 | |||
| // mm1=-l3-1, mm0=128-q | |||
| "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 | |||
| "movq (%%eax, %1), %%mm3 \n\t" // l2 | |||
| "pxor %%mm6, %%mm2 \n\t" // -l5-1 | |||
| "movq %%mm2, %%mm5 \n\t" // -l5-1 | |||
| "movq b80, %%mm4 \n\t" // 128 | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 | |||
| PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | |||
| PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | |||
| PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | |||
| // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 | |||
| "movq (%%eax), %%mm2 \n\t" // l1 | |||
| "pxor %%mm6, %%mm2 \n\t" // -l1-1 | |||
| PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 | |||
| PAVGB((%0), %%mm1) // (l0-l3+256)/2 | |||
| "movq b80, %%mm3 \n\t" // 128 | |||
| PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 | |||
| PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | |||
| PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | |||
| // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 | |||
| PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2 | |||
| "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7 | |||
| "pxor %%mm6, %%mm1 \n\t" // -l7-1 | |||
| PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | |||
| "movq b80, %%mm2 \n\t" // 128 | |||
| PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 | |||
| PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | |||
| PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 | |||
| // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 | |||
| "movq b00, %%mm1 \n\t" // 0 | |||
| "movq b00, %%mm5 \n\t" // 0 | |||
| "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 | |||
| "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 | |||
| PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| | |||
| PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| | |||
| PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | |||
| // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | |||
| "movq b00, %%mm7 \n\t" // 0 | |||
| "movq pQPb, %%mm2 \n\t" // QP | |||
| PAVGB(%%mm6, %%mm2) // 128 + QP/2 | |||
| "psubb %%mm6, %%mm2 \n\t" | |||
| "movq %%mm4, %%mm1 \n\t" | |||
| "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | |||
| "pxor %%mm1, %%mm4 \n\t" | |||
| "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 | |||
| "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 | |||
| "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 | |||
| // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 | |||
| "movq %%mm4, %%mm3 \n\t" // d | |||
| "psubusb b01, %%mm4 \n\t" | |||
| PAVGB(%%mm7, %%mm4) // d/32 | |||
| PAVGB(%%mm7, %%mm4) // (d + 32)/64 | |||
| "paddb %%mm3, %%mm4 \n\t" // 5d/64 | |||
| "pand %%mm2, %%mm4 \n\t" | |||
| "movq b80, %%mm5 \n\t" // 128 | |||
| "psubb %%mm0, %%mm5 \n\t" // q | |||
| "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding | |||
| "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) | |||
| "pxor %%mm7, %%mm5 \n\t" | |||
| PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) | |||
| "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) | |||
| "pand %%mm7, %%mm4 \n\t" | |||
| "movq (%%eax, %1, 2), %%mm0 \n\t" | |||
| "movq (%0, %1, 4), %%mm2 \n\t" | |||
| "pxor %%mm1, %%mm0 \n\t" | |||
| "pxor %%mm1, %%mm2 \n\t" | |||
| "paddb %%mm4, %%mm0 \n\t" | |||
| "psubb %%mm4, %%mm2 \n\t" | |||
| "pxor %%mm1, %%mm0 \n\t" | |||
| "pxor %%mm1, %%mm2 \n\t" | |||
| "movq %%mm0, (%%eax, %1, 2) \n\t" | |||
| "movq %%mm2, (%0, %1, 4) \n\t" | |||
| : | |||
| : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| /* | |||
| { | |||
| int x; | |||
| src-= stride; | |||
| for(x=0; x<BLOCK_SIZE; x++) | |||
| { | |||
| const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |||
| if(ABS(middleEnergy)< 8*QP) | |||
| { | |||
| const int q=(src[l4] - src[l5])/2; | |||
| const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | |||
| const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | |||
| int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |||
| d= MAX(d, 0); | |||
| d= (5*d + 32) >> 6; | |||
| d*= SIGN(-middleEnergy); | |||
| if(q>0) | |||
| { | |||
| d= d<0 ? 0 : d; | |||
| d= d>q ? q : d; | |||
| } | |||
| else | |||
| { | |||
| d= d>0 ? 0 : d; | |||
| d= d<q ? q : d; | |||
| } | |||
| src[l4]-= d; | |||
| src[l5]+= d; | |||
| } | |||
| src++; | |||
| } | |||
| src-=8; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| int y; | |||
| for(y=4; y<6; y++) | |||
| { | |||
| int d= src[x+y*stride] - tmp[x+(y-4)*8]; | |||
| int ad= ABS(d); | |||
| static int max=0; | |||
| static int sum=0; | |||
| static int num=0; | |||
| static int bias=0; | |||
| if(max<ad) max=ad; | |||
| sum+= ad>3 ? 1 : 0; | |||
| if(ad>3) | |||
| { | |||
| src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; | |||
| } | |||
| if(y==4) bias+=d; | |||
| num++; | |||
| if(num%1000000 == 0) | |||
| { | |||
| printf(" %d %d %d %d\n", num, sum, max, bias); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| */ | |||
| #elif defined (HAVE_MMX) | |||
| src+= stride*4; | |||
| //FIXME try pmul for *5 stuff | |||
| // src[0]=0; | |||
| asm volatile( | |||
| "pxor %%mm7, %%mm7 \n\t" | |||
| "leal (%0, %1), %%eax \n\t" | |||
| @@ -3961,7 +4249,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| uint8_t *dstBlock= &(dst[y*dstStride]); | |||
| memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); | |||
| } | |||
| } | |||
| /* | |||
| for(x=0; x<width; x+=32) | |||
| { | |||
| int i; | |||
| i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] | |||
| + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | |||
| + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride] | |||
| + dstBlock[x +13*dstStride] + dstBlock[x +14*dstStride] | |||
| + dstBlock[x +15*dstStride]; | |||
| } | |||
| */ } | |||
| #ifdef HAVE_3DNOW | |||
| asm volatile("femms"); | |||
| #elif defined (HAVE_MMX) | |||
| @@ -3977,4 +4275,31 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) | |||
| , black, white); | |||
| #endif | |||
| #ifdef DEBUG_BRIGHTNESS | |||
| if(!isColor) | |||
| { | |||
| int max=1; | |||
| int i; | |||
| for(i=0; i<256; i++) | |||
| if(yHistogram[i] > max) max=yHistogram[i]; | |||
| for(i=1; i<256; i++) | |||
| { | |||
| int x; | |||
| int start=yHistogram[i-1]/(max/256+1); | |||
| int end=yHistogram[i]/(max/256+1); | |||
| int inc= end > start ? 1 : -1; | |||
| for(x=start; x!=end+inc; x+=inc) | |||
| dst[ i*dstStride + x]+=128; | |||
| } | |||
| for(i=0; i<100; i+=2) | |||
| { | |||
| dst[ (white)*dstStride + i]+=128; | |||
| dst[ (black)*dstStride + i]+=128; | |||
| } | |||
| } | |||
| #endif | |||
| } | |||