Fixed the color range for YUV. Fixed the width%8 != 0 bug (another 1% speed loss). Originally committed as revision 2286 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc tags/v0.5
| @@ -122,7 +122,7 @@ static uint64_t temp3=0; | |||
| static uint64_t temp4=0; | |||
| static uint64_t temp5=0; | |||
| static uint64_t pQPb=0; | |||
| static uint8_t tempBlock[16*16]; | |||
| static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data | |||
| int hFlatnessThreshold= 56 - 16; | |||
| int vFlatnessThreshold= 56 - 16; | |||
| @@ -132,7 +132,7 @@ double maxClippedThreshold= 0.01; | |||
| int maxAllowedY=255; | |||
| //FIXME can never make a movie's black brighter (does anyone need that?) | |||
| int minAllowedY=0; | |||
| int minAllowedY=16; | |||
| #ifdef TIMING | |||
| static inline long long rdtsc() | |||
| @@ -2398,6 +2398,13 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| static uint8_t *tempDst= NULL; | |||
| static uint8_t *tempSrc= NULL; | |||
| /* Temporary buffers for handling the last block */ | |||
| static uint8_t *tempDstBlock= NULL; | |||
| static uint8_t *tempSrcBlock= NULL; | |||
| uint8_t *dstBlockPtrBackup; | |||
| uint8_t *srcBlockPtrBackup; | |||
| #ifdef TIMING | |||
| long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; | |||
| sumTime= rdtsc(); | |||
| @@ -2407,6 +2414,8 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| { | |||
| tempDst= (uint8_t*)memalign(8, 1024*24); | |||
| tempSrc= (uint8_t*)memalign(8, 1024*24); | |||
| tempDstBlock= (uint8_t*)memalign(8, 1024*24); | |||
| tempSrcBlock= (uint8_t*)memalign(8, 1024*24); | |||
| } | |||
| if(!yHistogram) | |||
| @@ -2414,6 +2423,12 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| int i; | |||
| yHistogram= (uint64_t*)malloc(8*256); | |||
| for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256; | |||
| if(mode & FULL_Y_RANGE) | |||
| { | |||
| maxAllowedY=255; | |||
| minAllowedY=0; | |||
| } | |||
| } | |||
| if(!isColor) | |||
| @@ -2505,6 +2520,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| srcBlock= tempSrc; | |||
| } | |||
| // From this point on it is guaranteed that we can read and write 16 lines downward | |||
| // finish 1 block before the next, otherwise we might have a problem | |||
| // with the L1 cache of the P4 ... or only a few blocks at a time or something | |||
| for(x=0; x<width; x+=BLOCK_SIZE) | |||
| @@ -2545,6 +2561,23 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++; | |||
| //can we mess with a 8x16 block, if not use a temp buffer, yes again | |||
| if(x+7 >= width) | |||
| { | |||
| int i; | |||
| dstBlockPtrBackup= dstBlock; | |||
| srcBlockPtrBackup= srcBlock; | |||
| for(i=0;i<BLOCK_SIZE*2; i++) | |||
| { | |||
| memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x); | |||
| memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x); | |||
| } | |||
| dstBlock= tempDstBlock; | |||
| srcBlock= tempSrcBlock; | |||
| } | |||
| blockCopy(dstBlock + dstStride*5, dstStride, | |||
| srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX); | |||
| @@ -2593,7 +2626,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| } | |||
| /* check if we have a previous block to deblock it with dstBlock */ | |||
| if(x - 8 >= 0 && x<width) | |||
| if(x - 8 >= 0) | |||
| { | |||
| #ifdef MORE_TIMING | |||
| T0= rdtsc(); | |||
| @@ -2624,12 +2657,25 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| dering(dstBlock - stride*9 + width-9, stride, QP); | |||
| //FIXME dering filter will not be applied to last block (bottom right) | |||
| /* did we use a tmp-block buffer */ | |||
| if(x+7 >= width) | |||
| { | |||
| int i; | |||
| dstBlock= dstBlockPtrBackup; | |||
| srcBlock= srcBlockPtrBackup; | |||
| for(i=0;i<BLOCK_SIZE*2; i++) | |||
| { | |||
| memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x); | |||
| } | |||
| } | |||
| dstBlock+=8; | |||
| srcBlock+=8; | |||
| } | |||
| /* did we use a tmp buffer */ | |||
| if(y+15 > height) | |||
| if(y+15 >= height) | |||
| { | |||
| uint8_t *dstBlock= &(dst[y*dstStride]); | |||
| memcpy(dstBlock, tempDst, dstStride*(height-y) ); | |||
| @@ -46,6 +46,9 @@ | |||
| #define H_RK1_FILTER 0x1000 // 4096 (not implemented yet) | |||
| #define H_X1_FILTER 0x2000 // 8192 | |||
| // select between full y range (255-0) or the standard one (black level limited to 16, see minAllowedY) | |||
| #define FULL_Y_RANGE 0x8000 // 32768 | |||
| //Deinterlacing Filters | |||
| #define LINEAR_IPOL_DEINT_FILTER 0x10000 // 65536 | |||
| #define LINEAR_BLEND_DEINT_FILTER 0x20000 // 131072 | |||
| @@ -122,7 +122,7 @@ static uint64_t temp3=0; | |||
| static uint64_t temp4=0; | |||
| static uint64_t temp5=0; | |||
| static uint64_t pQPb=0; | |||
| static uint8_t tempBlock[16*16]; | |||
| static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data | |||
| int hFlatnessThreshold= 56 - 16; | |||
| int vFlatnessThreshold= 56 - 16; | |||
| @@ -132,7 +132,7 @@ double maxClippedThreshold= 0.01; | |||
| int maxAllowedY=255; | |||
| //FIXME can never make a movie's black brighter (does anyone need that?) | |||
| int minAllowedY=0; | |||
| int minAllowedY=16; | |||
| #ifdef TIMING | |||
| static inline long long rdtsc() | |||
| @@ -2398,6 +2398,13 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| static uint8_t *tempDst= NULL; | |||
| static uint8_t *tempSrc= NULL; | |||
| /* Temporary buffers for handling the last block */ | |||
| static uint8_t *tempDstBlock= NULL; | |||
| static uint8_t *tempSrcBlock= NULL; | |||
| uint8_t *dstBlockPtrBackup; | |||
| uint8_t *srcBlockPtrBackup; | |||
| #ifdef TIMING | |||
| long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; | |||
| sumTime= rdtsc(); | |||
| @@ -2407,6 +2414,8 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| { | |||
| tempDst= (uint8_t*)memalign(8, 1024*24); | |||
| tempSrc= (uint8_t*)memalign(8, 1024*24); | |||
| tempDstBlock= (uint8_t*)memalign(8, 1024*24); | |||
| tempSrcBlock= (uint8_t*)memalign(8, 1024*24); | |||
| } | |||
| if(!yHistogram) | |||
| @@ -2414,6 +2423,12 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| int i; | |||
| yHistogram= (uint64_t*)malloc(8*256); | |||
| for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256; | |||
| if(mode & FULL_Y_RANGE) | |||
| { | |||
| maxAllowedY=255; | |||
| minAllowedY=0; | |||
| } | |||
| } | |||
| if(!isColor) | |||
| @@ -2505,6 +2520,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| srcBlock= tempSrc; | |||
| } | |||
| // From this point on it is guaranteed that we can read and write 16 lines downward | |||
| // finish 1 block before the next, otherwise we might have a problem | |||
| // with the L1 cache of the P4 ... or only a few blocks at a time or something | |||
| for(x=0; x<width; x+=BLOCK_SIZE) | |||
| @@ -2545,6 +2561,23 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++; | |||
| //can we mess with a 8x16 block, if not use a temp buffer, yes again | |||
| if(x+7 >= width) | |||
| { | |||
| int i; | |||
| dstBlockPtrBackup= dstBlock; | |||
| srcBlockPtrBackup= srcBlock; | |||
| for(i=0;i<BLOCK_SIZE*2; i++) | |||
| { | |||
| memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x); | |||
| memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x); | |||
| } | |||
| dstBlock= tempDstBlock; | |||
| srcBlock= tempSrcBlock; | |||
| } | |||
| blockCopy(dstBlock + dstStride*5, dstStride, | |||
| srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX); | |||
| @@ -2593,7 +2626,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| } | |||
| /* check if we have a previous block to deblock it with dstBlock */ | |||
| if(x - 8 >= 0 && x<width) | |||
| if(x - 8 >= 0) | |||
| { | |||
| #ifdef MORE_TIMING | |||
| T0= rdtsc(); | |||
| @@ -2624,12 +2657,25 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| dering(dstBlock - stride*9 + width-9, stride, QP); | |||
| //FIXME dering filter will not be applied to last block (bottom right) | |||
| /* did we use a tmp-block buffer */ | |||
| if(x+7 >= width) | |||
| { | |||
| int i; | |||
| dstBlock= dstBlockPtrBackup; | |||
| srcBlock= srcBlockPtrBackup; | |||
| for(i=0;i<BLOCK_SIZE*2; i++) | |||
| { | |||
| memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x); | |||
| } | |||
| } | |||
| dstBlock+=8; | |||
| srcBlock+=8; | |||
| } | |||
| /* did we use a tmp buffer */ | |||
| if(y+15 > height) | |||
| if(y+15 >= height) | |||
| { | |||
| uint8_t *dstBlock= &(dst[y*dstStride]); | |||
| memcpy(dstBlock, tempDst, dstStride*(height-y) ); | |||