bugfix bugs? Originally committed as revision 2455 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc tags/v0.5
| @@ -23,9 +23,9 @@ isVertMinMaxOk Ec Ec | |||||
| doVertLowPass E e e | doVertLowPass E e e | ||||
| doVertDefFilter Ec Ec Ec | doVertDefFilter Ec Ec Ec | ||||
| isHorizDC Ec Ec | isHorizDC Ec Ec | ||||
| isHorizMinMaxOk a | |||||
| doHorizLowPass E a a | |||||
| doHorizDefFilter E ac ac | |||||
| isHorizMinMaxOk a E | |||||
| doHorizLowPass E e e | |||||
| doHorizDefFilter E E E | |||||
| deRing | deRing | ||||
| Vertical RKAlgo1 E a a | Vertical RKAlgo1 E a a | ||||
| Vertical X1 a E E | Vertical X1 a E E | ||||
| @@ -60,7 +60,6 @@ compare the quality & speed of all filters | |||||
| split this huge file | split this huge file | ||||
| fix warnings (unused vars, ...) | fix warnings (unused vars, ...) | ||||
| noise reduction filters | noise reduction filters | ||||
| write an exact implementation of the horizontal deblocking filter | |||||
| ... | ... | ||||
| Notes: | Notes: | ||||
| @@ -128,7 +127,7 @@ static uint64_t temp3=0; | |||||
| static uint64_t temp4=0; | static uint64_t temp4=0; | ||||
| static uint64_t temp5=0; | static uint64_t temp5=0; | ||||
| static uint64_t pQPb=0; | static uint64_t pQPb=0; | ||||
| static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data | |||||
| static uint8_t tempBlocks[8*16*2]; //used for the horizontal code | |||||
| int hFlatnessThreshold= 56 - 16; | int hFlatnessThreshold= 56 - 16; | ||||
| int vFlatnessThreshold= 56 - 16; | int vFlatnessThreshold= 56 - 16; | ||||
| @@ -277,6 +276,7 @@ asm volatile( | |||||
| "movd %%mm0, %0 \n\t" | "movd %%mm0, %0 \n\t" | ||||
| : "=r" (numEq) | : "=r" (numEq) | ||||
| : "r" (src), "r" (stride) | : "r" (src), "r" (stride) | ||||
| : "%eax", "%ebx" | |||||
| ); | ); | ||||
| numEq= (256 - numEq) &0xFF; | numEq= (256 - numEq) &0xFF; | ||||
| @@ -850,7 +850,7 @@ static inline void horizX1Filter(uint8_t *src, int stride, int QP) | |||||
| } | } | ||||
| } | } | ||||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||||
| #if 0 | |||||
| asm volatile( | asm volatile( | ||||
| "pxor %%mm7, %%mm7 \n\t" // 0 | "pxor %%mm7, %%mm7 \n\t" // 0 | ||||
| // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | ||||
| @@ -1295,13 +1295,13 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | |||||
| //FIXME? |255-0| = 1 | //FIXME? |255-0| = 1 | ||||
| /** | /** | ||||
| * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock. | |||||
| * Check if the given 8x8 Block is mostly "flat" | |||||
| */ | */ | ||||
| static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride) | |||||
| static inline int isHorizDC(uint8_t src[], int stride) | |||||
| { | { | ||||
| // src++; | // src++; | ||||
| int numEq= 0; | int numEq= 0; | ||||
| #ifdef HAVE_MMX | |||||
| #if 0 | |||||
| asm volatile ( | asm volatile ( | ||||
| // "int $3 \n\t" | // "int $3 \n\t" | ||||
| "leal (%1, %2), %%ecx \n\t" | "leal (%1, %2), %%ecx \n\t" | ||||
| @@ -1386,14 +1386,6 @@ asm volatile ( | |||||
| if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; | if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; | ||||
| if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; | if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; | ||||
| if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; | if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; | ||||
| tempBlock[0 + y*TEMP_STRIDE] = src[0]; | |||||
| tempBlock[1 + y*TEMP_STRIDE] = src[1]; | |||||
| tempBlock[2 + y*TEMP_STRIDE] = src[2]; | |||||
| tempBlock[3 + y*TEMP_STRIDE] = src[3]; | |||||
| tempBlock[4 + y*TEMP_STRIDE] = src[4]; | |||||
| tempBlock[5 + y*TEMP_STRIDE] = src[5]; | |||||
| tempBlock[6 + y*TEMP_STRIDE] = src[6]; | |||||
| tempBlock[7 + y*TEMP_STRIDE] = src[7]; | |||||
| src+= stride; | src+= stride; | ||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -1416,40 +1408,14 @@ asm volatile ( | |||||
| static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) | static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) | ||||
| { | { | ||||
| #ifdef MMX_FIXME | |||||
| FIXME | |||||
| int isOk; | |||||
| asm volatile( | |||||
| // "int $3 \n\t" | |||||
| "movq (%1, %2), %%mm0 \n\t" | |||||
| "movq (%1, %2, 8), %%mm1 \n\t" | |||||
| "movq %%mm0, %%mm2 \n\t" | |||||
| "psubusb %%mm1, %%mm0 \n\t" | |||||
| "psubusb %%mm2, %%mm1 \n\t" | |||||
| "por %%mm1, %%mm0 \n\t" // ABS Diff | |||||
| "movq pQPb, %%mm7 \n\t" // QP,..., QP | |||||
| "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | |||||
| "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 | |||||
| "pcmpeqd b00, %%mm0 \n\t" | |||||
| "psrlq $16, %%mm0 \n\t" | |||||
| "pcmpeqd bFF, %%mm0 \n\t" | |||||
| // "movd %%mm0, (%1, %2, 4)\n\t" | |||||
| "movd %%mm0, %0 \n\t" | |||||
| : "=r" (isOk) | |||||
| : "r" (src), "r" (stride) | |||||
| ); | |||||
| return isOk; | |||||
| #else | |||||
| if(abs(src[0] - src[7]) > 2*QP) return 0; | if(abs(src[0] - src[7]) > 2*QP) return 0; | ||||
| return 1; | return 1; | ||||
| #endif | |||||
| } | } | ||||
| static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) | |||||
| static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) | |||||
| { | { | ||||
| #ifdef HAVE_MMX | |||||
| #if 0 | |||||
| asm volatile( | asm volatile( | ||||
| "leal (%0, %1), %%ecx \n\t" | "leal (%0, %1), %%ecx \n\t" | ||||
| "leal (%%ecx, %1, 4), %%ebx \n\t" | "leal (%%ecx, %1, 4), %%ebx \n\t" | ||||
| @@ -1536,27 +1502,16 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP | |||||
| : "%eax", "%ebx", "%ecx" | : "%eax", "%ebx", "%ecx" | ||||
| ); | ); | ||||
| #else | #else | ||||
| uint8_t *src= tempBlock; | |||||
| int y; | int y; | ||||
| for(y=0; y<BLOCK_SIZE; y++) | for(y=0; y<BLOCK_SIZE; y++) | ||||
| { | { | ||||
| const int middleEnergy= 5*(src[4] - src[5]) + 2*(src[2] - src[5]); | |||||
| dst[0] = src[0]; | |||||
| dst[1] = src[1]; | |||||
| dst[2] = src[2]; | |||||
| dst[3] = src[3]; | |||||
| dst[4] = src[4]; | |||||
| dst[5] = src[5]; | |||||
| dst[6] = src[6]; | |||||
| dst[7] = src[7]; | |||||
| const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]); | |||||
| if(ABS(middleEnergy) < 8*QP) | if(ABS(middleEnergy) < 8*QP) | ||||
| { | { | ||||
| const int q=(src[3] - src[4])/2; | |||||
| const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]); | |||||
| const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]); | |||||
| const int q=(dst[3] - dst[4])/2; | |||||
| const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); | |||||
| const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); | |||||
| int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | ||||
| d= MAX(d, 0); | d= MAX(d, 0); | ||||
| @@ -1579,7 +1534,6 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP | |||||
| dst[4]+= d; | dst[4]+= d; | ||||
| } | } | ||||
| dst+= stride; | dst+= stride; | ||||
| src+= TEMP_STRIDE; | |||||
| } | } | ||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -1589,10 +1543,10 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP | |||||
| * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | ||||
| * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) | * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) | ||||
| */ | */ | ||||
| static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) | |||||
| static inline void doHorizLowPass(uint8_t dst[], int stride, int QP) | |||||
| { | { | ||||
| //return; | |||||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||||
| #if 0 | |||||
| asm volatile( | asm volatile( | ||||
| "leal (%0, %1), %%ecx \n\t" | "leal (%0, %1), %%ecx \n\t" | ||||
| "leal (%%ecx, %1, 4), %%ebx \n\t" | "leal (%%ecx, %1, 4), %%ebx \n\t" | ||||
| @@ -1802,7 +1756,6 @@ Implemented Exact 7-Tap | |||||
| ); | ); | ||||
| #else | #else | ||||
| uint8_t *temp= tempBlock; | |||||
| int y; | int y; | ||||
| for(y=0; y<BLOCK_SIZE; y++) | for(y=0; y<BLOCK_SIZE; y++) | ||||
| { | { | ||||
| @@ -1810,15 +1763,15 @@ Implemented Exact 7-Tap | |||||
| const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; | const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; | ||||
| int sums[9]; | int sums[9]; | ||||
| sums[0] = first + temp[0]; | |||||
| sums[1] = temp[0] + temp[1]; | |||||
| sums[2] = temp[1] + temp[2]; | |||||
| sums[3] = temp[2] + temp[3]; | |||||
| sums[4] = temp[3] + temp[4]; | |||||
| sums[5] = temp[4] + temp[5]; | |||||
| sums[6] = temp[5] + temp[6]; | |||||
| sums[7] = temp[6] + temp[7]; | |||||
| sums[8] = temp[7] + last; | |||||
| sums[0] = first + dst[0]; | |||||
| sums[1] = dst[0] + dst[1]; | |||||
| sums[2] = dst[1] + dst[2]; | |||||
| sums[3] = dst[2] + dst[3]; | |||||
| sums[4] = dst[3] + dst[4]; | |||||
| sums[5] = dst[4] + dst[5]; | |||||
| sums[6] = dst[5] + dst[6]; | |||||
| sums[7] = dst[6] + dst[7]; | |||||
| sums[8] = dst[7] + last; | |||||
| dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; | dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; | ||||
| dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4; | dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4; | ||||
| @@ -1830,12 +1783,10 @@ Implemented Exact 7-Tap | |||||
| dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4; | dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4; | ||||
| dst+= stride; | dst+= stride; | ||||
| temp+= TEMP_STRIDE; | |||||
| } | } | ||||
| #endif | #endif | ||||
| } | } | ||||
| static inline void dering(uint8_t src[], int stride, int QP) | static inline void dering(uint8_t src[], int stride, int QP) | ||||
| { | { | ||||
| //FIXME | //FIXME | ||||
| @@ -2185,6 +2136,171 @@ MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8)) | |||||
| #endif | #endif | ||||
| } | } | ||||
| /** | |||||
| * transposes and shifts the given 8x8 Block into dst1 and dst2 | |||||
| */ | |||||
| static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) | |||||
| { | |||||
| asm( | |||||
| "leal (%0, %1), %%eax \n\t" | |||||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||||
| // 0 1 2 3 4 5 6 7 8 9 | |||||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||||
| "movq (%0), %%mm0 \n\t" // 12345678 | |||||
| "movq (%%eax), %%mm1 \n\t" // abcdefgh | |||||
| "movq %%mm0, %%mm2 \n\t" // 12345678 | |||||
| "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |||||
| "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |||||
| "movq (%%eax, %1), %%mm1 \n\t" | |||||
| "movq (%%eax, %1, 2), %%mm3 \n\t" | |||||
| "movq %%mm1, %%mm4 \n\t" | |||||
| "punpcklbw %%mm3, %%mm1 \n\t" | |||||
| "punpckhbw %%mm3, %%mm4 \n\t" | |||||
| "movq %%mm0, %%mm3 \n\t" | |||||
| "punpcklwd %%mm1, %%mm0 \n\t" | |||||
| "punpckhwd %%mm1, %%mm3 \n\t" | |||||
| "movq %%mm2, %%mm1 \n\t" | |||||
| "punpcklwd %%mm4, %%mm2 \n\t" | |||||
| "punpckhwd %%mm4, %%mm1 \n\t" | |||||
| "movd %%mm0, 128(%2) \n\t" | |||||
| "psrlq $32, %%mm0 \n\t" | |||||
| "movd %%mm0, 144(%2) \n\t" | |||||
| "movd %%mm3, 160(%2) \n\t" | |||||
| "psrlq $32, %%mm3 \n\t" | |||||
| "movd %%mm3, 176(%2) \n\t" | |||||
| "movd %%mm3, 48(%3) \n\t" | |||||
| "movd %%mm2, 192(%2) \n\t" | |||||
| "movd %%mm2, 64(%3) \n\t" | |||||
| "psrlq $32, %%mm2 \n\t" | |||||
| "movd %%mm2, 80(%3) \n\t" | |||||
| "movd %%mm1, 96(%3) \n\t" | |||||
| "psrlq $32, %%mm1 \n\t" | |||||
| "movd %%mm1, 112(%3) \n\t" | |||||
| "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 | |||||
| "movq (%%ebx), %%mm1 \n\t" // abcdefgh | |||||
| "movq %%mm0, %%mm2 \n\t" // 12345678 | |||||
| "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |||||
| "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |||||
| "movq (%%ebx, %1), %%mm1 \n\t" | |||||
| "movq (%%ebx, %1, 2), %%mm3 \n\t" | |||||
| "movq %%mm1, %%mm4 \n\t" | |||||
| "punpcklbw %%mm3, %%mm1 \n\t" | |||||
| "punpckhbw %%mm3, %%mm4 \n\t" | |||||
| "movq %%mm0, %%mm3 \n\t" | |||||
| "punpcklwd %%mm1, %%mm0 \n\t" | |||||
| "punpckhwd %%mm1, %%mm3 \n\t" | |||||
| "movq %%mm2, %%mm1 \n\t" | |||||
| "punpcklwd %%mm4, %%mm2 \n\t" | |||||
| "punpckhwd %%mm4, %%mm1 \n\t" | |||||
| "movd %%mm0, 132(%2) \n\t" | |||||
| "psrlq $32, %%mm0 \n\t" | |||||
| "movd %%mm0, 148(%2) \n\t" | |||||
| "movd %%mm3, 164(%2) \n\t" | |||||
| "psrlq $32, %%mm3 \n\t" | |||||
| "movd %%mm3, 180(%2) \n\t" | |||||
| "movd %%mm3, 52(%3) \n\t" | |||||
| "movd %%mm2, 196(%2) \n\t" | |||||
| "movd %%mm2, 68(%3) \n\t" | |||||
| "psrlq $32, %%mm2 \n\t" | |||||
| "movd %%mm2, 84(%3) \n\t" | |||||
| "movd %%mm1, 100(%3) \n\t" | |||||
| "psrlq $32, %%mm1 \n\t" | |||||
| "movd %%mm1, 116(%3) \n\t" | |||||
| :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) | |||||
| : "%eax", "%ebx" | |||||
| ); | |||||
| } | |||||
| /** | |||||
| * transposes the given 8x8 block | |||||
| */ | |||||
| static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src) | |||||
| { | |||||
| asm( | |||||
| "leal (%0, %1), %%eax \n\t" | |||||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||||
| // 0 1 2 3 4 5 6 7 8 9 | |||||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||||
| "movq (%2), %%mm0 \n\t" // 12345678 | |||||
| "movq 16(%2), %%mm1 \n\t" // abcdefgh | |||||
| "movq %%mm0, %%mm2 \n\t" // 12345678 | |||||
| "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |||||
| "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |||||
| "movq 32(%2), %%mm1 \n\t" | |||||
| "movq 48(%2), %%mm3 \n\t" | |||||
| "movq %%mm1, %%mm4 \n\t" | |||||
| "punpcklbw %%mm3, %%mm1 \n\t" | |||||
| "punpckhbw %%mm3, %%mm4 \n\t" | |||||
| "movq %%mm0, %%mm3 \n\t" | |||||
| "punpcklwd %%mm1, %%mm0 \n\t" | |||||
| "punpckhwd %%mm1, %%mm3 \n\t" | |||||
| "movq %%mm2, %%mm1 \n\t" | |||||
| "punpcklwd %%mm4, %%mm2 \n\t" | |||||
| "punpckhwd %%mm4, %%mm1 \n\t" | |||||
| "movd %%mm0, (%0) \n\t" | |||||
| "psrlq $32, %%mm0 \n\t" | |||||
| "movd %%mm0, (%%eax) \n\t" | |||||
| "movd %%mm3, (%%eax, %1) \n\t" | |||||
| "psrlq $32, %%mm3 \n\t" | |||||
| "movd %%mm3, (%%eax, %1, 2) \n\t" | |||||
| "movd %%mm2, (%0, %1, 4) \n\t" | |||||
| "psrlq $32, %%mm2 \n\t" | |||||
| "movd %%mm2, (%%ebx) \n\t" | |||||
| "movd %%mm1, (%%ebx, %1) \n\t" | |||||
| "psrlq $32, %%mm1 \n\t" | |||||
| "movd %%mm1, (%%ebx, %1, 2) \n\t" | |||||
| "movq 64(%2), %%mm0 \n\t" // 12345678 | |||||
| "movq 80(%2), %%mm1 \n\t" // abcdefgh | |||||
| "movq %%mm0, %%mm2 \n\t" // 12345678 | |||||
| "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |||||
| "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |||||
| "movq 96(%2), %%mm1 \n\t" | |||||
| "movq 112(%2), %%mm3 \n\t" | |||||
| "movq %%mm1, %%mm4 \n\t" | |||||
| "punpcklbw %%mm3, %%mm1 \n\t" | |||||
| "punpckhbw %%mm3, %%mm4 \n\t" | |||||
| "movq %%mm0, %%mm3 \n\t" | |||||
| "punpcklwd %%mm1, %%mm0 \n\t" | |||||
| "punpckhwd %%mm1, %%mm3 \n\t" | |||||
| "movq %%mm2, %%mm1 \n\t" | |||||
| "punpcklwd %%mm4, %%mm2 \n\t" | |||||
| "punpckhwd %%mm4, %%mm1 \n\t" | |||||
| "movd %%mm0, 4(%0) \n\t" | |||||
| "psrlq $32, %%mm0 \n\t" | |||||
| "movd %%mm0, 4(%%eax) \n\t" | |||||
| "movd %%mm3, 4(%%eax, %1) \n\t" | |||||
| "psrlq $32, %%mm3 \n\t" | |||||
| "movd %%mm3, 4(%%eax, %1, 2) \n\t" | |||||
| "movd %%mm2, 4(%0, %1, 4) \n\t" | |||||
| "psrlq $32, %%mm2 \n\t" | |||||
| "movd %%mm2, 4(%%ebx) \n\t" | |||||
| "movd %%mm1, 4(%%ebx, %1) \n\t" | |||||
| "psrlq $32, %%mm1 \n\t" | |||||
| "movd %%mm1, 4(%%ebx, %1, 2) \n\t" | |||||
| :: "r" (dst), "r" (dstStride), "r" (src) | |||||
| : "%eax", "%ebx" | |||||
| ); | |||||
| } | |||||
| #ifdef HAVE_ODIVX_POSTPROCESS | #ifdef HAVE_ODIVX_POSTPROCESS | ||||
| #include "../opendivx/postprocess.h" | #include "../opendivx/postprocess.h" | ||||
| int use_old_pp=0; | int use_old_pp=0; | ||||
| @@ -2710,6 +2826,8 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||||
| int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; | int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; | ||||
| int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); | int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); | ||||
| int QPFrac= QPDelta; | int QPFrac= QPDelta; | ||||
| uint8_t *tempBlock1= tempBlocks; | |||||
| uint8_t *tempBlock2= tempBlocks + 8; | |||||
| #endif | #endif | ||||
| /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not | /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not | ||||
| then use a temporary buffer */ | then use a temporary buffer */ | ||||
| @@ -2742,6 +2860,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||||
| for(x=0; x<width; x+=BLOCK_SIZE) | for(x=0; x<width; x+=BLOCK_SIZE) | ||||
| { | { | ||||
| const int stride= dstStride; | const int stride= dstStride; | ||||
| uint8_t *tmpXchg; | |||||
| #ifdef ARCH_X86 | #ifdef ARCH_X86 | ||||
| int QP= *QPptr; | int QP= *QPptr; | ||||
| asm volatile( | asm volatile( | ||||
| @@ -2882,25 +3001,47 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||||
| T0=T1; | T0=T1; | ||||
| #endif | #endif | ||||
| } | } | ||||
| #ifdef HAVE_MMX | |||||
| transpose1(tempBlock1, tempBlock2, dstBlock, dstStride); | |||||
| #endif | |||||
| /* check if we have a previous block to deblock it with dstBlock */ | /* check if we have a previous block to deblock it with dstBlock */ | ||||
| if(x - 8 >= 0) | if(x - 8 >= 0) | ||||
| { | { | ||||
| #ifdef MORE_TIMING | #ifdef MORE_TIMING | ||||
| T0= rdtsc(); | T0= rdtsc(); | ||||
| #endif | #endif | ||||
| #ifdef HAVE_MMX | |||||
| if(mode & H_RK1_FILTER) | |||||
| vertRK1Filter(tempBlock1, 16, QP); | |||||
| else if(mode & H_X1_FILTER) | |||||
| vertX1Filter(tempBlock1, 16, QP); | |||||
| else if(mode & H_DEBLOCK) | |||||
| { | |||||
| if( isVertDC(tempBlock1, 16)) | |||||
| { | |||||
| if(isVertMinMaxOk(tempBlock1, 16, QP)) | |||||
| doVertLowPass(tempBlock1, 16, QP); | |||||
| } | |||||
| else | |||||
| doVertDefFilter(tempBlock1, 16, QP); | |||||
| } | |||||
| transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16); | |||||
| #else | |||||
| if(mode & H_X1_FILTER) | if(mode & H_X1_FILTER) | ||||
| horizX1Filter(dstBlock-4, stride, QP); | horizX1Filter(dstBlock-4, stride, QP); | ||||
| else if(mode & H_DEBLOCK) | else if(mode & H_DEBLOCK) | ||||
| { | { | ||||
| if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) | |||||
| if( isHorizDC(dstBlock-4, stride)) | |||||
| { | { | ||||
| if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | |||||
| doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | |||||
| if(isHorizMinMaxOk(dstBlock-4, stride, QP)) | |||||
| doHorizLowPass(dstBlock-4, stride, QP); | |||||
| } | } | ||||
| else | else | ||||
| doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | |||||
| doHorizDefFilter(dstBlock-4, stride, QP); | |||||
| } | } | ||||
| #endif | |||||
| #ifdef MORE_TIMING | #ifdef MORE_TIMING | ||||
| T1= rdtsc(); | T1= rdtsc(); | ||||
| horizTime+= T1-T0; | horizTime+= T1-T0; | ||||
| @@ -2929,6 +3070,10 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||||
| dstBlock+=8; | dstBlock+=8; | ||||
| srcBlock+=8; | srcBlock+=8; | ||||
| tmpXchg= tempBlock1; | |||||
| tempBlock1= tempBlock2; | |||||
| tempBlock2 = tmpXchg; | |||||
| } | } | ||||
| /* did we use a tmp buffer */ | /* did we use a tmp buffer */ | ||||
| @@ -23,9 +23,9 @@ isVertMinMaxOk Ec Ec | |||||
| doVertLowPass E e e | doVertLowPass E e e | ||||
| doVertDefFilter Ec Ec Ec | doVertDefFilter Ec Ec Ec | ||||
| isHorizDC Ec Ec | isHorizDC Ec Ec | ||||
| isHorizMinMaxOk a | |||||
| doHorizLowPass E a a | |||||
| doHorizDefFilter E ac ac | |||||
| isHorizMinMaxOk a E | |||||
| doHorizLowPass E e e | |||||
| doHorizDefFilter E E E | |||||
| deRing | deRing | ||||
| Vertical RKAlgo1 E a a | Vertical RKAlgo1 E a a | ||||
| Vertical X1 a E E | Vertical X1 a E E | ||||
| @@ -60,7 +60,6 @@ compare the quality & speed of all filters | |||||
| split this huge file | split this huge file | ||||
| fix warnings (unused vars, ...) | fix warnings (unused vars, ...) | ||||
| noise reduction filters | noise reduction filters | ||||
| write an exact implementation of the horizontal deblocking filter | |||||
| ... | ... | ||||
| Notes: | Notes: | ||||
| @@ -128,7 +127,7 @@ static uint64_t temp3=0; | |||||
| static uint64_t temp4=0; | static uint64_t temp4=0; | ||||
| static uint64_t temp5=0; | static uint64_t temp5=0; | ||||
| static uint64_t pQPb=0; | static uint64_t pQPb=0; | ||||
| static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data | |||||
| static uint8_t tempBlocks[8*16*2]; //used for the horizontal code | |||||
| int hFlatnessThreshold= 56 - 16; | int hFlatnessThreshold= 56 - 16; | ||||
| int vFlatnessThreshold= 56 - 16; | int vFlatnessThreshold= 56 - 16; | ||||
| @@ -277,6 +276,7 @@ asm volatile( | |||||
| "movd %%mm0, %0 \n\t" | "movd %%mm0, %0 \n\t" | ||||
| : "=r" (numEq) | : "=r" (numEq) | ||||
| : "r" (src), "r" (stride) | : "r" (src), "r" (stride) | ||||
| : "%eax", "%ebx" | |||||
| ); | ); | ||||
| numEq= (256 - numEq) &0xFF; | numEq= (256 - numEq) &0xFF; | ||||
| @@ -850,7 +850,7 @@ static inline void horizX1Filter(uint8_t *src, int stride, int QP) | |||||
| } | } | ||||
| } | } | ||||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||||
| #if 0 | |||||
| asm volatile( | asm volatile( | ||||
| "pxor %%mm7, %%mm7 \n\t" // 0 | "pxor %%mm7, %%mm7 \n\t" // 0 | ||||
| // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | ||||
| @@ -1295,13 +1295,13 @@ static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | |||||
| //FIXME? |255-0| = 1 | //FIXME? |255-0| = 1 | ||||
| /** | /** | ||||
| * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock. | |||||
| * Check if the given 8x8 Block is mostly "flat" | |||||
| */ | */ | ||||
| static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride) | |||||
| static inline int isHorizDC(uint8_t src[], int stride) | |||||
| { | { | ||||
| // src++; | // src++; | ||||
| int numEq= 0; | int numEq= 0; | ||||
| #ifdef HAVE_MMX | |||||
| #if 0 | |||||
| asm volatile ( | asm volatile ( | ||||
| // "int $3 \n\t" | // "int $3 \n\t" | ||||
| "leal (%1, %2), %%ecx \n\t" | "leal (%1, %2), %%ecx \n\t" | ||||
| @@ -1386,14 +1386,6 @@ asm volatile ( | |||||
| if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; | if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; | ||||
| if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; | if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; | ||||
| if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; | if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; | ||||
| tempBlock[0 + y*TEMP_STRIDE] = src[0]; | |||||
| tempBlock[1 + y*TEMP_STRIDE] = src[1]; | |||||
| tempBlock[2 + y*TEMP_STRIDE] = src[2]; | |||||
| tempBlock[3 + y*TEMP_STRIDE] = src[3]; | |||||
| tempBlock[4 + y*TEMP_STRIDE] = src[4]; | |||||
| tempBlock[5 + y*TEMP_STRIDE] = src[5]; | |||||
| tempBlock[6 + y*TEMP_STRIDE] = src[6]; | |||||
| tempBlock[7 + y*TEMP_STRIDE] = src[7]; | |||||
| src+= stride; | src+= stride; | ||||
| } | } | ||||
| #endif | #endif | ||||
| @@ -1416,40 +1408,14 @@ asm volatile ( | |||||
| static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) | static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) | ||||
| { | { | ||||
| #ifdef MMX_FIXME | |||||
| FIXME | |||||
| int isOk; | |||||
| asm volatile( | |||||
| // "int $3 \n\t" | |||||
| "movq (%1, %2), %%mm0 \n\t" | |||||
| "movq (%1, %2, 8), %%mm1 \n\t" | |||||
| "movq %%mm0, %%mm2 \n\t" | |||||
| "psubusb %%mm1, %%mm0 \n\t" | |||||
| "psubusb %%mm2, %%mm1 \n\t" | |||||
| "por %%mm1, %%mm0 \n\t" // ABS Diff | |||||
| "movq pQPb, %%mm7 \n\t" // QP,..., QP | |||||
| "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | |||||
| "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 | |||||
| "pcmpeqd b00, %%mm0 \n\t" | |||||
| "psrlq $16, %%mm0 \n\t" | |||||
| "pcmpeqd bFF, %%mm0 \n\t" | |||||
| // "movd %%mm0, (%1, %2, 4)\n\t" | |||||
| "movd %%mm0, %0 \n\t" | |||||
| : "=r" (isOk) | |||||
| : "r" (src), "r" (stride) | |||||
| ); | |||||
| return isOk; | |||||
| #else | |||||
| if(abs(src[0] - src[7]) > 2*QP) return 0; | if(abs(src[0] - src[7]) > 2*QP) return 0; | ||||
| return 1; | return 1; | ||||
| #endif | |||||
| } | } | ||||
| static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) | |||||
| static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) | |||||
| { | { | ||||
| #ifdef HAVE_MMX | |||||
| #if 0 | |||||
| asm volatile( | asm volatile( | ||||
| "leal (%0, %1), %%ecx \n\t" | "leal (%0, %1), %%ecx \n\t" | ||||
| "leal (%%ecx, %1, 4), %%ebx \n\t" | "leal (%%ecx, %1, 4), %%ebx \n\t" | ||||
| @@ -1536,27 +1502,16 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP | |||||
| : "%eax", "%ebx", "%ecx" | : "%eax", "%ebx", "%ecx" | ||||
| ); | ); | ||||
| #else | #else | ||||
| uint8_t *src= tempBlock; | |||||
| int y; | int y; | ||||
| for(y=0; y<BLOCK_SIZE; y++) | for(y=0; y<BLOCK_SIZE; y++) | ||||
| { | { | ||||
| const int middleEnergy= 5*(src[4] - src[5]) + 2*(src[2] - src[5]); | |||||
| dst[0] = src[0]; | |||||
| dst[1] = src[1]; | |||||
| dst[2] = src[2]; | |||||
| dst[3] = src[3]; | |||||
| dst[4] = src[4]; | |||||
| dst[5] = src[5]; | |||||
| dst[6] = src[6]; | |||||
| dst[7] = src[7]; | |||||
| const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]); | |||||
| if(ABS(middleEnergy) < 8*QP) | if(ABS(middleEnergy) < 8*QP) | ||||
| { | { | ||||
| const int q=(src[3] - src[4])/2; | |||||
| const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]); | |||||
| const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]); | |||||
| const int q=(dst[3] - dst[4])/2; | |||||
| const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); | |||||
| const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); | |||||
| int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | ||||
| d= MAX(d, 0); | d= MAX(d, 0); | ||||
| @@ -1579,7 +1534,6 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP | |||||
| dst[4]+= d; | dst[4]+= d; | ||||
| } | } | ||||
| dst+= stride; | dst+= stride; | ||||
| src+= TEMP_STRIDE; | |||||
| } | } | ||||
| #endif | #endif | ||||
| } | } | ||||
| @@ -1589,10 +1543,10 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP | |||||
| * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | ||||
| * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) | * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) | ||||
| */ | */ | ||||
| static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) | |||||
| static inline void doHorizLowPass(uint8_t dst[], int stride, int QP) | |||||
| { | { | ||||
| //return; | |||||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||||
| #if 0 | |||||
| asm volatile( | asm volatile( | ||||
| "leal (%0, %1), %%ecx \n\t" | "leal (%0, %1), %%ecx \n\t" | ||||
| "leal (%%ecx, %1, 4), %%ebx \n\t" | "leal (%%ecx, %1, 4), %%ebx \n\t" | ||||
| @@ -1802,7 +1756,6 @@ Implemented Exact 7-Tap | |||||
| ); | ); | ||||
| #else | #else | ||||
| uint8_t *temp= tempBlock; | |||||
| int y; | int y; | ||||
| for(y=0; y<BLOCK_SIZE; y++) | for(y=0; y<BLOCK_SIZE; y++) | ||||
| { | { | ||||
| @@ -1810,15 +1763,15 @@ Implemented Exact 7-Tap | |||||
| const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; | const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; | ||||
| int sums[9]; | int sums[9]; | ||||
| sums[0] = first + temp[0]; | |||||
| sums[1] = temp[0] + temp[1]; | |||||
| sums[2] = temp[1] + temp[2]; | |||||
| sums[3] = temp[2] + temp[3]; | |||||
| sums[4] = temp[3] + temp[4]; | |||||
| sums[5] = temp[4] + temp[5]; | |||||
| sums[6] = temp[5] + temp[6]; | |||||
| sums[7] = temp[6] + temp[7]; | |||||
| sums[8] = temp[7] + last; | |||||
| sums[0] = first + dst[0]; | |||||
| sums[1] = dst[0] + dst[1]; | |||||
| sums[2] = dst[1] + dst[2]; | |||||
| sums[3] = dst[2] + dst[3]; | |||||
| sums[4] = dst[3] + dst[4]; | |||||
| sums[5] = dst[4] + dst[5]; | |||||
| sums[6] = dst[5] + dst[6]; | |||||
| sums[7] = dst[6] + dst[7]; | |||||
| sums[8] = dst[7] + last; | |||||
| dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; | dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; | ||||
| dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4; | dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4; | ||||
| @@ -1830,12 +1783,10 @@ Implemented Exact 7-Tap | |||||
| dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4; | dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4; | ||||
| dst+= stride; | dst+= stride; | ||||
| temp+= TEMP_STRIDE; | |||||
| } | } | ||||
| #endif | #endif | ||||
| } | } | ||||
| static inline void dering(uint8_t src[], int stride, int QP) | static inline void dering(uint8_t src[], int stride, int QP) | ||||
| { | { | ||||
| //FIXME | //FIXME | ||||
| @@ -2185,6 +2136,171 @@ MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8)) | |||||
| #endif | #endif | ||||
| } | } | ||||
| /** | |||||
| * transposes and shifts the given 8x8 Block into dst1 and dst2 | |||||
| */ | |||||
| static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) | |||||
| { | |||||
| asm( | |||||
| "leal (%0, %1), %%eax \n\t" | |||||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||||
| // 0 1 2 3 4 5 6 7 8 9 | |||||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||||
| "movq (%0), %%mm0 \n\t" // 12345678 | |||||
| "movq (%%eax), %%mm1 \n\t" // abcdefgh | |||||
| "movq %%mm0, %%mm2 \n\t" // 12345678 | |||||
| "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |||||
| "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |||||
| "movq (%%eax, %1), %%mm1 \n\t" | |||||
| "movq (%%eax, %1, 2), %%mm3 \n\t" | |||||
| "movq %%mm1, %%mm4 \n\t" | |||||
| "punpcklbw %%mm3, %%mm1 \n\t" | |||||
| "punpckhbw %%mm3, %%mm4 \n\t" | |||||
| "movq %%mm0, %%mm3 \n\t" | |||||
| "punpcklwd %%mm1, %%mm0 \n\t" | |||||
| "punpckhwd %%mm1, %%mm3 \n\t" | |||||
| "movq %%mm2, %%mm1 \n\t" | |||||
| "punpcklwd %%mm4, %%mm2 \n\t" | |||||
| "punpckhwd %%mm4, %%mm1 \n\t" | |||||
| "movd %%mm0, 128(%2) \n\t" | |||||
| "psrlq $32, %%mm0 \n\t" | |||||
| "movd %%mm0, 144(%2) \n\t" | |||||
| "movd %%mm3, 160(%2) \n\t" | |||||
| "psrlq $32, %%mm3 \n\t" | |||||
| "movd %%mm3, 176(%2) \n\t" | |||||
| "movd %%mm3, 48(%3) \n\t" | |||||
| "movd %%mm2, 192(%2) \n\t" | |||||
| "movd %%mm2, 64(%3) \n\t" | |||||
| "psrlq $32, %%mm2 \n\t" | |||||
| "movd %%mm2, 80(%3) \n\t" | |||||
| "movd %%mm1, 96(%3) \n\t" | |||||
| "psrlq $32, %%mm1 \n\t" | |||||
| "movd %%mm1, 112(%3) \n\t" | |||||
| "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 | |||||
| "movq (%%ebx), %%mm1 \n\t" // abcdefgh | |||||
| "movq %%mm0, %%mm2 \n\t" // 12345678 | |||||
| "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |||||
| "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |||||
| "movq (%%ebx, %1), %%mm1 \n\t" | |||||
| "movq (%%ebx, %1, 2), %%mm3 \n\t" | |||||
| "movq %%mm1, %%mm4 \n\t" | |||||
| "punpcklbw %%mm3, %%mm1 \n\t" | |||||
| "punpckhbw %%mm3, %%mm4 \n\t" | |||||
| "movq %%mm0, %%mm3 \n\t" | |||||
| "punpcklwd %%mm1, %%mm0 \n\t" | |||||
| "punpckhwd %%mm1, %%mm3 \n\t" | |||||
| "movq %%mm2, %%mm1 \n\t" | |||||
| "punpcklwd %%mm4, %%mm2 \n\t" | |||||
| "punpckhwd %%mm4, %%mm1 \n\t" | |||||
| "movd %%mm0, 132(%2) \n\t" | |||||
| "psrlq $32, %%mm0 \n\t" | |||||
| "movd %%mm0, 148(%2) \n\t" | |||||
| "movd %%mm3, 164(%2) \n\t" | |||||
| "psrlq $32, %%mm3 \n\t" | |||||
| "movd %%mm3, 180(%2) \n\t" | |||||
| "movd %%mm3, 52(%3) \n\t" | |||||
| "movd %%mm2, 196(%2) \n\t" | |||||
| "movd %%mm2, 68(%3) \n\t" | |||||
| "psrlq $32, %%mm2 \n\t" | |||||
| "movd %%mm2, 84(%3) \n\t" | |||||
| "movd %%mm1, 100(%3) \n\t" | |||||
| "psrlq $32, %%mm1 \n\t" | |||||
| "movd %%mm1, 116(%3) \n\t" | |||||
| :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) | |||||
| : "%eax", "%ebx" | |||||
| ); | |||||
| } | |||||
| /** | |||||
| * Transposes the given 8x8 block of bytes. | |||||
| * | |||||
| * Reads 8 rows of 8 bytes from src, with consecutive source rows a fixed | |||||
| * 16 bytes apart (offsets 0, 16, ..., 112), and stores the transposed block | |||||
| * as 8 rows of 8 bytes at dst, with consecutive rows dstStride bytes apart: | |||||
| * byte i of dst row j is byte j of src row i. | |||||
| * | |||||
| * The work is done in two identical halves. The first half interleaves src | |||||
| * rows 0-3 (bytes first, then 16 bit words) and stores the resulting 4 byte | |||||
| * groups at offset 0 of the 8 dst rows; the second half does the same with | |||||
| * src rows 4-7 and stores at offset 4 of the 8 dst rows. | |||||
| * | |||||
| * eax is set to dst + dstStride and ebx to dst + 5*dstStride, so that all 8 | |||||
| * dst rows can be addressed with scale factors of 1, 2 and 4 only. | |||||
| * | |||||
| * @param dst top left byte of the 8x8 destination block | |||||
| * @param dstStride distance in bytes between two destination rows | |||||
| * @param src top left byte of the source block (row distance is 16 bytes) | |||||
| * | |||||
| * NOTE(review): addresses are computed in the 32 bit registers eax / ebx, | |||||
| * so this presumably targets 32 bit x86 only -- confirm. | |||||
| * NOTE(review): the clobber list names only eax and ebx, although the code | |||||
| * also overwrites mm0-mm4 and stores through dst without a "memory" | |||||
| * clobber -- confirm that the compiler cannot reorder accesses to the | |||||
| * source / destination buffers around this statement. | |||||
| */ | |||||
| static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src) | |||||
| { | |||||
| asm( | |||||
| "leal (%0, %1), %%eax \n\t" | |||||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||||
| // dst row: 0 1 2 3 4 5 6 7 8 9 | |||||
| // address: %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||||
| "movq (%2), %%mm0 \n\t" // src row 0: 12345678 | |||||
| "movq 16(%2), %%mm1 \n\t" // src row 1: abcdefgh | |||||
| "movq %%mm0, %%mm2 \n\t" // copy of src row 0: 12345678 | |||||
| "punpcklbw %%mm1, %%mm0 \n\t" // low halves of rows 0 and 1 interleaved: 1a2b3c4d | |||||
| "punpckhbw %%mm1, %%mm2 \n\t" // high halves of rows 0 and 1 interleaved: 5e6f7g8h | |||||
| "movq 32(%2), %%mm1 \n\t" | |||||
| "movq 48(%2), %%mm3 \n\t" | |||||
| "movq %%mm1, %%mm4 \n\t" | |||||
| "punpcklbw %%mm3, %%mm1 \n\t" | |||||
| "punpckhbw %%mm3, %%mm4 \n\t" | |||||
| "movq %%mm0, %%mm3 \n\t" | |||||
| "punpcklwd %%mm1, %%mm0 \n\t" | |||||
| "punpckhwd %%mm1, %%mm3 \n\t" | |||||
| "movq %%mm2, %%mm1 \n\t" | |||||
| "punpcklwd %%mm4, %%mm2 \n\t" | |||||
| "punpckhwd %%mm4, %%mm1 \n\t" | |||||
| "movd %%mm0, (%0) \n\t" | |||||
| "psrlq $32, %%mm0 \n\t" | |||||
| "movd %%mm0, (%%eax) \n\t" | |||||
| "movd %%mm3, (%%eax, %1) \n\t" | |||||
| "psrlq $32, %%mm3 \n\t" | |||||
| "movd %%mm3, (%%eax, %1, 2) \n\t" | |||||
| "movd %%mm2, (%0, %1, 4) \n\t" | |||||
| "psrlq $32, %%mm2 \n\t" | |||||
| "movd %%mm2, (%%ebx) \n\t" | |||||
| "movd %%mm1, (%%ebx, %1) \n\t" | |||||
| "psrlq $32, %%mm1 \n\t" | |||||
| "movd %%mm1, (%%ebx, %1, 2) \n\t" | |||||
| "movq 64(%2), %%mm0 \n\t" // src row 4: 12345678 | |||||
| "movq 80(%2), %%mm1 \n\t" // src row 5: abcdefgh | |||||
| "movq %%mm0, %%mm2 \n\t" // copy of src row 4: 12345678 | |||||
| "punpcklbw %%mm1, %%mm0 \n\t" // low halves of rows 4 and 5 interleaved: 1a2b3c4d | |||||
| "punpckhbw %%mm1, %%mm2 \n\t" // high halves of rows 4 and 5 interleaved: 5e6f7g8h | |||||
| "movq 96(%2), %%mm1 \n\t" | |||||
| "movq 112(%2), %%mm3 \n\t" | |||||
| "movq %%mm1, %%mm4 \n\t" | |||||
| "punpcklbw %%mm3, %%mm1 \n\t" | |||||
| "punpckhbw %%mm3, %%mm4 \n\t" | |||||
| "movq %%mm0, %%mm3 \n\t" | |||||
| "punpcklwd %%mm1, %%mm0 \n\t" | |||||
| "punpckhwd %%mm1, %%mm3 \n\t" | |||||
| "movq %%mm2, %%mm1 \n\t" | |||||
| "punpcklwd %%mm4, %%mm2 \n\t" | |||||
| "punpckhwd %%mm4, %%mm1 \n\t" | |||||
| "movd %%mm0, 4(%0) \n\t" | |||||
| "psrlq $32, %%mm0 \n\t" | |||||
| "movd %%mm0, 4(%%eax) \n\t" | |||||
| "movd %%mm3, 4(%%eax, %1) \n\t" | |||||
| "psrlq $32, %%mm3 \n\t" | |||||
| "movd %%mm3, 4(%%eax, %1, 2) \n\t" | |||||
| "movd %%mm2, 4(%0, %1, 4) \n\t" | |||||
| "psrlq $32, %%mm2 \n\t" | |||||
| "movd %%mm2, 4(%%ebx) \n\t" | |||||
| "movd %%mm1, 4(%%ebx, %1) \n\t" | |||||
| "psrlq $32, %%mm1 \n\t" | |||||
| "movd %%mm1, 4(%%ebx, %1, 2) \n\t" | |||||
| :: "r" (dst), "r" (dstStride), "r" (src) | |||||
| : "%eax", "%ebx" | |||||
| ); | |||||
| } | |||||
| #ifdef HAVE_ODIVX_POSTPROCESS | #ifdef HAVE_ODIVX_POSTPROCESS | ||||
| #include "../opendivx/postprocess.h" | #include "../opendivx/postprocess.h" | ||||
| int use_old_pp=0; | int use_old_pp=0; | ||||
| @@ -2710,6 +2826,8 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||||
| int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; | int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; | ||||
| int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); | int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); | ||||
| int QPFrac= QPDelta; | int QPFrac= QPDelta; | ||||
| uint8_t *tempBlock1= tempBlocks; | |||||
| uint8_t *tempBlock2= tempBlocks + 8; | |||||
| #endif | #endif | ||||
| /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not | /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not | ||||
| than use a temporary buffer */ | than use a temporary buffer */ | ||||
| @@ -2742,6 +2860,7 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||||
| for(x=0; x<width; x+=BLOCK_SIZE) | for(x=0; x<width; x+=BLOCK_SIZE) | ||||
| { | { | ||||
| const int stride= dstStride; | const int stride= dstStride; | ||||
| uint8_t *tmpXchg; | |||||
| #ifdef ARCH_X86 | #ifdef ARCH_X86 | ||||
| int QP= *QPptr; | int QP= *QPptr; | ||||
| asm volatile( | asm volatile( | ||||
| @@ -2882,25 +3001,47 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||||
| T0=T1; | T0=T1; | ||||
| #endif | #endif | ||||
| } | } | ||||
| #ifdef HAVE_MMX | |||||
| transpose1(tempBlock1, tempBlock2, dstBlock, dstStride); | |||||
| #endif | |||||
| /* check if we have a previous block to deblock it with dstBlock */ | /* check if we have a previous block to deblock it with dstBlock */ | ||||
| if(x - 8 >= 0) | if(x - 8 >= 0) | ||||
| { | { | ||||
| #ifdef MORE_TIMING | #ifdef MORE_TIMING | ||||
| T0= rdtsc(); | T0= rdtsc(); | ||||
| #endif | #endif | ||||
| #ifdef HAVE_MMX | |||||
| if(mode & H_RK1_FILTER) | |||||
| vertRK1Filter(tempBlock1, 16, QP); | |||||
| else if(mode & H_X1_FILTER) | |||||
| vertX1Filter(tempBlock1, 16, QP); | |||||
| else if(mode & H_DEBLOCK) | |||||
| { | |||||
| if( isVertDC(tempBlock1, 16)) | |||||
| { | |||||
| if(isVertMinMaxOk(tempBlock1, 16, QP)) | |||||
| doVertLowPass(tempBlock1, 16, QP); | |||||
| } | |||||
| else | |||||
| doVertDefFilter(tempBlock1, 16, QP); | |||||
| } | |||||
| transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16); | |||||
| #else | |||||
| if(mode & H_X1_FILTER) | if(mode & H_X1_FILTER) | ||||
| horizX1Filter(dstBlock-4, stride, QP); | horizX1Filter(dstBlock-4, stride, QP); | ||||
| else if(mode & H_DEBLOCK) | else if(mode & H_DEBLOCK) | ||||
| { | { | ||||
| if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) | |||||
| if( isHorizDC(dstBlock-4, stride)) | |||||
| { | { | ||||
| if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | |||||
| doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | |||||
| if(isHorizMinMaxOk(dstBlock-4, stride, QP)) | |||||
| doHorizLowPass(dstBlock-4, stride, QP); | |||||
| } | } | ||||
| else | else | ||||
| doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | |||||
| doHorizDefFilter(dstBlock-4, stride, QP); | |||||
| } | } | ||||
| #endif | |||||
| #ifdef MORE_TIMING | #ifdef MORE_TIMING | ||||
| T1= rdtsc(); | T1= rdtsc(); | ||||
| horizTime+= T1-T0; | horizTime+= T1-T0; | ||||
| @@ -2929,6 +3070,10 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||||
| dstBlock+=8; | dstBlock+=8; | ||||
| srcBlock+=8; | srcBlock+=8; | ||||
| tmpXchg= tempBlock1; | |||||
| tempBlock1= tempBlock2; | |||||
| tempBlock2 = tmpXchg; | |||||
| } | } | ||||
| /* did we use a tmp buffer */ | /* did we use a tmp buffer */ | ||||