added deinterlace filters (linear interpolate, linear blend, median) minor cleanups (removed some outcommented stuff) Originally committed as revision 2204 to svn://svn.mplayerhq.hu/mplayer/trunk/postproctags/v0.5
| @@ -17,19 +17,22 @@ | |||
| */ | |||
| /* | |||
| C MMX MMX2 3DNow* | |||
| C MMX MMX2 3DNow | |||
| isVertDC Ec Ec | |||
| isVertMinMaxOk Ec Ec | |||
| doVertLowPass E e e* | |||
| doVertLowPass E e e | |||
| doVertDefFilter Ec Ec Ec | |||
| isHorizDC Ec Ec | |||
| isHorizMinMaxOk a | |||
| doHorizLowPass E a a* | |||
| doHorizLowPass E a a | |||
| doHorizDefFilter E ac ac | |||
| deRing | |||
| Vertical RKAlgo1 E a a* | |||
| Vertical X1 a E E* | |||
| Horizontal X1 a E E* | |||
| Vertical RKAlgo1 E a a | |||
| Vertical X1 a E E | |||
| Horizontal X1 a E E | |||
| LinIpolDeinterlace a E E* | |||
| LinBlendDeinterlace a E E* | |||
| MedianDeinterlace a E | |||
| * i dont have a 3dnow CPU -> its untested | |||
| @@ -55,6 +58,7 @@ make the mainloop more flexible (variable number of blocks at once | |||
| compare the quality & speed of all filters | |||
| implement a few simple deinterlacing filters | |||
| split this huge file | |||
| fix warnings (unused vars, ...) | |||
| ... | |||
| Notes: | |||
| @@ -63,6 +67,9 @@ Notes: | |||
| /* | |||
| Changelog: use the CVS log | |||
| rewrote the horizontal lowpass filter to fix a bug which caused a blocky look | |||
| added deinterlace filters (linear interpolate, linear blend, median) | |||
| minor cleanups (removed some outcommented stuff) | |||
| 0.1.3 | |||
| bugfixes: last 3 lines not brightness/contrast corrected | |||
| brightness statistics messed up with initial black pic | |||
| @@ -194,13 +201,11 @@ static inline void prefetcht2(void *p) | |||
| * Check if the middle 8x8 Block in the given 8x10 block is flat | |||
| */ | |||
| static inline int isVertDC(uint8_t src[], int stride){ | |||
| // return true; | |||
| int numEq= 0; | |||
| int y; | |||
| src+= stride; // src points to begin of the 8x8 Block | |||
| #ifdef HAVE_MMX | |||
| asm volatile( | |||
| // "int $3 \n\t" | |||
| "pushl %1\n\t" | |||
| "movq b7E, %%mm7 \n\t" // mm7 = 0x7F | |||
| "movq b7C, %%mm6 \n\t" // mm6 = 0x7D | |||
| @@ -1577,9 +1582,9 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP | |||
| } | |||
| /** | |||
| * Do a horizontal low pass filter on the 8x8 block | |||
| * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) | |||
| * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | |||
| * useing approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version) | |||
| * useing the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) | |||
| */ | |||
| static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) | |||
| { | |||
| @@ -1635,14 +1640,6 @@ static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) | |||
| */ | |||
| // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16 | |||
| /* | |||
| 31 | |||
| 121 | |||
| 121 | |||
| 121 | |||
| 121 | |||
| 121 | |||
| 121 | |||
| 13 | |||
| Implemented Exact 7-Tap | |||
| 9421 A321 | |||
| 36421 64321 | |||
| @@ -1654,6 +1651,7 @@ Implemented Exact 7-Tap | |||
| 1249 123A | |||
| */ | |||
| #ifdef HAVE_MMX2 | |||
| #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | |||
| "movq %%mm0, %%mm1 \n\t"\ | |||
| @@ -1680,12 +1678,12 @@ Implemented Exact 7-Tap | |||
| #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | |||
| "movq %%mm0, %%mm1 \n\t"\ | |||
| "movq %%mm0, %%mm2 \n\t"\ | |||
| "movq %%mm0, %%mm3 \n\t"\ | |||
| "movq %%mm0, %%mm4 \n\t"\ | |||
| "movd -4(%0), %%mm3 \n\t" /*0001000*/\ | |||
| "movd 8(%0), %%mm4 \n\t" /*0001000*/\ | |||
| "psllq $8, %%mm1 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "pand bm00000001, %%mm3 \n\t"\ | |||
| "pand bm10000000, %%mm4 \n\t"\ | |||
| "psrlq $24, %%mm3 \n\t"\ | |||
| "psllq $56, %%mm4 \n\t"\ | |||
| "por %%mm3, %%mm1 \n\t"\ | |||
| "por %%mm4, %%mm2 \n\t"\ | |||
| PAVGB(%%mm2, %%mm1)\ | |||
| @@ -1708,7 +1706,80 @@ Implemented Exact 7-Tap | |||
| "movd %%mm0, 4(%0) \n\t" | |||
| #endif | |||
| #define HLP(i) HLP3(i) | |||
| /* uses the 7-Tap Filter: 1112111 */ | |||
| #define NEW_HLP(i)\ | |||
| "movq " #i "(%%eax), %%mm0 \n\t"\ | |||
| "movq %%mm0, %%mm1 \n\t"\ | |||
| "movq %%mm0, %%mm2 \n\t"\ | |||
| "movd -4(%0), %%mm3 \n\t" /*0001000*/\ | |||
| "movd 8(%0), %%mm4 \n\t" /*0001000*/\ | |||
| "psllq $8, %%mm1 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "psrlq $24, %%mm3 \n\t"\ | |||
| "psllq $56, %%mm4 \n\t"\ | |||
| "por %%mm3, %%mm1 \n\t"\ | |||
| "por %%mm4, %%mm2 \n\t"\ | |||
| "movq %%mm1, %%mm5 \n\t"\ | |||
| PAVGB(%%mm2, %%mm1)\ | |||
| PAVGB(%%mm1, %%mm0)\ | |||
| "psllq $8, %%mm5 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "por %%mm3, %%mm5 \n\t"\ | |||
| "por %%mm4, %%mm2 \n\t"\ | |||
| "movq %%mm5, %%mm1 \n\t"\ | |||
| PAVGB(%%mm2, %%mm5)\ | |||
| "psllq $8, %%mm1 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "por %%mm3, %%mm1 \n\t"\ | |||
| "por %%mm4, %%mm2 \n\t"\ | |||
| PAVGB(%%mm2, %%mm1)\ | |||
| PAVGB(%%mm1, %%mm5)\ | |||
| PAVGB(%%mm5, %%mm0)\ | |||
| "movd %%mm0, (%0) \n\t"\ | |||
| "psrlq $32, %%mm0 \n\t"\ | |||
| "movd %%mm0, 4(%0) \n\t" | |||
| /* uses the 9-Tap Filter: 112242211 */ | |||
| #define NEW_HLP2(i)\ | |||
| "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\ | |||
| "movq %%mm0, %%mm1 \n\t" /*0001000*/\ | |||
| "movq %%mm0, %%mm2 \n\t" /*0001000*/\ | |||
| "movd -4(%0), %%mm3 \n\t" /*0001000*/\ | |||
| "movd 8(%0), %%mm4 \n\t" /*0001000*/\ | |||
| "psllq $8, %%mm1 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "psrlq $24, %%mm3 \n\t"\ | |||
| "psllq $56, %%mm4 \n\t"\ | |||
| "por %%mm3, %%mm1 \n\t" /*0010000*/\ | |||
| "por %%mm4, %%mm2 \n\t" /*0000100*/\ | |||
| "movq %%mm1, %%mm5 \n\t" /*0010000*/\ | |||
| PAVGB(%%mm2, %%mm1) /*0010100*/\ | |||
| PAVGB(%%mm1, %%mm0) /*0012100*/\ | |||
| "psllq $8, %%mm5 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "por %%mm3, %%mm5 \n\t" /*0100000*/\ | |||
| "por %%mm4, %%mm2 \n\t" /*0000010*/\ | |||
| "movq %%mm5, %%mm1 \n\t" /*0100000*/\ | |||
| PAVGB(%%mm2, %%mm5) /*0100010*/\ | |||
| "psllq $8, %%mm1 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "por %%mm3, %%mm1 \n\t" /*1000000*/\ | |||
| "por %%mm4, %%mm2 \n\t" /*0000001*/\ | |||
| "movq %%mm1, %%mm6 \n\t" /*1000000*/\ | |||
| PAVGB(%%mm2, %%mm1) /*1000001*/\ | |||
| "psllq $8, %%mm6 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "por %%mm3, %%mm6 \n\t"/*100000000*/\ | |||
| "por %%mm4, %%mm2 \n\t"/*000000001*/\ | |||
| PAVGB(%%mm2, %%mm6) /*100000001*/\ | |||
| PAVGB(%%mm6, %%mm1) /*110000011*/\ | |||
| PAVGB(%%mm1, %%mm5) /*112000211*/\ | |||
| PAVGB(%%mm5, %%mm0) /*112242211*/\ | |||
| "movd %%mm0, (%0) \n\t"\ | |||
| "psrlq $32, %%mm0 \n\t"\ | |||
| "movd %%mm0, 4(%0) \n\t" | |||
| #define HLP(i) NEW_HLP(i) | |||
| HLP(0) | |||
| "addl %1, %0 \n\t" | |||
| @@ -1828,6 +1899,363 @@ FIND_MIN_MAX(%%ebx, %1, 2) | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | |||
| */ | |||
| static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" | |||
| "movq (%%eax, %1), %%mm1 \n\t" | |||
| PAVGB(%%mm1, %%mm0)\ | |||
| "movq %%mm0, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm0 \n\t" | |||
| PAVGB(%%mm0, %%mm1)\ | |||
| "movq %%mm1, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx, %1), %%mm1 \n\t" | |||
| PAVGB(%%mm1, %%mm0)\ | |||
| "movq %%mm0, (%%ebx) \n\t" | |||
| "movq (%0, %1, 8), %%mm0 \n\t" | |||
| PAVGB(%%mm0, %%mm1)\ | |||
| "movq %%mm1, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[stride] = (src[0] + src[stride*2])>>1; | |||
| src[stride*3] = (src[stride*2] + src[stride*4])>>1; | |||
| src[stride*5] = (src[stride*4] + src[stride*6])>>1; | |||
| src[stride*7] = (src[stride*6] + src[stride*8])>>1; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | |||
| */ | |||
| static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" | |||
| "movq (%%eax, %1), %%mm1 \n\t" | |||
| PAVGB(%%mm1, %%mm0)\ | |||
| "movq %%mm0, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm0 \n\t" | |||
| PAVGB(%%mm0, %%mm1)\ | |||
| "movq %%mm1, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx, %1), %%mm1 \n\t" | |||
| PAVGB(%%mm1, %%mm0)\ | |||
| "movq %%mm0, (%%ebx) \n\t" | |||
| "movq %%mm1, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[stride] = (src[0] + src[stride*2])>>1; | |||
| src[stride*3] = (src[stride*2] + src[stride*4])>>1; | |||
| src[stride*5] = (src[stride*4] + src[stride*6])>>1; | |||
| src[stride*7] = src[stride*6]; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | |||
| * will shift the image up by 1 line (FIXME if this is a problem) | |||
| */ | |||
| static inline void deInterlaceBlendLinear(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" // L0 | |||
| "movq (%%eax, %1), %%mm1 \n\t" // L2 | |||
| PAVGB(%%mm1, %%mm0) // L0+L2 | |||
| "movq (%%eax), %%mm2 \n\t" // L1 | |||
| PAVGB(%%mm2, %%mm0) | |||
| "movq %%mm0, (%0) \n\t" | |||
| "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 | |||
| PAVGB(%%mm0, %%mm2) // L1+L3 | |||
| PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 | |||
| "movq %%mm2, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm2 \n\t" // L4 | |||
| PAVGB(%%mm2, %%mm1) // L2+L4 | |||
| PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 | |||
| "movq %%mm1, (%%eax, %1) \n\t" | |||
| "movq (%%ebx), %%mm1 \n\t" // L5 | |||
| PAVGB(%%mm1, %%mm0) // L3+L5 | |||
| PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 | |||
| "movq %%mm0, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx, %1), %%mm0 \n\t" // L6 | |||
| PAVGB(%%mm0, %%mm2) // L4+L6 | |||
| PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 | |||
| "movq %%mm2, (%0, %1, 4) \n\t" | |||
| "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 | |||
| PAVGB(%%mm2, %%mm1) // L5+L7 | |||
| PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 | |||
| "movq %%mm1, (%%ebx) \n\t" | |||
| "movq (%0, %1, 8), %%mm1 \n\t" // L8 | |||
| PAVGB(%%mm1, %%mm0) // L6+L8 | |||
| PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 | |||
| "movq %%mm0, (%%ebx, %1) \n\t" | |||
| "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9 | |||
| PAVGB(%%mm0, %%mm2) // L7+L9 | |||
| PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 | |||
| "movq %%mm2, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |||
| src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |||
| src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |||
| src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |||
| src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |||
| src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |||
| src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; | |||
| src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | |||
| * will shift the image up by 1 line (FIXME if this is a problem) | |||
| */ | |||
| static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMSX2) || defined (HAVE_3DNOW) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" // L0 | |||
| "movq (%%eax, %1), %%mm1 \n\t" // L2 | |||
| PAVGB(%%mm1, %%mm0) // L0+L2 | |||
| "movq (%%eax), %%mm2 \n\t" // L1 | |||
| PAVGB(%%mm2, %%mm0) | |||
| "movq %%mm0, (%0) \n\t" | |||
| "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 | |||
| PAVGB(%%mm0, %%mm2) // L1+L3 | |||
| PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 | |||
| "movq %%mm2, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm2 \n\t" // L4 | |||
| PAVGB(%%mm2, %%mm1) // L2+L4 | |||
| PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 | |||
| "movq %%mm1, (%%eax, %1) \n\t" | |||
| "movq (%%ebx), %%mm1 \n\t" // L5 | |||
| PAVGB(%%mm1, %%mm0) // L3+L5 | |||
| PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 | |||
| "movq %%mm0, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx, %1), %%mm0 \n\t" // L6 | |||
| PAVGB(%%mm0, %%mm2) // L4+L6 | |||
| PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 | |||
| "movq %%mm2, (%0, %1, 4) \n\t" | |||
| "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 | |||
| PAVGB(%%mm2, %%mm1) // L5+L7 | |||
| PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 | |||
| "movq %%mm1, (%%ebx) \n\t" | |||
| PAVGB(%%mm2, %%mm0) // L7 + L8 | |||
| "movq %%mm0, (%%ebx, %1) \n\t" | |||
| "movq %%mm0, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |||
| src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |||
| src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |||
| src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |||
| src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |||
| src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |||
| src[stride*6] = (src[stride*6] + src[stride*7])>>1; | |||
| src[stride*7] = src[stride*6]; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | |||
| */ | |||
| static inline void deInterlaceMedian(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMX2) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" // | |||
| "movq (%%eax, %1), %%mm2 \n\t" // | |||
| "movq (%%eax), %%mm1 \n\t" // | |||
| "movq %%mm0, %%mm3 \n\t" | |||
| "pmaxub %%mm1, %%mm0 \n\t" // | |||
| "pminub %%mm3, %%mm1 \n\t" // | |||
| "pmaxub %%mm2, %%mm1 \n\t" // | |||
| "pminub %%mm1, %%mm0 \n\t" | |||
| "movq %%mm0, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm0 \n\t" // | |||
| "movq (%%eax, %1, 2), %%mm1 \n\t" // | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "pmaxub %%mm1, %%mm2 \n\t" // | |||
| "pminub %%mm3, %%mm1 \n\t" // | |||
| "pmaxub %%mm0, %%mm1 \n\t" // | |||
| "pminub %%mm1, %%mm2 \n\t" | |||
| "movq %%mm2, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx), %%mm2 \n\t" // | |||
| "movq (%%ebx, %1), %%mm1 \n\t" // | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "pmaxub %%mm0, %%mm2 \n\t" // | |||
| "pminub %%mm3, %%mm0 \n\t" // | |||
| "pmaxub %%mm1, %%mm0 \n\t" // | |||
| "pminub %%mm0, %%mm2 \n\t" | |||
| "movq %%mm2, (%%ebx) \n\t" | |||
| "movq (%%ebx, %1, 2), %%mm2 \n\t" // | |||
| "movq (%0, %1, 8), %%mm0 \n\t" // | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "pmaxub %%mm0, %%mm2 \n\t" // | |||
| "pminub %%mm3, %%mm0 \n\t" // | |||
| "pmaxub %%mm1, %%mm0 \n\t" // | |||
| "pminub %%mm0, %%mm2 \n\t" | |||
| "movq %%mm2, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| //FIXME | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |||
| src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |||
| src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |||
| src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |||
| src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |||
| src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |||
| src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; | |||
| src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | |||
| * will shift the image up by 1 line (FIXME if this is a problem) | |||
| */ | |||
| static inline void deInterlaceMedianLastRow(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMX2) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" // | |||
| "movq (%%eax, %1), %%mm2 \n\t" // | |||
| "movq (%%eax), %%mm1 \n\t" // | |||
| "movq %%mm0, %%mm3 \n\t" | |||
| "pmaxub %%mm1, %%mm0 \n\t" // | |||
| "pminub %%mm3, %%mm1 \n\t" // | |||
| "pmaxub %%mm2, %%mm1 \n\t" // | |||
| "pminub %%mm1, %%mm0 \n\t" | |||
| "movq %%mm0, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm0 \n\t" // | |||
| "movq (%%eax, %1, 2), %%mm1 \n\t" // | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "pmaxub %%mm1, %%mm2 \n\t" // | |||
| "pminub %%mm3, %%mm1 \n\t" // | |||
| "pmaxub %%mm0, %%mm1 \n\t" // | |||
| "pminub %%mm1, %%mm2 \n\t" | |||
| "movq %%mm2, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx), %%mm2 \n\t" // | |||
| "movq (%%ebx, %1), %%mm1 \n\t" // | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "pmaxub %%mm0, %%mm2 \n\t" // | |||
| "pminub %%mm3, %%mm0 \n\t" // | |||
| "pmaxub %%mm1, %%mm0 \n\t" // | |||
| "pminub %%mm0, %%mm2 \n\t" | |||
| "movq %%mm2, (%%ebx) \n\t" | |||
| "movq %%mm1, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| //FIXME | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |||
| src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |||
| src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |||
| src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |||
| src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |||
| src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |||
| src[stride*6] = (src[stride*6] + src[stride*7])>>1; | |||
| src[stride*7] = src[stride*6]; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| #ifdef HAVE_ODIVX_POSTPROCESS | |||
| #include "../opendivx/postprocess.h" | |||
| int use_old_pp=0; | |||
| @@ -1841,7 +2269,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| * the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63) | |||
| * -63 is best quality -1 is worst | |||
| */ | |||
| //extern "C"{ | |||
| void postprocess(unsigned char * src[], int src_stride, | |||
| unsigned char * dst[], int dst_stride, | |||
| int horizontal_size, int vertical_size, | |||
| @@ -2196,6 +2623,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| blockCopy(vertBlock + dstStride*2, dstStride, | |||
| vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX); | |||
| if(mode & LINEAR_IPOL_DEINT_FILTER) | |||
| deInterlaceInterpolateLinear(dstBlock, dstStride); | |||
| else if(mode & LINEAR_BLEND_DEINT_FILTER) | |||
| deInterlaceBlendLinear(dstBlock, dstStride); | |||
| else if(mode & MEDIAN_DEINT_FILTER) | |||
| deInterlaceMedian(dstBlock, dstStride); | |||
| /* else if(mode & CUBIC_IPOL_DEINT_FILTER) | |||
| deInterlaceInterpolateCubic(dstBlock, dstStride); | |||
| else if(mode & CUBIC_BLEND_DEINT_FILTER) | |||
| deInterlaceBlendCubic(dstBlock, dstStride); | |||
| */ | |||
| #ifdef MORE_TIMEING | |||
| T1= rdtsc(); | |||
| @@ -2226,9 +2664,22 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| blockCopy(vertBlock + dstStride*1, dstStride, | |||
| vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX); | |||
| if(mode & LINEAR_IPOL_DEINT_FILTER) | |||
| deInterlaceInterpolateLinearLastRow(dstBlock, dstStride); | |||
| else if(mode & LINEAR_BLEND_DEINT_FILTER) | |||
| deInterlaceBlendLinearLastRow(dstBlock, dstStride); | |||
| else if(mode & MEDIAN_DEINT_FILTER) | |||
| deInterlaceMedianLastRow(dstBlock, dstStride); | |||
| /* else if(mode & CUBIC_IPOL_DEINT_FILTER) | |||
| deInterlaceInterpolateCubicLastRow(dstBlock, dstStride); | |||
| else if(mode & CUBIC_BLEND_DEINT_FILTER) | |||
| deInterlaceBlendCubicLastRow(dstBlock, dstStride); | |||
| */ | |||
| } | |||
| if(x - 8 >= 0 && x<width) | |||
| { | |||
| @@ -22,6 +22,7 @@ | |||
| #define BLOCK_SIZE 8 | |||
| #define TEMP_STRIDE 8 | |||
| //#define NUM_BLOCKS_AT_ONCE 16 //not used yet | |||
| #define V_DEBLOCK 0x01 | |||
| #define H_DEBLOCK 0x02 | |||
| @@ -32,20 +33,28 @@ | |||
| #define LUM_H_DEBLOCK H_DEBLOCK // 2 | |||
| #define CHROM_V_DEBLOCK (V_DEBLOCK<<4) // 16 | |||
| #define CHROM_H_DEBLOCK (H_DEBLOCK<<4) // 32 | |||
| #define LUM_DERING DERING // 4 | |||
| #define CHROM_DERING (DERING<<4) // 64 | |||
| #define LUM_DERING DERING // 4 (not implemented yet) | |||
| #define CHROM_DERING (DERING<<4) // 64 (not implemented yet) | |||
| #define LUM_LEVEL_FIX LEVEL_FIX // 8 | |||
| //not supported currently | |||
| #define CHROM_LEVEL_FIX (LEVEL_FIX<<4) // 128 | |||
| #define CHROM_LEVEL_FIX (LEVEL_FIX<<4) // 128 (not implemented yet) | |||
| // Experimental vertical filters | |||
| #define V_RK1_FILTER 0x0100 // 256 | |||
| #define V_X1_FILTER 0x0200 // 512 | |||
| // Experimental horizontal filters | |||
| #define H_RK1_FILTER 0x1000 // 4096 | |||
| #define H_RK1_FILTER 0x1000 // 4096 (not implemented yet) | |||
| #define H_X1_FILTER 0x2000 // 8192 | |||
| //Deinterlacing Filters | |||
| #define DEINTERLACE_FILTER_MASK 0xF0000 | |||
| #define LINEAR_IPOL_DEINT_FILTER 0x10000 // 65536 | |||
| #define LINEAR_BLEND_DEINT_FILTER 0x20000 // 131072 | |||
| #define CUBIC_BLEND_DEINT_FILTER 0x30000 // 196608 (not implemented yet) | |||
| #define CUBIC_IPOL_DEINT_FILTER 0x40000 // 262144 (not implemented yet) | |||
| #define MEDIAN_DEINT_FILTER 0x80000 // 524288 | |||
| #define GET_PP_QUALITY_MAX 6 | |||
| //#define TIMEING | |||
| @@ -53,18 +62,6 @@ | |||
| #define QP_STORE_T int | |||
| //#ifdef __cplusplus | |||
| //#include <inttypes.h> | |||
| //void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | |||
| // QP_STORE_T QPs[], int QPStride, int isColor, int mode); | |||
| //#endif | |||
| //#ifdef __cplusplus | |||
| //extern "C" | |||
| //{ | |||
| //#endif | |||
| void postprocess(unsigned char * src[], int src_stride, | |||
| unsigned char * dst[], int dst_stride, | |||
| int horizontal_size, int vertical_size, | |||
| @@ -72,8 +69,4 @@ void postprocess(unsigned char * src[], int src_stride, | |||
| int getPpModeForQuality(int quality); | |||
| //#ifdef __cplusplus | |||
| //} | |||
| //#endif | |||
| #endif | |||
| @@ -17,19 +17,22 @@ | |||
| */ | |||
| /* | |||
| C MMX MMX2 3DNow* | |||
| C MMX MMX2 3DNow | |||
| isVertDC Ec Ec | |||
| isVertMinMaxOk Ec Ec | |||
| doVertLowPass E e e* | |||
| doVertLowPass E e e | |||
| doVertDefFilter Ec Ec Ec | |||
| isHorizDC Ec Ec | |||
| isHorizMinMaxOk a | |||
| doHorizLowPass E a a* | |||
| doHorizLowPass E a a | |||
| doHorizDefFilter E ac ac | |||
| deRing | |||
| Vertical RKAlgo1 E a a* | |||
| Vertical X1 a E E* | |||
| Horizontal X1 a E E* | |||
| Vertical RKAlgo1 E a a | |||
| Vertical X1 a E E | |||
| Horizontal X1 a E E | |||
| LinIpolDeinterlace a E E* | |||
| LinBlendDeinterlace a E E* | |||
| MedianDeinterlace a E | |||
| * i dont have a 3dnow CPU -> its untested | |||
| @@ -55,6 +58,7 @@ make the mainloop more flexible (variable number of blocks at once | |||
| compare the quality & speed of all filters | |||
| implement a few simple deinterlacing filters | |||
| split this huge file | |||
| fix warnings (unused vars, ...) | |||
| ... | |||
| Notes: | |||
| @@ -63,6 +67,9 @@ Notes: | |||
| /* | |||
| Changelog: use the CVS log | |||
| rewrote the horizontal lowpass filter to fix a bug which caused a blocky look | |||
| added deinterlace filters (linear interpolate, linear blend, median) | |||
| minor cleanups (removed some outcommented stuff) | |||
| 0.1.3 | |||
| bugfixes: last 3 lines not brightness/contrast corrected | |||
| brightness statistics messed up with initial black pic | |||
| @@ -194,13 +201,11 @@ static inline void prefetcht2(void *p) | |||
| * Check if the middle 8x8 Block in the given 8x10 block is flat | |||
| */ | |||
| static inline int isVertDC(uint8_t src[], int stride){ | |||
| // return true; | |||
| int numEq= 0; | |||
| int y; | |||
| src+= stride; // src points to begin of the 8x8 Block | |||
| #ifdef HAVE_MMX | |||
| asm volatile( | |||
| // "int $3 \n\t" | |||
| "pushl %1\n\t" | |||
| "movq b7E, %%mm7 \n\t" // mm7 = 0x7F | |||
| "movq b7C, %%mm6 \n\t" // mm6 = 0x7D | |||
| @@ -1577,9 +1582,9 @@ static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP | |||
| } | |||
| /** | |||
| * Do a horizontal low pass filter on the 8x8 block | |||
| * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) | |||
| * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | |||
| * useing approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version) | |||
| * useing the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) | |||
| */ | |||
| static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) | |||
| { | |||
| @@ -1635,14 +1640,6 @@ static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) | |||
| */ | |||
| // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16 | |||
| /* | |||
| 31 | |||
| 121 | |||
| 121 | |||
| 121 | |||
| 121 | |||
| 121 | |||
| 121 | |||
| 13 | |||
| Implemented Exact 7-Tap | |||
| 9421 A321 | |||
| 36421 64321 | |||
| @@ -1654,6 +1651,7 @@ Implemented Exact 7-Tap | |||
| 1249 123A | |||
| */ | |||
| #ifdef HAVE_MMX2 | |||
| #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | |||
| "movq %%mm0, %%mm1 \n\t"\ | |||
| @@ -1680,12 +1678,12 @@ Implemented Exact 7-Tap | |||
| #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | |||
| "movq %%mm0, %%mm1 \n\t"\ | |||
| "movq %%mm0, %%mm2 \n\t"\ | |||
| "movq %%mm0, %%mm3 \n\t"\ | |||
| "movq %%mm0, %%mm4 \n\t"\ | |||
| "movd -4(%0), %%mm3 \n\t" /*0001000*/\ | |||
| "movd 8(%0), %%mm4 \n\t" /*0001000*/\ | |||
| "psllq $8, %%mm1 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "pand bm00000001, %%mm3 \n\t"\ | |||
| "pand bm10000000, %%mm4 \n\t"\ | |||
| "psrlq $24, %%mm3 \n\t"\ | |||
| "psllq $56, %%mm4 \n\t"\ | |||
| "por %%mm3, %%mm1 \n\t"\ | |||
| "por %%mm4, %%mm2 \n\t"\ | |||
| PAVGB(%%mm2, %%mm1)\ | |||
| @@ -1708,7 +1706,80 @@ Implemented Exact 7-Tap | |||
| "movd %%mm0, 4(%0) \n\t" | |||
| #endif | |||
| #define HLP(i) HLP3(i) | |||
| /* uses the 7-Tap Filter: 1112111 */ | |||
| #define NEW_HLP(i)\ | |||
| "movq " #i "(%%eax), %%mm0 \n\t"\ | |||
| "movq %%mm0, %%mm1 \n\t"\ | |||
| "movq %%mm0, %%mm2 \n\t"\ | |||
| "movd -4(%0), %%mm3 \n\t" /*0001000*/\ | |||
| "movd 8(%0), %%mm4 \n\t" /*0001000*/\ | |||
| "psllq $8, %%mm1 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "psrlq $24, %%mm3 \n\t"\ | |||
| "psllq $56, %%mm4 \n\t"\ | |||
| "por %%mm3, %%mm1 \n\t"\ | |||
| "por %%mm4, %%mm2 \n\t"\ | |||
| "movq %%mm1, %%mm5 \n\t"\ | |||
| PAVGB(%%mm2, %%mm1)\ | |||
| PAVGB(%%mm1, %%mm0)\ | |||
| "psllq $8, %%mm5 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "por %%mm3, %%mm5 \n\t"\ | |||
| "por %%mm4, %%mm2 \n\t"\ | |||
| "movq %%mm5, %%mm1 \n\t"\ | |||
| PAVGB(%%mm2, %%mm5)\ | |||
| "psllq $8, %%mm1 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "por %%mm3, %%mm1 \n\t"\ | |||
| "por %%mm4, %%mm2 \n\t"\ | |||
| PAVGB(%%mm2, %%mm1)\ | |||
| PAVGB(%%mm1, %%mm5)\ | |||
| PAVGB(%%mm5, %%mm0)\ | |||
| "movd %%mm0, (%0) \n\t"\ | |||
| "psrlq $32, %%mm0 \n\t"\ | |||
| "movd %%mm0, 4(%0) \n\t" | |||
| /* uses the 9-Tap Filter: 112242211 */ | |||
| #define NEW_HLP2(i)\ | |||
| "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\ | |||
| "movq %%mm0, %%mm1 \n\t" /*0001000*/\ | |||
| "movq %%mm0, %%mm2 \n\t" /*0001000*/\ | |||
| "movd -4(%0), %%mm3 \n\t" /*0001000*/\ | |||
| "movd 8(%0), %%mm4 \n\t" /*0001000*/\ | |||
| "psllq $8, %%mm1 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "psrlq $24, %%mm3 \n\t"\ | |||
| "psllq $56, %%mm4 \n\t"\ | |||
| "por %%mm3, %%mm1 \n\t" /*0010000*/\ | |||
| "por %%mm4, %%mm2 \n\t" /*0000100*/\ | |||
| "movq %%mm1, %%mm5 \n\t" /*0010000*/\ | |||
| PAVGB(%%mm2, %%mm1) /*0010100*/\ | |||
| PAVGB(%%mm1, %%mm0) /*0012100*/\ | |||
| "psllq $8, %%mm5 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "por %%mm3, %%mm5 \n\t" /*0100000*/\ | |||
| "por %%mm4, %%mm2 \n\t" /*0000010*/\ | |||
| "movq %%mm5, %%mm1 \n\t" /*0100000*/\ | |||
| PAVGB(%%mm2, %%mm5) /*0100010*/\ | |||
| "psllq $8, %%mm1 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "por %%mm3, %%mm1 \n\t" /*1000000*/\ | |||
| "por %%mm4, %%mm2 \n\t" /*0000001*/\ | |||
| "movq %%mm1, %%mm6 \n\t" /*1000000*/\ | |||
| PAVGB(%%mm2, %%mm1) /*1000001*/\ | |||
| "psllq $8, %%mm6 \n\t"\ | |||
| "psrlq $8, %%mm2 \n\t"\ | |||
| "por %%mm3, %%mm6 \n\t"/*100000000*/\ | |||
| "por %%mm4, %%mm2 \n\t"/*000000001*/\ | |||
| PAVGB(%%mm2, %%mm6) /*100000001*/\ | |||
| PAVGB(%%mm6, %%mm1) /*110000011*/\ | |||
| PAVGB(%%mm1, %%mm5) /*112000211*/\ | |||
| PAVGB(%%mm5, %%mm0) /*112242211*/\ | |||
| "movd %%mm0, (%0) \n\t"\ | |||
| "psrlq $32, %%mm0 \n\t"\ | |||
| "movd %%mm0, 4(%0) \n\t" | |||
| #define HLP(i) NEW_HLP(i) | |||
| HLP(0) | |||
| "addl %1, %0 \n\t" | |||
| @@ -1828,6 +1899,363 @@ FIND_MIN_MAX(%%ebx, %1, 2) | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | |||
| */ | |||
| static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" | |||
| "movq (%%eax, %1), %%mm1 \n\t" | |||
| PAVGB(%%mm1, %%mm0)\ | |||
| "movq %%mm0, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm0 \n\t" | |||
| PAVGB(%%mm0, %%mm1)\ | |||
| "movq %%mm1, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx, %1), %%mm1 \n\t" | |||
| PAVGB(%%mm1, %%mm0)\ | |||
| "movq %%mm0, (%%ebx) \n\t" | |||
| "movq (%0, %1, 8), %%mm0 \n\t" | |||
| PAVGB(%%mm0, %%mm1)\ | |||
| "movq %%mm1, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[stride] = (src[0] + src[stride*2])>>1; | |||
| src[stride*3] = (src[stride*2] + src[stride*4])>>1; | |||
| src[stride*5] = (src[stride*4] + src[stride*6])>>1; | |||
| src[stride*7] = (src[stride*6] + src[stride*8])>>1; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | |||
| */ | |||
| static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" | |||
| "movq (%%eax, %1), %%mm1 \n\t" | |||
| PAVGB(%%mm1, %%mm0)\ | |||
| "movq %%mm0, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm0 \n\t" | |||
| PAVGB(%%mm0, %%mm1)\ | |||
| "movq %%mm1, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx, %1), %%mm1 \n\t" | |||
| PAVGB(%%mm1, %%mm0)\ | |||
| "movq %%mm0, (%%ebx) \n\t" | |||
| "movq %%mm1, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[stride] = (src[0] + src[stride*2])>>1; | |||
| src[stride*3] = (src[stride*2] + src[stride*4])>>1; | |||
| src[stride*5] = (src[stride*4] + src[stride*6])>>1; | |||
| src[stride*7] = src[stride*6]; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | |||
| * will shift the image up by 1 line (FIXME if this is a problem) | |||
| */ | |||
| static inline void deInterlaceBlendLinear(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" // L0 | |||
| "movq (%%eax, %1), %%mm1 \n\t" // L2 | |||
| PAVGB(%%mm1, %%mm0) // L0+L2 | |||
| "movq (%%eax), %%mm2 \n\t" // L1 | |||
| PAVGB(%%mm2, %%mm0) | |||
| "movq %%mm0, (%0) \n\t" | |||
| "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 | |||
| PAVGB(%%mm0, %%mm2) // L1+L3 | |||
| PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 | |||
| "movq %%mm2, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm2 \n\t" // L4 | |||
| PAVGB(%%mm2, %%mm1) // L2+L4 | |||
| PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 | |||
| "movq %%mm1, (%%eax, %1) \n\t" | |||
| "movq (%%ebx), %%mm1 \n\t" // L5 | |||
| PAVGB(%%mm1, %%mm0) // L3+L5 | |||
| PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 | |||
| "movq %%mm0, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx, %1), %%mm0 \n\t" // L6 | |||
| PAVGB(%%mm0, %%mm2) // L4+L6 | |||
| PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 | |||
| "movq %%mm2, (%0, %1, 4) \n\t" | |||
| "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 | |||
| PAVGB(%%mm2, %%mm1) // L5+L7 | |||
| PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 | |||
| "movq %%mm1, (%%ebx) \n\t" | |||
| "movq (%0, %1, 8), %%mm1 \n\t" // L8 | |||
| PAVGB(%%mm1, %%mm0) // L6+L8 | |||
| PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 | |||
| "movq %%mm0, (%%ebx, %1) \n\t" | |||
| "movq (%%ebx, %1, 4), %%mm0 \n\t" // L9 | |||
| PAVGB(%%mm0, %%mm2) // L7+L9 | |||
| PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 | |||
| "movq %%mm2, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |||
| src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |||
| src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |||
| src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |||
| src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |||
| src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |||
| src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; | |||
| src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | |||
| * will shift the image up by 1 line (FIXME if this is a problem) | |||
| */ | |||
| static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMSX2) || defined (HAVE_3DNOW) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" // L0 | |||
| "movq (%%eax, %1), %%mm1 \n\t" // L2 | |||
| PAVGB(%%mm1, %%mm0) // L0+L2 | |||
| "movq (%%eax), %%mm2 \n\t" // L1 | |||
| PAVGB(%%mm2, %%mm0) | |||
| "movq %%mm0, (%0) \n\t" | |||
| "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 | |||
| PAVGB(%%mm0, %%mm2) // L1+L3 | |||
| PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 | |||
| "movq %%mm2, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm2 \n\t" // L4 | |||
| PAVGB(%%mm2, %%mm1) // L2+L4 | |||
| PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 | |||
| "movq %%mm1, (%%eax, %1) \n\t" | |||
| "movq (%%ebx), %%mm1 \n\t" // L5 | |||
| PAVGB(%%mm1, %%mm0) // L3+L5 | |||
| PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 | |||
| "movq %%mm0, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx, %1), %%mm0 \n\t" // L6 | |||
| PAVGB(%%mm0, %%mm2) // L4+L6 | |||
| PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 | |||
| "movq %%mm2, (%0, %1, 4) \n\t" | |||
| "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 | |||
| PAVGB(%%mm2, %%mm1) // L5+L7 | |||
| PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 | |||
| "movq %%mm1, (%%ebx) \n\t" | |||
| PAVGB(%%mm2, %%mm0) // L7 + L8 | |||
| "movq %%mm0, (%%ebx, %1) \n\t" | |||
| "movq %%mm0, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |||
| src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |||
| src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |||
| src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |||
| src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |||
| src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |||
| src[stride*6] = (src[stride*6] + src[stride*7])>>1; | |||
| src[stride*7] = src[stride*6]; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | |||
| */ | |||
| static inline void deInterlaceMedian(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMX2) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" // | |||
| "movq (%%eax, %1), %%mm2 \n\t" // | |||
| "movq (%%eax), %%mm1 \n\t" // | |||
| "movq %%mm0, %%mm3 \n\t" | |||
| "pmaxub %%mm1, %%mm0 \n\t" // | |||
| "pminub %%mm3, %%mm1 \n\t" // | |||
| "pmaxub %%mm2, %%mm1 \n\t" // | |||
| "pminub %%mm1, %%mm0 \n\t" | |||
| "movq %%mm0, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm0 \n\t" // | |||
| "movq (%%eax, %1, 2), %%mm1 \n\t" // | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "pmaxub %%mm1, %%mm2 \n\t" // | |||
| "pminub %%mm3, %%mm1 \n\t" // | |||
| "pmaxub %%mm0, %%mm1 \n\t" // | |||
| "pminub %%mm1, %%mm2 \n\t" | |||
| "movq %%mm2, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx), %%mm2 \n\t" // | |||
| "movq (%%ebx, %1), %%mm1 \n\t" // | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "pmaxub %%mm0, %%mm2 \n\t" // | |||
| "pminub %%mm3, %%mm0 \n\t" // | |||
| "pmaxub %%mm1, %%mm0 \n\t" // | |||
| "pminub %%mm0, %%mm2 \n\t" | |||
| "movq %%mm2, (%%ebx) \n\t" | |||
| "movq (%%ebx, %1, 2), %%mm2 \n\t" // | |||
| "movq (%0, %1, 8), %%mm0 \n\t" // | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "pmaxub %%mm0, %%mm2 \n\t" // | |||
| "pminub %%mm3, %%mm0 \n\t" // | |||
| "pmaxub %%mm1, %%mm0 \n\t" // | |||
| "pminub %%mm0, %%mm2 \n\t" | |||
| "movq %%mm2, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| //FIXME | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |||
| src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |||
| src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |||
| src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |||
| src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |||
| src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |||
| src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; | |||
| src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| /** | |||
| * Deinterlaces the given block | |||
| * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | |||
| * will shift the image up by 1 line (FIXME if this is a problem) | |||
| */ | |||
| static inline void deInterlaceMedianLastRow(uint8_t src[], int stride) | |||
| { | |||
| #if defined (HAVE_MMX2) | |||
| asm volatile( | |||
| "leal (%0, %1), %%eax \n\t" | |||
| "leal (%%eax, %1, 4), %%ebx \n\t" | |||
| // 0 1 2 3 4 5 6 7 8 9 | |||
| // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |||
| "movq (%0), %%mm0 \n\t" // | |||
| "movq (%%eax, %1), %%mm2 \n\t" // | |||
| "movq (%%eax), %%mm1 \n\t" // | |||
| "movq %%mm0, %%mm3 \n\t" | |||
| "pmaxub %%mm1, %%mm0 \n\t" // | |||
| "pminub %%mm3, %%mm1 \n\t" // | |||
| "pmaxub %%mm2, %%mm1 \n\t" // | |||
| "pminub %%mm1, %%mm0 \n\t" | |||
| "movq %%mm0, (%%eax) \n\t" | |||
| "movq (%0, %1, 4), %%mm0 \n\t" // | |||
| "movq (%%eax, %1, 2), %%mm1 \n\t" // | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "pmaxub %%mm1, %%mm2 \n\t" // | |||
| "pminub %%mm3, %%mm1 \n\t" // | |||
| "pmaxub %%mm0, %%mm1 \n\t" // | |||
| "pminub %%mm1, %%mm2 \n\t" | |||
| "movq %%mm2, (%%eax, %1, 2) \n\t" | |||
| "movq (%%ebx), %%mm2 \n\t" // | |||
| "movq (%%ebx, %1), %%mm1 \n\t" // | |||
| "movq %%mm2, %%mm3 \n\t" | |||
| "pmaxub %%mm0, %%mm2 \n\t" // | |||
| "pminub %%mm3, %%mm0 \n\t" // | |||
| "pmaxub %%mm1, %%mm0 \n\t" // | |||
| "pminub %%mm0, %%mm2 \n\t" | |||
| "movq %%mm2, (%%ebx) \n\t" | |||
| "movq %%mm1, (%%ebx, %1, 2) \n\t" | |||
| : : "r" (src), "r" (stride) | |||
| : "%eax", "%ebx" | |||
| ); | |||
| #else | |||
| //FIXME | |||
| int x; | |||
| for(x=0; x<8; x++) | |||
| { | |||
| src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |||
| src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |||
| src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |||
| src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |||
| src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |||
| src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |||
| src[stride*6] = (src[stride*6] + src[stride*7])>>1; | |||
| src[stride*7] = src[stride*6]; | |||
| src++; | |||
| } | |||
| #endif | |||
| } | |||
| #ifdef HAVE_ODIVX_POSTPROCESS | |||
| #include "../opendivx/postprocess.h" | |||
| int use_old_pp=0; | |||
| @@ -1841,7 +2269,6 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| * the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63) | |||
| * -63 is best quality -1 is worst | |||
| */ | |||
| //extern "C"{ | |||
| void postprocess(unsigned char * src[], int src_stride, | |||
| unsigned char * dst[], int dst_stride, | |||
| int horizontal_size, int vertical_size, | |||
| @@ -2196,6 +2623,17 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| blockCopy(vertBlock + dstStride*2, dstStride, | |||
| vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX); | |||
| if(mode & LINEAR_IPOL_DEINT_FILTER) | |||
| deInterlaceInterpolateLinear(dstBlock, dstStride); | |||
| else if(mode & LINEAR_BLEND_DEINT_FILTER) | |||
| deInterlaceBlendLinear(dstBlock, dstStride); | |||
| else if(mode & MEDIAN_DEINT_FILTER) | |||
| deInterlaceMedian(dstBlock, dstStride); | |||
| /* else if(mode & CUBIC_IPOL_DEINT_FILTER) | |||
| deInterlaceInterpolateCubic(dstBlock, dstStride); | |||
| else if(mode & CUBIC_BLEND_DEINT_FILTER) | |||
| deInterlaceBlendCubic(dstBlock, dstStride); | |||
| */ | |||
| #ifdef MORE_TIMEING | |||
| T1= rdtsc(); | |||
| @@ -2226,9 +2664,22 @@ static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStri | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| blockCopy(vertBlock + dstStride*1, dstStride, | |||
| vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX); | |||
| if(mode & LINEAR_IPOL_DEINT_FILTER) | |||
| deInterlaceInterpolateLinearLastRow(dstBlock, dstStride); | |||
| else if(mode & LINEAR_BLEND_DEINT_FILTER) | |||
| deInterlaceBlendLinearLastRow(dstBlock, dstStride); | |||
| else if(mode & MEDIAN_DEINT_FILTER) | |||
| deInterlaceMedianLastRow(dstBlock, dstStride); | |||
| /* else if(mode & CUBIC_IPOL_DEINT_FILTER) | |||
| deInterlaceInterpolateCubicLastRow(dstBlock, dstStride); | |||
| else if(mode & CUBIC_BLEND_DEINT_FILTER) | |||
| deInterlaceBlendCubicLastRow(dstBlock, dstStride); | |||
| */ | |||
| } | |||
| if(x - 8 >= 0 && x<width) | |||
| { | |||