This tries to compensate for the errors introduced by the rounding of pmulhw.

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>

tags/n1.2
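What the patch does, in scalar terms: pmulhw keeps only the high 16 bits of each product, truncating toward negative infinity, so summing filterSize products loses up to one intermediate ulp per tap (half an ulp on average). The patch pre-biases the dither words by 8*(filterSize-1) before the final arithmetic right shift by 4; after the shift that contributes roughly (filterSize-1)/2, recentring the accumulated truncation error around zero. A minimal scalar sketch of the idea, not the actual FFmpeg code (names and scaling here are illustrative):

#include <stdint.h>

/* One output sample: start from the biased, down-shifted dither word,
 * then accumulate pmulhw-style truncated products, as the asm does. */
static int filter_one_sample(const int16_t *coeff, const int16_t *const *src,
                             int x, int filterSize, int16_t dither_word)
{
    int acc = (dither_word + ((filterSize - 1) << 3)) >> 4; /* paddw + psraw $4 */
    for (int i = 0; i < filterSize; i++)
        acc += (coeff[i] * src[i][x]) >> 16;                /* pmulhw truncation */
    return acc < 0 ? 0 : acc > 255 ? 255 : acc;             /* clip to 8 bits */
}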
@@ -226,10 +226,20 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
         :: "r"(dither)
         );
     }
+    filterSize--;
     __asm__ volatile(
         "pxor %%xmm0, %%xmm0\n\t"
         "punpcklbw %%xmm0, %%xmm3\n\t"
-        "psraw $4, %%xmm3\n\t"
+        "movd %0, %%xmm1\n\t"
+        "punpcklwd %%xmm1, %%xmm1\n\t"
+        "punpckldq %%xmm1, %%xmm1\n\t"
+        "punpcklqdq %%xmm1, %%xmm1\n\t"
+        "psllw $3, %%xmm1\n\t"
+        "paddw %%xmm1, %%xmm3\n\t"
+        "psraw $4, %%xmm3\n\t"
+        ::"m"(filterSize)
+        );
+    __asm__ volatile(
         "movdqa %%xmm3, %%xmm4\n\t"
         "movdqa %%xmm3, %%xmm7\n\t"
         "movl %3, %%ecx\n\t"
@@ -71,9 +71,20 @@ static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
                              const uint8_t *dither, int offset)
 {
     dither_8to16(dither, offset);
-    __asm__ volatile(\
+    filterSize--;
+    __asm__ volatile(
+        "movd %0, %%mm1\n\t"
+        "punpcklwd %%mm1, %%mm1\n\t"
+        "punpckldq %%mm1, %%mm1\n\t"
+        "psllw $3, %%mm1\n\t"
+        "paddw %%mm1, %%mm3\n\t"
+        "paddw %%mm1, %%mm4\n\t"
         "psraw $4, %%mm3\n\t"
         "psraw $4, %%mm4\n\t"
+        ::"m"(filterSize)
+        );
+    __asm__ volatile(\
         "movq %%mm3, %%mm6\n\t"
         "movq %%mm4, %%mm7\n\t"
         "movl %3, %%ecx\n\t"