Untested (no G200 MGA card, or whatever I would need ...). Also adds an experimental SSE2 version (even less tested, as I have no P4 either ...). The SSE2 version needs 16-byte-aligned src & dst, otherwise it crashes with signal 11 (SIGSEGV). The SSE2 version is disabled by default. Originally committed as revision 5338 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc tags/v0.5
| @@ -409,3 +409,21 @@ void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst | |||||
| rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); | rgb24toyv12_C(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride); | ||||
| #endif | #endif | ||||
| } | } | ||||
/**
 * Public entry point for byte interleaving: forwards all arguments,
 * unchanged, to one of the interleaveBytes_* implementations.
 *
 * When x86 asm support is compiled in (CAN_COMPILE_X86_ASM), the variant is
 * picked at run time from the gCpuCaps flags, trying MMX2, then 3DNow, then
 * plain MMX. If none of those flags is set, or x86 asm is not compiled in,
 * the C implementation is used.
 */
void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
		     int width, int height, int src1Stride, int src2Stride, int dstStride)
{
#ifdef CAN_COMPILE_X86_ASM
	/* Candidates are tried from fastest to slowest; the first match wins. */
	if (gCpuCaps.hasMMX2) {
		interleaveBytes_MMX2(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
		return;
	}
	if (gCpuCaps.has3DNow) {
		interleaveBytes_3DNow(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
		return;
	}
	if (gCpuCaps.hasMMX) {
		interleaveBytes_MMX(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
		return;
	}
#endif
	/* No accelerated variant selected: use the C implementation. */
	interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
}
| @@ -34,6 +34,10 @@ extern void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_ | |||||
| unsigned int width, unsigned int height, | unsigned int width, unsigned int height, | ||||
| unsigned int lumStride, unsigned int chromStride, unsigned int srcStride); | unsigned int lumStride, unsigned int chromStride, unsigned int srcStride); | ||||
| extern void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst, | |||||
| int width, int height, int src1Stride, int src2Stride, int dstStride); | |||||
| #define MODE_RGB 0x1 | #define MODE_RGB 0x1 | ||||
| #define MODE_BGR 0x2 | #define MODE_BGR 0x2 | ||||
| @@ -1197,3 +1197,83 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ | |||||
| src += srcStride; | src += srcStride; | ||||
| } | } | ||||
| } | } | ||||
| void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest, | |||||
| int width, int height, int src1Stride, int src2Stride, int dstStride){ | |||||
| int h; | |||||
| for(h=0; h < height; h++) | |||||
| { | |||||
| int w; | |||||
| #ifdef HAVE_MMX | |||||
| #ifdef HAVE_SSE2 | |||||
| asm( | |||||
| "xorl %%eax, %%eax \n\t" | |||||
| "1: \n\t" | |||||
| PREFETCH" 64(%1, %%eax) \n\t" | |||||
| PREFETCH" 64(%2, %%eax) \n\t" | |||||
| "movdqa (%1, %%eax), %%xmm0 \n\t" | |||||
| "movdqa (%1, %%eax), %%xmm1 \n\t" | |||||
| "movdqa (%2, %%eax), %%xmm2 \n\t" | |||||
| "punpcklbw %%xmm2, %%xmm0 \n\t" | |||||
| "punpckhbw %%xmm2, %%xmm1 \n\t" | |||||
| "movntdq %%xmm0, (%0, %%eax, 2) \n\t" | |||||
| "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t" | |||||
| "addl $16, %%eax \n\t" | |||||
| "cmpl %3, %%eax \n\t" | |||||
| " jb 1b \n\t" | |||||
| ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | |||||
| : "memory", "%eax" | |||||
| ); | |||||
| #else | |||||
| asm( | |||||
| "xorl %%eax, %%eax \n\t" | |||||
| "1: \n\t" | |||||
| PREFETCH" 64(%1, %%eax) \n\t" | |||||
| PREFETCH" 64(%2, %%eax) \n\t" | |||||
| "movq (%1, %%eax), %%mm0 \n\t" | |||||
| "movq 8(%1, %%eax), %%mm2 \n\t" | |||||
| "movq %%mm0, %%mm1 \n\t" | |||||
| "movq %%mm2, %%mm3 \n\t" | |||||
| "movq (%2, %%eax), %%mm4 \n\t" | |||||
| "movq 8(%2, %%eax), %%mm5 \n\t" | |||||
| "punpcklbw %%mm4, %%mm0 \n\t" | |||||
| "punpckhbw %%mm4, %%mm1 \n\t" | |||||
| "punpcklbw %%mm5, %%mm2 \n\t" | |||||
| "punpckhbw %%mm5, %%mm3 \n\t" | |||||
| MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t" | |||||
| MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t" | |||||
| MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t" | |||||
| MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t" | |||||
| "addl $16, %%eax \n\t" | |||||
| "cmpl %3, %%eax \n\t" | |||||
| " jb 1b \n\t" | |||||
| ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15) | |||||
| : "memory", "%eax" | |||||
| ); | |||||
| #endif | |||||
| for(w= (width&(~15)); w < width; w++) | |||||
| { | |||||
| dest[2*w+0] = src1[w]; | |||||
| dest[2*w+1] = src2[w]; | |||||
| } | |||||
| #else | |||||
| for(w=0; w < width; w++) | |||||
| { | |||||
| dest[2*w+0] = src1[w]; | |||||
| dest[2*w+1] = src2[w]; | |||||
| } | |||||
| #endif | |||||
| dest += dstStride; | |||||
| src1 += src1Stride; | |||||
| src2 += src2Stride; | |||||
| } | |||||
| #ifdef HAVE_MMX | |||||
| asm( | |||||
| EMMS" \n\t" | |||||
| SFENCE" \n\t" | |||||
| ::: "memory" | |||||
| ); | |||||
| #endif | |||||
| } | |||||