| @@ -2193,63 +2193,63 @@ static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1 | |||
| #if HAVE_6REGS | |||
| static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1, | |||
| const float *win, int len){ | |||
| x86_reg i = -len*4; | |||
| x86_reg j = len*4-8; | |||
| __asm__ volatile( | |||
| "1: \n" | |||
| "pswapd (%5,%1), %%mm1 \n" | |||
| "movq (%5,%0), %%mm0 \n" | |||
| "pswapd (%4,%1), %%mm5 \n" | |||
| "movq (%3,%0), %%mm4 \n" | |||
| "movq %%mm0, %%mm2 \n" | |||
| "movq %%mm1, %%mm3 \n" | |||
| "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i] | |||
| "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j] | |||
| "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j] | |||
| "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i] | |||
| "pfadd %%mm3, %%mm2 \n" | |||
| "pfsub %%mm0, %%mm1 \n" | |||
| "pswapd %%mm2, %%mm2 \n" | |||
| "movq %%mm1, (%2,%0) \n" | |||
| "movq %%mm2, (%2,%1) \n" | |||
| "sub $8, %1 \n" | |||
| "add $8, %0 \n" | |||
| "jl 1b \n" | |||
| "femms \n" | |||
| :"+r"(i), "+r"(j) | |||
| :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | |||
| ); | |||
| x86_reg i = -len*4; | |||
| x86_reg j = len*4-8; | |||
| __asm__ volatile( | |||
| "1: \n" | |||
| "pswapd (%5,%1), %%mm1 \n" | |||
| "movq (%5,%0), %%mm0 \n" | |||
| "pswapd (%4,%1), %%mm5 \n" | |||
| "movq (%3,%0), %%mm4 \n" | |||
| "movq %%mm0, %%mm2 \n" | |||
| "movq %%mm1, %%mm3 \n" | |||
| "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i] | |||
| "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j] | |||
| "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j] | |||
| "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i] | |||
| "pfadd %%mm3, %%mm2 \n" | |||
| "pfsub %%mm0, %%mm1 \n" | |||
| "pswapd %%mm2, %%mm2 \n" | |||
| "movq %%mm1, (%2,%0) \n" | |||
| "movq %%mm2, (%2,%1) \n" | |||
| "sub $8, %1 \n" | |||
| "add $8, %0 \n" | |||
| "jl 1b \n" | |||
| "femms \n" | |||
| :"+r"(i), "+r"(j) | |||
| :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | |||
| ); | |||
| } | |||
| static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1, | |||
| const float *win, int len){ | |||
| x86_reg i = -len*4; | |||
| x86_reg j = len*4-16; | |||
| __asm__ volatile( | |||
| "1: \n" | |||
| "movaps (%5,%1), %%xmm1 \n" | |||
| "movaps (%5,%0), %%xmm0 \n" | |||
| "movaps (%4,%1), %%xmm5 \n" | |||
| "movaps (%3,%0), %%xmm4 \n" | |||
| "shufps $0x1b, %%xmm1, %%xmm1 \n" | |||
| "shufps $0x1b, %%xmm5, %%xmm5 \n" | |||
| "movaps %%xmm0, %%xmm2 \n" | |||
| "movaps %%xmm1, %%xmm3 \n" | |||
| "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i] | |||
| "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j] | |||
| "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j] | |||
| "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i] | |||
| "addps %%xmm3, %%xmm2 \n" | |||
| "subps %%xmm0, %%xmm1 \n" | |||
| "shufps $0x1b, %%xmm2, %%xmm2 \n" | |||
| "movaps %%xmm1, (%2,%0) \n" | |||
| "movaps %%xmm2, (%2,%1) \n" | |||
| "sub $16, %1 \n" | |||
| "add $16, %0 \n" | |||
| "jl 1b \n" | |||
| :"+r"(i), "+r"(j) | |||
| :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | |||
| ); | |||
| x86_reg i = -len*4; | |||
| x86_reg j = len*4-16; | |||
| __asm__ volatile( | |||
| "1: \n" | |||
| "movaps (%5,%1), %%xmm1 \n" | |||
| "movaps (%5,%0), %%xmm0 \n" | |||
| "movaps (%4,%1), %%xmm5 \n" | |||
| "movaps (%3,%0), %%xmm4 \n" | |||
| "shufps $0x1b, %%xmm1, %%xmm1 \n" | |||
| "shufps $0x1b, %%xmm5, %%xmm5 \n" | |||
| "movaps %%xmm0, %%xmm2 \n" | |||
| "movaps %%xmm1, %%xmm3 \n" | |||
| "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i] | |||
| "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j] | |||
| "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j] | |||
| "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i] | |||
| "addps %%xmm3, %%xmm2 \n" | |||
| "subps %%xmm0, %%xmm1 \n" | |||
| "shufps $0x1b, %%xmm2, %%xmm2 \n" | |||
| "movaps %%xmm1, (%2,%0) \n" | |||
| "movaps %%xmm2, (%2,%1) \n" | |||
| "sub $16, %1 \n" | |||
| "add $16, %0 \n" | |||
| "jl 1b \n" | |||
| :"+r"(i), "+r"(j) | |||
| :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len) | |||
| ); | |||
| } | |||
| #endif /* HAVE_6REGS */ | |||