|
|
@@ -156,3 +156,38 @@ __asm__ volatile(\
       "r" (((uint8_t*)filter)-len),\
       "r" (dst+dst_index)\
 );
+
+#define LINEAR_CORE_FLT_SSE \
+    x86_reg len= -4*c->filter_length;\
+__asm__ volatile(\
+    "xorps      %%xmm0, %%xmm0     \n\t"\
+    "xorps      %%xmm2, %%xmm2     \n\t"\
+    "1:                            \n\t"\
+    "movups   (%3, %0), %%xmm1     \n\t"\
+    "movaps     %%xmm1, %%xmm3     \n\t"\
+    "mulps    (%4, %0), %%xmm1     \n\t"\
+    "mulps    (%5, %0), %%xmm3     \n\t"\
+    "addps      %%xmm1, %%xmm0     \n\t"\
+    "addps      %%xmm3, %%xmm2     \n\t"\
+    "add           $16, %0         \n\t"\
+    " js 1b                        \n\t"\
+    "movhlps    %%xmm0, %%xmm1     \n\t"\
+    "movhlps    %%xmm2, %%xmm3     \n\t"\
+    "addps      %%xmm1, %%xmm0     \n\t"\
+    "addps      %%xmm3, %%xmm2     \n\t"\
+    "movss      %%xmm0, %%xmm1     \n\t"\
+    "movss      %%xmm2, %%xmm3     \n\t"\
+    "shufps $1, %%xmm0, %%xmm0     \n\t"\
+    "shufps $1, %%xmm2, %%xmm2     \n\t"\
+    "addps      %%xmm1, %%xmm0     \n\t"\
+    "addps      %%xmm3, %%xmm2     \n\t"\
+    "movss      %%xmm0, %1         \n\t"\
+    "movss      %%xmm2, %2         \n\t"\
+    : "+r" (len),\
+      "=m" (val),\
+      "=m" (v2)\
+    : "r" (((uint8_t*)(src+sample_index))-len),\
+      "r" (((uint8_t*)filter)-len),\
+      "r" (((uint8_t*)(filter+c->filter_alloc))-len)\
+    XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
+);
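
Note on the hunk above: the new LINEAR_CORE_FLT_SSE macro computes two dot products in one pass. The same unaligned source window (loaded with movups through %3) is multiplied against two filter tables at once, the current filter phase at %4 and the table c->filter_alloc floats further on at %5, accumulating into xmm0 and xmm2, which are then reduced with the movhlps/shufps horizontal adds and stored to val and v2. The loop index %0 starts at -4*c->filter_length bytes and is advanced by 16 until it becomes non-negative, so four floats are processed per iteration. A minimal scalar sketch of the same computation follows; the standalone prototype is purely illustrative, since in the real template these are locals of the templated resample function:

    /* Scalar reference for the LINEAR_CORE_FLT_SSE hunk above.
     * The prototype is hypothetical; in context, src, sample_index,
     * filter, val and v2 come from the surrounding resampler code. */
    static void linear_core_flt_ref(const float *src, int sample_index,
                                    const float *filter, int filter_length,
                                    int filter_alloc, float *val, float *v2)
    {
        float acc0 = 0.0f, acc1 = 0.0f;
        for (int i = 0; i < filter_length; i++) {
            acc0 += src[sample_index + i] * filter[i];                /* current phase */
            acc1 += src[sample_index + i] * filter[filter_alloc + i]; /* next phase    */
        }
        *val = acc0;
        *v2  = acc1;
    }

Producing both sums in the same loop is what allows the caller, presumably the linear-interpolation path of the resampler, to blend val and v2 afterwards without a second pass over the filter.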