x86/swr: convert resample_{common, linear}_double_sse2 to yasm

Signed-off-by: James Almer <jamrial@gmail.com>
312531 -> 311528 decicycles
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
11 years ago · dd2c9034b1
--- a/libswresample/resample_template.c
+++ b/libswresample/resample_template.c
@@ -25,23 +25,15 @@
 * @author Michael Niedermayer <michaelni@gmx.at>
 */
 #if    defined(TEMPLATE_RESAMPLE_DBL)     \
    || defined(TEMPLATE_RESAMPLE_DBL_SSE2)
 #if defined(TEMPLATE_RESAMPLE_DBL)
 #    define RENAME(N) N ## _double
 #    define FILTER_SHIFT 0
 #    define DELEM  double
 #    define FELEM  double
 #    define FELEM2 double
 #    define OUT(d, v) d = v
 #    if defined(TEMPLATE_RESAMPLE_DBL)
 #        define RENAME(N) N ## _double
 #    elif defined(TEMPLATE_RESAMPLE_DBL_SSE2)
 #        define COMMON_CORE COMMON_CORE_DBL_SSE2
 #        define LINEAR_CORE LINEAR_CORE_DBL_SSE2
 #        define RENAME(N) N ## _double_sse2
 #    endif
 #elif    defined(TEMPLATE_RESAMPLE_FLT)
 #    define RENAME(N) N ## _float
@@ -104,16 +96,12 @@ int RENAME(swri_resample_common)(ResampleContext *c,
    for (dst_index = 0; dst_index < n; dst_index++) {
        FELEM *filter = ((FELEM *) c->filter_bank) + c->filter_alloc * index;
 #ifdef COMMON_CORE
        COMMON_CORE
 #else
        FELEM2 val=0;
        int i;
        for (i = 0; i < c->filter_length; i++) {
            val += src[sample_index + i] * (FELEM2)filter[i];
        }
        OUT(dst[dst_index], val);
 #endif
        frac  += c->dst_incr_mod;
        index += c->dst_incr_div;
@@ -150,15 +138,11 @@ int RENAME(swri_resample_linear)(ResampleContext *c,
        FELEM *filter = ((FELEM *) c->filter_bank) + c->filter_alloc * index;
        FELEM2 val=0, v2 = 0;
 #ifdef LINEAR_CORE
        LINEAR_CORE
 #else
        int i;
        for (i = 0; i < c->filter_length; i++) {
            val += src[sample_index + i] * (FELEM2)filter[i];
            v2  += src[sample_index + i] * (FELEM2)filter[i + c->filter_alloc];
        }
 #endif
 #ifdef FELEML
        val += (v2 - val) * (FELEML) frac / c->src_incr;
 #else
@@ -188,8 +172,6 @@ int RENAME(swri_resample_linear)(ResampleContext *c,
    return sample_index;
 }
 #undef COMMON_CORE
 #undef LINEAR_CORE
 #undef RENAME
 #undef FILTER_SHIFT
 #undef DELEM
--- a/libswresample/x86/resample.asm
+++ b/libswresample/x86/resample.asm
@@ -50,11 +50,12 @@ endstruc
 SECTION_RODATA
 pf_1:      dd 1.0
 pdbl_1:    dq 1.0
 pd_0x4000: dd 0x4000
 SECTION .text
 %macro RESAMPLE_FNS 3 ; format [float or int16], bps, log2_bps
 %macro RESAMPLE_FNS 3-5 ; format [float, double or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant
 ; int resample_common_$format(ResampleContext *ctx, $format *dst,
 ;                             const $format *src, int size, int update_ctx)
 %if ARCH_X86_64 ; unix64 and win64
@@ -165,21 +166,21 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
    lea                      filterq, [min_filter_count_x4q+filterq*%2]
    mov         min_filter_count_x4q, min_filter_length_x4q
 %endif
 %ifidn %1, float
    xorps                         m0, m0, m0
 %else ; int16
 %ifidn %1, int16
    movd                          m0, [pd_0x4000]
 %else ; float/double
    xorps                         m0, m0, m0
 %endif
    align 16
 .inner_loop:
    movu                          m1, [srcq+min_filter_count_x4q*1]
 %ifidn %1, float
    mulps                         m1, m1, [filterq+min_filter_count_x4q*1]
    addps                         m0, m0, m1
 %else ; int16
 %ifidn %1, int16
    pmaddwd                       m1, [filterq+min_filter_count_x4q*1]
    paddd                         m0, m1
 %else ; float/double
    mulp%4                        m1, m1, [filterq+min_filter_count_x4q*1]
    addp%4                        m0, m0, m1
 %endif
    add         min_filter_count_x4q, mmsize
    js .inner_loop
@@ -189,16 +190,7 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
    addps                        xm0, xm1
 %endif
    ; horizontal sum & store
 %ifidn %1, float
    movhlps                      xm1, xm0
    addps                        xm0, xm1
    shufps                       xm1, xm0, xm0, q0001
    add                        fracd, dst_incr_modd
    addps                        xm0, xm1
    add                       indexd, dst_incr_divd
    movss                     [dstq], xm0
 %else ; int16
 %ifidn %1, int16
 %if mmsize == 16
    pshufd                        m1, m0, q0032
    paddd                         m0, m1
@@ -212,6 +204,17 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
    packssdw                      m0, m0
    add                       indexd, dst_incr_divd
    movd                      [dstq], m0
 %else ; float/double
    ; horizontal sum & store
    movhlps                      xm1, xm0
 %ifidn %1, float
    addps                        xm0, xm1
    shufps                       xm1, xm0, xm0, q0001
 %endif
    add                        fracd, dst_incr_modd
    addp%4                       xm0, xm1
    add                       indexd, dst_incr_divd
    movs%4                    [dstq], xm0
 %endif
    cmp                        fracd, src_incrd
    jl .skip
@@ -307,12 +310,12 @@ cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index,
    mov                   ctx_stackq, ctxq
    mov            phase_mask_stackd, phase_maskd
    mov           min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
 %ifidn %1, float
    cvtsi2ss                     xm0, src_incrd
    movss                        xm4, [pf_1]
    divss                        xm4, xm0
 %else ; int16
 %ifidn %1, int16
    movd                          m4, [pd_0x4000]
 %else ; float/double
    cvtsi2s%4                    xm0, src_incrd
    movs%4                       xm4, [%5]
    divs%4                       xm4, xm0
 %endif
    mov                dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
    shl           min_filter_len_x4d, %3
@@ -360,12 +363,12 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
    mov                           r3, dword [ctxq+ResampleContext.src_incr]
    PUSH                              dword [ctxq+ResampleContext.phase_mask]
    PUSH                              r3d
 %ifidn %1, float
    cvtsi2ss                     xm0, r3d
    movss                        xm4, [pf_1]
    divss                        xm4, xm0
 %else ; int16
 %ifidn %1, int16
    movd                          m4, [pd_0x4000]
 %else ; float/double
    cvtsi2s%4                    xm0, r3d
    movs%4                       xm4, [%5]
    divs%4                       xm4, xm0
 %endif
    mov        min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
    mov                       indexd, [ctxq+ResampleContext.index]
@@ -409,27 +412,27 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
    mov                     filter2q, filter1q
    add                     filter2q, filter_alloc_x4q
 %endif
 %ifidn %1, float
    xorps                         m0, m0, m0
    xorps                         m2, m2, m2
 %else ; int16
 %ifidn %1, int16
    mova                          m0, m4
    mova                          m2, m4
 %else ; float/double
    xorps                         m0, m0, m0
    xorps                         m2, m2, m2
 %endif
    align 16
 .inner_loop:
    movu                          m1, [srcq+min_filter_count_x4q*1]
 %ifidn %1, float
    mulps                         m3, m1, [filter2q+min_filter_count_x4q*1]
    mulps                         m1, m1, [filter1q+min_filter_count_x4q*1]
    addps                         m2, m2, m3
    addps                         m0, m0, m1
 %else ; int16
 %ifidn %1, int16
    pmaddwd                       m3, m1, [filter2q+min_filter_count_x4q*1]
    pmaddwd                       m1, [filter1q+min_filter_count_x4q*1]
    paddd                         m2, m3
    paddd                         m0, m1
 %else ; float/double
    mulp%4                        m3, m1, [filter2q+min_filter_count_x4q*1]
    mulp%4                        m1, m1, [filter1q+min_filter_count_x4q*1]
    addp%4                        m2, m2, m3
    addp%4                        m0, m0, m1
 %endif
    add         min_filter_count_x4q, mmsize
    js .inner_loop
@@ -441,24 +444,7 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
    addps                        xm2, xm3
 %endif
 %ifidn %1, float
    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
    cvtsi2ss                     xm1, fracd
    subps                        xm2, xm0
    mulps                        xm1, xm4
    shufps                       xm1, xm1, q0000
    mulps                        xm2, xm1
    addps                        xm0, xm2
    ; horizontal sum & store
    movhlps                      xm1, xm0
    addps                        xm0, xm1
    shufps                       xm1, xm0, xm0, q0001
    add                        fracd, dst_incr_modd
    addps                        xm0, xm1
    add                       indexd, dst_incr_divd
    movss                     [dstq], xm0
 %else ; int16
 %ifidn %1, int16
 %if mmsize == 16
    pshufd                        m3, m2, q0032
    pshufd                        m1, m0, q0032
@@ -491,6 +477,25 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
    ; - 32bit: eax=r0[filter1], edx=r2[filter2]
    ; - win64: eax=r6[filter1], edx=r1[todo]
    ; - unix64: eax=r6[filter1], edx=r2[todo]
 %else ; float/double
    ; val += (v2 - val) * (FELEML) frac / c->src_incr;
    cvtsi2s%4                    xm1, fracd
    subp%4                       xm2, xm0
    mulp%4                       xm1, xm4
    shufp%4                      xm1, xm1, q0000
    mulp%4                       xm2, xm1
    addp%4                       xm0, xm2
    ; horizontal sum & store
    movhlps                      xm1, xm0
 %ifidn %1, float
    addps                        xm0, xm1
    shufps                       xm1, xm0, xm0, q0001
 %endif
    add                        fracd, dst_incr_modd
    addp%4                       xm0, xm1
    add                       indexd, dst_incr_divd
    movs%4                    [dstq], xm0
 %endif
    cmp                        fracd, src_incrd
    jl .skip
@@ -553,11 +558,11 @@ cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
 %endmacro
 INIT_XMM sse
 RESAMPLE_FNS float, 4, 2
 RESAMPLE_FNS float, 4, 2, s, pf_1
 %if HAVE_AVX_EXTERNAL
 INIT_YMM avx
 RESAMPLE_FNS float, 4, 2
 RESAMPLE_FNS float, 4, 2, s, pf_1
 %endif
 %if ARCH_X86_32
@@ -567,3 +572,4 @@ RESAMPLE_FNS int16, 2, 1
 INIT_XMM sse2
 RESAMPLE_FNS int16, 2, 1
 RESAMPLE_FNS double, 8, 3, d, pdbl_1
--- a/libswresample/x86/resample_mmx.h
+++ b/libswresample/x86/resample_mmx.h
@@ -1,72 +0,0 @@
 /*
 * Copyright (c) 2012 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "libavutil/x86/asm.h"
 #include "libavutil/cpu.h"
 #include "libswresample/swresample_internal.h"
 /*
 * COMMON_CORE_DBL_SSE2: SSE2 inner loop for one output sample of the
 * double-precision resampler (dot product of src and filter).
 *
 * Expands to a declaration plus one asm statement and expects these names
 * to be in scope at the expansion site: c, src, sample_index, filter, dst
 * and dst_index.
 *
 * How it works:
 * - len starts at -8 * c->filter_length, i.e. the negated size of the
 *   filter in bytes at 8 bytes per double.
 * - Both base pointers are passed as (pointer - len), so the effective
 *   address (base, len) starts at src + sample_index / filter and walks
 *   forward as len counts up towards zero.
 * - Each iteration loads two doubles from src (unaligned load), multiplies
 *   them by two filter taps, accumulates into xmm0 and adds 16 to len;
 *   "js" repeats the loop while len is still negative.
 * - After the loop the high half of xmm0 is added to the low half and the
 *   resulting scalar is stored to dst[dst_index].
 *
 * NOTE(review): two doubles are consumed per iteration, so an odd
 * filter_length reads one element past the nominal length of both src and
 * filter - presumably covered by padding elsewhere; confirm.
 * NOTE(review): the filter is used as a direct mulpd memory operand, which
 * presumably relies on the filter bank being 16-byte aligned; confirm.
 * NOTE(review): the result is written through the pointer in %3, but the
 * statement declares neither a memory output nor a "memory" clobber; only
 * the xmm registers are listed.
 */
 #define COMMON_CORE_DBL_SSE2 \
    x86_reg len= -8*c->filter_length;\
 __asm__ volatile(\
    "xorpd     %%xmm0, %%xmm0     \n\t"\
    "1:                           \n\t"\
    "movupd  (%1, %0), %%xmm1     \n\t"\
    "mulpd   (%2, %0), %%xmm1     \n\t"\
    "addpd     %%xmm1, %%xmm0     \n\t"\
    "add       $16, %0            \n\t"\
    " js 1b                       \n\t"\
    "movhlps   %%xmm0, %%xmm1     \n\t"\
    "addpd     %%xmm1, %%xmm0     \n\t"\
    "movsd     %%xmm0, (%3)       \n\t"\
    : "+r" (len)\
    : "r" (((uint8_t*)(src+sample_index))-len),\
      "r" (((uint8_t*)filter)-len),\
      "r" (dst+dst_index)\
    XMM_CLOBBERS_ONLY("%xmm0", "%xmm1")\
 );
 /*
 * LINEAR_CORE_DBL_SSE2: SSE2 inner loop for the linearly interpolating
 * double-precision resampler. It computes two dot products of the same
 * source samples in a single pass: one against filter and one against
 * filter + c->filter_alloc.
 *
 * Expands to a declaration plus one asm statement and expects these names
 * to be in scope at the expansion site: c, src, sample_index, filter, val
 * and v2.
 *
 * How it works:
 * - len starts at -8 * c->filter_length (negated byte size at 8 bytes per
 *   double); all three base pointers are passed as (pointer - len) so that
 *   (base, len) addressing walks forward while len counts up to zero.
 * - Each iteration loads two source doubles (unaligned load), copies them,
 *   multiplies one copy by the first filter and the other by the second
 *   filter, and accumulates into xmm0 and xmm2 respectively. len is then
 *   advanced by 16 and "js" loops while it is still negative.
 * - After the loop each accumulator's high half is added to its low half;
 *   the scalar results are stored to val (first filter) and v2 (second
 *   filter) through the "=m" output operands.
 *
 * NOTE(review): two doubles are consumed per iteration, so an odd
 * filter_length reads one element past the nominal length - presumably
 * covered by padding elsewhere; confirm.
 * NOTE(review): both filters are used as direct mulpd memory operands,
 * which presumably relies on 16-byte alignment of the filter bank and of
 * the filter_alloc stride; confirm.
 */
 #define LINEAR_CORE_DBL_SSE2 \
    x86_reg len= -8*c->filter_length;\
 __asm__ volatile(\
    "xorpd      %%xmm0, %%xmm0    \n\t"\
    "xorpd      %%xmm2, %%xmm2    \n\t"\
    "1:                           \n\t"\
    "movupd   (%3, %0), %%xmm1    \n\t"\
    "movapd     %%xmm1, %%xmm3    \n\t"\
    "mulpd    (%4, %0), %%xmm1    \n\t"\
    "mulpd    (%5, %0), %%xmm3    \n\t"\
    "addpd      %%xmm1, %%xmm0    \n\t"\
    "addpd      %%xmm3, %%xmm2    \n\t"\
    "add           $16, %0        \n\t"\
    " js 1b                       \n\t"\
    "movhlps    %%xmm0, %%xmm1    \n\t"\
    "movhlps    %%xmm2, %%xmm3    \n\t"\
    "addpd      %%xmm1, %%xmm0    \n\t"\
    "addpd      %%xmm3, %%xmm2    \n\t"\
    "movsd      %%xmm0, %1        \n\t"\
    "movsd      %%xmm2, %2        \n\t"\
    : "+r" (len),\
      "=m" (val),\
      "=m" (v2)\
    : "r" (((uint8_t*)(src+sample_index))-len),\
      "r" (((uint8_t*)filter)-len),\
      "r" (((uint8_t*)(filter+c->filter_alloc))-len)\
    XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")\
 );
--- a/libswresample/x86/resample_x86_dsp.c
+++ b/libswresample/x86/resample_x86_dsp.c
@@ -27,21 +27,6 @@
 #include "libswresample/resample.h"
 int swri_resample_common_double_sse2(ResampleContext *c,  double *dst, const  double *src, int n, int update_ctx);
 int swri_resample_linear_double_sse2(ResampleContext *c,  double *dst, const  double *src, int n, int update_ctx);
 #if HAVE_SSE2_INLINE
 #define DO_RESAMPLE_ONE 0
 #include "resample_mmx.h"
 #define TEMPLATE_RESAMPLE_DBL_SSE2
 #include "libswresample/resample_template.c"
 #undef TEMPLATE_RESAMPLE_DBL_SSE2
 #endif
 #undef DO_RESAMPLE_ONE
 int ff_resample_common_int16_mmxext(ResampleContext *c, uint8_t *dst,
                                    const uint8_t *src, int sz, int upd);
 int ff_resample_linear_int16_mmxext(ResampleContext *c, uint8_t *dst,
@@ -62,6 +47,11 @@ int ff_resample_common_float_avx(ResampleContext *c, uint8_t *dst,
 int ff_resample_linear_float_avx(ResampleContext *c, uint8_t *dst,
                                 const uint8_t *src, int sz, int upd);
 int ff_resample_common_double_sse2(ResampleContext *c, uint8_t *dst,
                                   const uint8_t *src, int sz, int upd);
 int ff_resample_linear_double_sse2(ResampleContext *c, uint8_t *dst,
                                   const uint8_t *src, int sz, int upd);
 void swresample_dsp_x86_init(ResampleContext *c)
 {
    int av_unused mm_flags = av_get_cpu_flags();
@@ -78,10 +68,9 @@ void swresample_dsp_x86_init(ResampleContext *c)
    if (HAVE_SSE2_EXTERNAL && mm_flags & AV_CPU_FLAG_SSE2) {
        c->dsp.resample_common[FNIDX(S16P)] = ff_resample_common_int16_sse2;
        c->dsp.resample_linear[FNIDX(S16P)] = ff_resample_linear_int16_sse2;
    }
    if (HAVE_SSE2_INLINE && mm_flags & AV_CPU_FLAG_SSE2) {
        c->dsp.resample_common[FNIDX(DBLP)] = (resample_fn) swri_resample_common_double_sse2;
        c->dsp.resample_linear[FNIDX(DBLP)] = (resample_fn) swri_resample_linear_double_sse2;
        c->dsp.resample_common[FNIDX(DBLP)] = ff_resample_common_double_sse2;
        c->dsp.resample_linear[FNIDX(DBLP)] = ff_resample_linear_double_sse2;
    }
    if (HAVE_AVX_EXTERNAL && mm_flags & AV_CPU_FLAG_AVX) {
        c->dsp.resample_common[FNIDX(FLTP)] = ff_resample_common_float_avx;