x86: fft: convert sse inline asm to yasm

13 years ago · 8299260470
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -39,7 +39,6 @@ YASM-OBJS-$(CONFIG_DCT)                += x86/dct32_sse.o
 YASM-OBJS-$(CONFIG_ENCODERS)           += x86/dsputilenc_yasm.o
 YASM-OBJS-FFT-$(HAVE_AMD3DNOW)         += x86/fft_3dn.o
 YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT)      += x86/fft_3dn2.o
 YASM-OBJS-FFT-$(HAVE_SSE)              += x86/fft_sse.o
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft_mmx.o                 \
                                          $(YASM-OBJS-FFT-yes)
 YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o           \
--- a/libavcodec/x86/fft_mmx.asm
+++ b/libavcodec/x86/fft_mmx.asm
@@ -45,6 +45,10 @@ struc FFTContext
    .mdctbits: resd 1
    .tcos:     pointer 1
    .tsin:     pointer 1
    .fftperm:  pointer 1
    .fftcalc:  pointer 1
    .imdctcalc:pointer 1
    .imdcthalf:pointer 1
 endstruc

 SECTION_RODATA
@@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
 perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
 ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
 ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
 ps_m1m1m1m1: times 4 dd 1<<31
 ps_m1p1: dd 1<<31, 0

 %assign i 16
@@ -532,6 +537,16 @@ DEFINE_ARGS z, w, n, o1, o3
    rep ret
 %endmacro

 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
 %ifdef PIC
    lea r3, [$$]
    add r2, r3
 %endif
    call r2
 %endmacro ; FFT_DISPATCH

 INIT_YMM avx

 %if HAVE_AVX
@@ -548,6 +563,14 @@ INIT_YMM avx

 DECL_PASS pass_avx, PASS_BIG 1
 DECL_PASS pass_interleave_avx, PASS_BIG 0

 cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    REP_RET

 %endif

 INIT_XMM sse
@@ -565,6 +588,112 @@ INIT_XMM sse
 DECL_PASS pass_sse, PASS_BIG 1
 DECL_PASS pass_interleave_sse, PASS_BIG 0

 cglobal fft_calc, 2,5,8
    mov     r3d, [r0 + FFTContext.nbits]
    PUSH    r1
    PUSH    r3
    mov     r0, r1
    mov     r1, r3
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP     rcx
    POP     r4
    cmp     rcx, 4
    jg      .end
    mov     r2, -1
    add     rcx, 3
    shl     r2, cl
    sub     r4, r2
 .loop
    movaps   xmm0, [r4 + r2]
    movaps   xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps   [r4 + r2],      xmm0
    movaps   [r4 + r2 + 16], xmm1
    add      r2, 32
    jl       .loop
 .end:
    REP_RET

 cextern_naked memcpy

 cglobal fft_permute, 2,7,1
    mov     r4,  [r0 + FFTContext.revtab]
    mov     r5,  [r0 + FFTContext.tmpbuf]
    mov     ecx, [r0 + FFTContext.nbits]
    mov     r2, 1
    shl     r2, cl
    xor     r0, r0
 %if ARCH_X86_32
    mov     r1, r1m
 %endif
 .loop:
    movaps  xmm0, [r1 + 8*r0]
    movzx   r6, word [r4 + 2*r0]
    movzx   r3, word [r4 + 2*r0 + 2]
    movlps  [r5 + 8*r6], xmm0
    movhps  [r5 + 8*r3], xmm0
    add     r0, 2
    cmp     r0, r2
    jl      .loop
    shl     r2, 3
 %if ARCH_X86_64
    mov     r0, r1
    mov     r1, r5
 %else
    push    r2
    push    r5
    push    r1
 %endif
 %if ARCH_X86_64 && WIN64 == 0
    jmp     memcpy
 %else
    call    memcpy
 %if ARCH_X86_32
    add     esp, 12
 %endif
    REP_RET
 %endif

 cglobal imdct_calc, 3,5,3
    mov     r3d, [r0 + FFTContext.mdctsize]
    mov     r4,  [r0 + FFTContext.imdcthalf]
    add     r1,  r3
    PUSH    r3
    PUSH    r1
 %if ARCH_X86_32
    push    r2
    push    r1
    push    r0
 %else
    sub     rsp, 8
 %endif
    call    r4
 %if ARCH_X86_32
    add     esp, 12
 %else
    add     rsp, 8
 %endif
    POP     r1
    POP     r3
    lea     r0, [r1 + 2*r3]
    mov     r2, r3
    sub     r3, 16
    neg     r2
    movaps  xmm2, [ps_m1m1m1m1]
 .loop:
    movaps  xmm0, [r1 + r3]
    movaps  xmm1, [r0 + r2]
    shufps  xmm0, xmm0, 0x1b
    shufps  xmm1, xmm1, 0x1b
    xorps   xmm0, xmm2
    movaps  [r0 + r3], xmm1
    movaps  [r1 + r2], xmm0
    sub     r3, 16
    add     r2, 16
    jl      .loop
    REP_RET

 INIT_MMX 3dnow
 %define mulps pfmul
 %define addps pfadd
@@ -582,16 +711,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0
 %define SECTION_REL
 %endif

 %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
    lea r2, [dispatch_tab%1]
    mov r2, [r2 + (%2q-2)*gprsize]
 %ifdef PIC
    lea r3, [$$]
    add r2, r3
 %endif
    call r2
 %endmacro ; FFT_DISPATCH

 %macro DECL_FFT 1-2 ; nbits, suffix
 %ifidn %0, 1
 %xdefine fullsuffix SUFFIX
--- a/libavcodec/x86/fft_sse.c
+++ b/libavcodec/x86/fft_sse.c
@@ -1,110 +0,0 @@
 /*
 * FFT/MDCT transform with SSE optimizations
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/x86_cpu.h"
 #include "libavcodec/dsputil.h"
 #include "fft.h"
 #include "config.h"

 DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] =
    { 1U << 31, 1U << 31, 1U << 31, 1U << 31 };

 void ff_fft_dispatch_sse(FFTComplex *z, int nbits);
 void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits);
 void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits);

 #if HAVE_AVX
 void ff_fft_calc_avx(FFTContext *s, FFTComplex *z)
 {
    ff_fft_dispatch_interleave_avx(z, s->nbits);
 }
 #endif

 void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
 {
    int n = 1 << s->nbits;

    ff_fft_dispatch_interleave_sse(z, s->nbits);

    if(n <= 16) {
        x86_reg i = -8*n;
        __asm__ volatile(
            "1: \n"
            "movaps     (%0,%1), %%xmm0 \n"
            "movaps      %%xmm0, %%xmm1 \n"
            "unpcklps 16(%0,%1), %%xmm0 \n"
            "unpckhps 16(%0,%1), %%xmm1 \n"
            "movaps      %%xmm0,   (%0,%1) \n"
            "movaps      %%xmm1, 16(%0,%1) \n"
            "add $32, %0 \n"
            "jl 1b \n"
            :"+r"(i)
            :"r"(z+n)
            :"memory"
        );
    }
 }

 void ff_fft_permute_sse(FFTContext *s, FFTComplex *z)
 {
    int n = 1 << s->nbits;
    int i;
    for(i=0; i<n; i+=2) {
        __asm__ volatile(
            "movaps %2, %%xmm0 \n"
            "movlps %%xmm0, %0 \n"
            "movhps %%xmm0, %1 \n"
            :"=m"(s->tmp_buf[s->revtab[i]]),
             "=m"(s->tmp_buf[s->revtab[i+1]])
            :"m"(z[i])
        );
    }
    memcpy(z, s->tmp_buf, n*sizeof(FFTComplex));
 }

 void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input)
 {
    x86_reg j, k;
    long n = s->mdct_size;
    long n4 = n >> 2;

    s->imdct_half(s, output + n4, input);

    j = -n;
    k = n-16;
    __asm__ volatile(
        "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n"
        "1: \n"
        "movaps       (%2,%1), %%xmm0 \n"
        "movaps       (%3,%0), %%xmm1 \n"
        "shufps $0x1b, %%xmm0, %%xmm0 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "xorps         %%xmm7, %%xmm0 \n"
        "movaps        %%xmm1, (%3,%1) \n"
        "movaps        %%xmm0, (%2,%0) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(j), "+r"(k)
        :"r"(output+n4), "r"(output+n4*3)
        XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm7")
    );
 }