| @@ -39,7 +39,6 @@ YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o | |||
| YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o | |||
| YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o | |||
| YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o | |||
| YASM-OBJS-FFT-$(HAVE_SSE) += x86/fft_sse.o | |||
| YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \ | |||
| $(YASM-OBJS-FFT-yes) | |||
| YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ | |||
| @@ -45,6 +45,10 @@ struc FFTContext | |||
| .mdctbits: resd 1 | |||
| .tcos: pointer 1 | |||
| .tsin: pointer 1 | |||
| .fftperm: pointer 1 | |||
| .fftcalc: pointer 1 | |||
| .imdctcalc:pointer 1 | |||
| .imdcthalf:pointer 1 | |||
| endstruc | |||
| SECTION_RODATA | |||
| @@ -65,6 +69,7 @@ perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 | |||
| perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 | |||
| ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 | |||
| ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 | |||
; Four dwords with only bit 31 set (0x80000000 each). imdct_calc loads this
; vector and XORs it into packed single-precision data, which flips the sign
; bit of each of the four lanes.
| ps_m1m1m1m1: times 4 dd 1<<31 | |||
| ps_m1p1: dd 1<<31, 0 | |||
| %assign i 16 | |||
| @@ -532,6 +537,16 @@ DEFINE_ARGS z, w, n, o1, o3 | |||
| rep ret | |||
| %endmacro | |||
; FFT_DISPATCH table_suffix, index_reg
;   %1  suffix appended to "dispatch_tab" to select which table is used
;   %2  register name; its "q" form supplies the table index
; Fetches entry (%2 - 2) of dispatch_tab%1 (entries are gprsize bytes wide)
; and calls it indirectly through r2.
; When PIC is defined, the address of the start of the current section ($$)
; is added to the fetched value before the call.
; NOTE(review): this suggests the table holds section-relative offsets under
; PIC - confirm against the table definition, which is not visible here.
; r2 is always overwritten; r3 is overwritten only under PIC.
| %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs | |||
| lea r2, [dispatch_tab%1] | |||
| mov r2, [r2 + (%2q-2)*gprsize] | |||
| %ifdef PIC | |||
| lea r3, [$$] | |||
| add r2, r3 | |||
| %endif | |||
| call r2 | |||
| %endmacro ; FFT_DISPATCH | |||
| INIT_YMM avx | |||
| %if HAVE_AVX | |||
| @@ -548,6 +563,14 @@ INIT_YMM avx | |||
| DECL_PASS pass_avx, PASS_BIG 1 | |||
| DECL_PASS pass_interleave_avx, PASS_BIG 0 | |||
; fft_calc (instantiated under INIT_YMM avx)
; In:  r0 = pointer to an FFTContext, r1 = second argument (data pointer)
; Reads FFTContext.nbits, then re-packs the arguments as (r0 = data pointer,
; r1 = nbits) and calls the routine selected by FFT_DISPATCH from the
; "_interleave" dispatch table for the current SUFFIX.
; r3 is used as scratch for nbits; FFT_DISPATCH overwrites r2 (and r3 under PIC).
| cglobal fft_calc, 2,5,8 | |||
; r3d = ctx->nbits (32-bit load)
| mov r3d, [r0 + FFTContext.nbits] | |||
; shuffle into the (data, nbits) argument order used by the dispatch target
| mov r0, r1 | |||
| mov r1, r3 | |||
| FFT_DISPATCH _interleave %+ SUFFIX, r1 | |||
| REP_RET | |||
| %endif | |||
| INIT_XMM sse | |||
| @@ -565,6 +588,112 @@ INIT_XMM sse | |||
| DECL_PASS pass_sse, PASS_BIG 1 | |||
| DECL_PASS pass_interleave_sse, PASS_BIG 0 | |||
; fft_calc (instantiated under INIT_XMM sse)
; In:  r0 = pointer to an FFTContext, r1 = data pointer
; Calls the routine selected from the "_interleave" dispatch table with
; (r0 = data pointer, r1 = nbits). When nbits <= 4 the buffer is then
; post-processed in place: every 32-byte group has its two 16-byte halves
; interleaved with unpcklps/unpckhps.
; Uses r2-r4, rcx, xmm0 and xmm1 in addition to whatever the dispatch target
; clobbers.
| cglobal fft_calc, 2,5,8 | |||
; r3d = ctx->nbits
| mov r3d, [r0 + FFTContext.nbits] | |||
; keep the data pointer and nbits on the stack across the dispatched call
| PUSH r1 | |||
| PUSH r3 | |||
| mov r0, r1 | |||
| mov r1, r3 | |||
| FFT_DISPATCH _interleave %+ SUFFIX, r1 | |||
; pops mirror the pushes: rcx = nbits, r4 = data pointer
| POP rcx | |||
| POP r4 | |||
; nothing more to do when nbits > 4
| cmp rcx, 4 | |||
| jg .end | |||
; r2 = -1 << (nbits + 3) = -(8 << nbits): negative byte length of the buffer
| mov r2, -1 | |||
| add rcx, 3 | |||
| shl r2, cl | |||
; r4 = one past the end of the buffer, so [r4 + r2] starts at its beginning
| sub r4, r2 | |||
; label now carries its colon, matching .end: and avoiding the assembler's
; "label alone on a line without a colon" warning
| .loop: | |||
| movaps xmm0, [r4 + r2] | |||
| movaps xmm1, xmm0 | |||
| unpcklps xmm0, [r4 + r2 + 16] | |||
| unpckhps xmm1, [r4 + r2 + 16] | |||
| movaps [r4 + r2], xmm0 | |||
| movaps [r4 + r2 + 16], xmm1 | |||
; advance 32 bytes; r2 counts up towards zero
| add r2, 32 | |||
| jl .loop | |||
| .end: | |||
| REP_RET | |||
| cextern_naked memcpy | |||
; fft_permute
; In:  r0 = pointer to an FFTContext, r1 = data pointer (read with movaps, so
;      it must be 16-byte aligned)
; Scatters the 8-byte elements of the data buffer into FFTContext.tmpbuf at
; the positions given by the 16-bit entries of FFTContext.revtab, then copies
; tmpbuf back over the data buffer.
; The element count is 1 << nbits; both loops handle two elements (16 bytes)
; per iteration and always run at least once.
; The copy-back is done inline instead of through libc memcpy, so no external
; call is made from this function and none of the per-ABI call requirements
; (stack alignment, WIN64 shadow space, cdecl argument pushes) apply.
; Uses r0-r6, ecx and xmm0.
| cglobal fft_permute, 2,7,1 | |||
| mov r4, [r0 + FFTContext.revtab] | |||
| mov r5, [r0 + FFTContext.tmpbuf] | |||
| mov ecx, [r0 + FFTContext.nbits] | |||
; r2 = 1 << nbits = number of elements
| mov r2, 1 | |||
| shl r2, cl | |||
; r0 becomes the element index
| xor r0, r0 | |||
; NOTE(review): on x86_32 r1 presumably aliases ecx, which the nbits load
; above overwrote, so the data pointer is re-read from its stack slot
| %if ARCH_X86_32 | |||
| mov r1, r1m | |||
| %endif | |||
| .loop: | |||
; xmm0 = elements r0 and r0+1 of the input
| movaps xmm0, [r1 + 8*r0] | |||
; r6, r3 = destination indices for those two elements
| movzx r6, word [r4 + 2*r0] | |||
| movzx r3, word [r4 + 2*r0 + 2] | |||
| movlps [r5 + 8*r6], xmm0 | |||
| movhps [r5 + 8*r3], xmm0 | |||
| add r0, 2 | |||
| cmp r0, r2 | |||
| jl .loop | |||
; r2 = total size in bytes (8 bytes per element)
| shl r2, 3 | |||
; point r1/r5 one past the end of each buffer and walk a negative offset up
; to zero
| add r1, r2 | |||
| add r5, r2 | |||
| neg r2 | |||
| .copy: | |||
; unaligned load: the alignment of tmpbuf is not established in this
; function; the store is aligned because the data buffer was already read
; with movaps
| movups xmm0, [r5 + r2] | |||
| movaps [r1 + r2], xmm0 | |||
| add r2, 16 | |||
| jl .copy | |||
| REP_RET | |||
; imdct_calc
; In:  r0 = pointer to an FFTContext, r1 = output buffer, r2 = input buffer
; Step 1: call the function pointer stored in FFTContext.imdcthalf with
;         (context, output + mdctsize bytes, input).
; Step 2: mirror the result into the outer parts of the output. With
;         n = mdctsize (in bytes), A = output + n and B = A + 2*n, each
;         iteration reads 16 bytes from A (walking down from A + n - 16) and
;         from B (walking up from B - n), reverses the four floats of each
;         vector, negates the one taken from A, and stores them at B + r3 and
;         A + r2 respectively.
; Uses r0-r4 and xmm0-xmm2.
| cglobal imdct_calc, 3,5,3 | |||
| mov r3d, [r0 + FFTContext.mdctsize] | |||
| mov r4, [r0 + FFTContext.imdcthalf] | |||
; r1 = output + n bytes
| add r1, r3 | |||
; keep n and the adjusted output pointer across the call
| PUSH r3 | |||
| PUSH r1 | |||
| %if ARCH_X86_32 | |||
; stack-based call: push the three arguments right to left
| push r2 | |||
| push r1 | |||
| push r0 | |||
| %else | |||
; 8 bytes of padding (NOTE(review): presumably to restore 16-byte stack
; alignment after the two PUSHes - confirm against the prologue), plus the
; 32-byte shadow space the Windows x64 convention requires the caller to
; reserve before any call
| sub rsp, 8+32*WIN64 | |||
| %endif | |||
| call r4 | |||
| %if ARCH_X86_32 | |||
| add esp, 12 | |||
| %else | |||
| add rsp, 8+32*WIN64 | |||
| %endif | |||
| POP r1 | |||
| POP r3 | |||
; r0 = B = A + 2*n
| lea r0, [r1 + 2*r3] | |||
; r2 = -n (ascending offset), r3 = n - 16 (descending offset)
| mov r2, r3 | |||
| sub r3, 16 | |||
| neg r2 | |||
; xmm2 = sign-bit mask for four floats
| movaps xmm2, [ps_m1m1m1m1] | |||
| .loop: | |||
| movaps xmm0, [r1 + r3] | |||
| movaps xmm1, [r0 + r2] | |||
; 0x1b reverses the order of the four floats
| shufps xmm0, xmm0, 0x1b | |||
| shufps xmm1, xmm1, 0x1b | |||
; negate only the vector taken from A
| xorps xmm0, xmm2 | |||
| movaps [r0 + r3], xmm1 | |||
| movaps [r1 + r2], xmm0 | |||
| sub r3, 16 | |||
| add r2, 16 | |||
| jl .loop | |||
| REP_RET | |||
| INIT_MMX 3dnow | |||
| %define mulps pfmul | |||
| %define addps pfadd | |||
| @@ -582,16 +711,6 @@ DECL_PASS pass_interleave_3dnow, PASS_BIG 0 | |||
| %define SECTION_REL | |||
| %endif | |||
; FFT_DISPATCH table_suffix, index_reg
;   %1  suffix appended to "dispatch_tab" to select which table is used
;   %2  register name; its "q" form supplies the table index
; Fetches entry (%2 - 2) of dispatch_tab%1 (entries are gprsize bytes wide)
; and calls it indirectly through r2.
; When PIC is defined, the address of the start of the current section ($$)
; is added to the fetched value before the call.
; NOTE(review): this suggests the table holds section-relative offsets under
; PIC - confirm against the table definition, which is not visible here.
; r2 is always overwritten; r3 is overwritten only under PIC.
| %macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs | |||
| lea r2, [dispatch_tab%1] | |||
| mov r2, [r2 + (%2q-2)*gprsize] | |||
| %ifdef PIC | |||
| lea r3, [$$] | |||
| add r2, r3 | |||
| %endif | |||
| call r2 | |||
| %endmacro ; FFT_DISPATCH | |||
| %macro DECL_FFT 1-2 ; nbits, suffix | |||
| %ifidn %0, 1 | |||
| %xdefine fullsuffix SUFFIX | |||
| @@ -1,110 +0,0 @@ | |||
| /* | |||
| * FFT/MDCT transform with SSE optimizations | |||
| * Copyright (c) 2008 Loren Merritt | |||
| * | |||
| * This file is part of Libav. | |||
| * | |||
| * Libav is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * Libav is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with Libav; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #include "libavutil/x86_cpu.h" | |||
| #include "libavcodec/dsputil.h" | |||
| #include "fft.h" | |||
| #include "config.h" | |||
/* Four 32-bit words with only bit 31 set. ff_imdct_calc_sse loads this into
 * xmm7 and XORs it into a vector of floats to flip the sign of each lane. */
| DECLARE_ASM_CONST(16, unsigned int, ff_m1m1m1m1)[4] = | |||
| { 1U << 31, 1U << 31, 1U << 31, 1U << 31 }; | |||
| void ff_fft_dispatch_sse(FFTComplex *z, int nbits); | |||
| void ff_fft_dispatch_interleave_sse(FFTComplex *z, int nbits); | |||
| void ff_fft_dispatch_interleave_avx(FFTComplex *z, int nbits); | |||
| #if HAVE_AVX | |||
/* AVX entry point: forwards the buffer and the transform size exponent
 * stored in the context to the AVX interleaving dispatcher. */
| void ff_fft_calc_avx(FFTContext *s, FFTComplex *z) | |||
| { | |||
| const int nbits = s->nbits; | |||
| ff_fft_dispatch_interleave_avx(z, nbits); | |||
| } | |||
| #endif | |||
/* SSE entry point: runs the SSE interleaving dispatcher on z, then, when the
 * transform has at most 16 points, post-processes z in place with the inline
 * asm loop below.
 *   s  context; only s->nbits is read here
 *   z  data buffer, accessed with movaps (16-byte aligned accesses) */
| void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) | |||
| { | |||
| int n = 1 << s->nbits; | |||
| ff_fft_dispatch_interleave_sse(z, s->nbits); | |||
| if(n <= 16) { | |||
/* i is a negative byte offset from z+n, starting at -8*n and counting up to
 * zero in steps of 32 (presumably sizeof(FFTComplex) == 8, so the walk starts
 * at z itself - confirm against the type definition). */
| x86_reg i = -8*n; | |||
/* Each iteration interleaves the two 16-byte halves of a 32-byte group with
 * unpcklps/unpckhps and writes both halves back. */
| __asm__ volatile( | |||
| "1: \n" | |||
| "movaps (%0,%1), %%xmm0 \n" | |||
| "movaps %%xmm0, %%xmm1 \n" | |||
| "unpcklps 16(%0,%1), %%xmm0 \n" | |||
| "unpckhps 16(%0,%1), %%xmm1 \n" | |||
| "movaps %%xmm0, (%0,%1) \n" | |||
| "movaps %%xmm1, 16(%0,%1) \n" | |||
| "add $32, %0 \n" | |||
| "jl 1b \n" | |||
| :"+r"(i) | |||
| :"r"(z+n) | |||
| :"memory" | |||
| ); | |||
| } | |||
| } | |||
/* Reorders the n = 1 << s->nbits elements of z according to s->revtab:
 * element i is written to s->tmp_buf[s->revtab[i]], and the filled tmp_buf is
 * then copied back over z.
 * Elements are handled in pairs: one movaps loads 16 bytes starting at z[i],
 * and the low and high 8-byte halves are stored separately (presumably one
 * FFTComplex each - confirm sizeof(FFTComplex) == 8). */
| void ff_fft_permute_sse(FFTContext *s, FFTComplex *z) | |||
| { | |||
| int n = 1 << s->nbits; | |||
| int i; | |||
| for(i=0; i<n; i+=2) { | |||
/* low half -> tmp_buf[revtab[i]], high half -> tmp_buf[revtab[i+1]] */
| __asm__ volatile( | |||
| "movaps %2, %%xmm0 \n" | |||
| "movlps %%xmm0, %0 \n" | |||
| "movhps %%xmm0, %1 \n" | |||
| :"=m"(s->tmp_buf[s->revtab[i]]), | |||
| "=m"(s->tmp_buf[s->revtab[i+1]]) | |||
| :"m"(z[i]) | |||
| ); | |||
| } | |||
| memcpy(z, s->tmp_buf, n*sizeof(FFTComplex)); | |||
| } | |||
/* Full IMDCT built on top of the half transform.
 *   s       context; s->mdct_size and s->imdct_half are read
 *   output  destination buffer, accessed with movaps (16-byte aligned accesses)
 *   input   passed unchanged to s->imdct_half
 * First s->imdct_half() writes its result starting at output + n/4. The asm
 * loop then walks the region at output + n/4 downwards and the region at
 * output + 3*n/4 upwards, 16 bytes at a time, reversing the four floats of
 * each vector, negating the one read from the lower region, and storing them
 * at the mirrored offsets.
 * The asm stores through its pointer operands, so it declares a "memory"
 * clobber in addition to the XMM registers it uses; this keeps the compiler
 * from caching or reordering accesses to output around it. */
| void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input) | |||
| { | |||
| x86_reg j, k; | |||
| long n = s->mdct_size; | |||
| long n4 = n >> 2; | |||
| s->imdct_half(s, output + n4, input); | |||
/* j and k are used as byte offsets by the asm: j counts up from -n to 0,
 * k counts down from n-16. */
| j = -n; | |||
| k = n-16; | |||
| __asm__ volatile( | |||
| "movaps "MANGLE(ff_m1m1m1m1)", %%xmm7 \n" | |||
| "1: \n" | |||
| "movaps (%2,%1), %%xmm0 \n" | |||
| "movaps (%3,%0), %%xmm1 \n" | |||
| "shufps $0x1b, %%xmm0, %%xmm0 \n" | |||
| "shufps $0x1b, %%xmm1, %%xmm1 \n" | |||
| "xorps %%xmm7, %%xmm0 \n" | |||
| "movaps %%xmm1, (%3,%1) \n" | |||
| "movaps %%xmm0, (%2,%0) \n" | |||
| "sub $16, %1 \n" | |||
| "add $16, %0 \n" | |||
| "jl 1b \n" | |||
| :"+r"(j), "+r"(k) | |||
| :"r"(output+n4), "r"(output+n4*3) | |||
| :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm7",) "memory" | |||
| ); | |||
| } | |||