64-bit CPUs always have SSE available, so there is no need to compile in the 3DNow! functions. This results in smaller binaries.
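In outline, the change keeps the existing runtime CPU-flag dispatch but additionally gates the 3DNow! paths behind a compile-time architecture check, so 64-bit builds never emit or link the 3DNow! objects. A minimal C sketch of that pattern, with illustrative names (ARCH_X86_32 stands in for the build-system macro from config.h; the CPU_FLAG_* values and fft_calc_* stubs are hypothetical, not the Libav API):

#ifndef ARCH_X86_32             /* normally defined by the build system */
#define ARCH_X86_32 0
#endif

#define CPU_FLAG_3DNOW 0x01     /* hypothetical stand-ins for AV_CPU_FLAG_* */
#define CPU_FLAG_SSE   0x02

typedef void (*fft_fn)(float *z, int nbits);

static void fft_calc_3dnow(float *z, int nbits) { (void)z; (void)nbits; }
static void fft_calc_sse(float *z, int nbits)   { (void)z; (void)nbits; }

/* Pick the best implementation for the detected flags. The 3DNow!
 * candidate only exists in 32-bit builds; on x86-64 every CPU has SSE,
 * so the SSE path would win there anyway. */
static fft_fn select_fft(int cpu_flags)
{
    fft_fn fn = 0;
#if ARCH_X86_32
    if (cpu_flags & CPU_FLAG_3DNOW)
        fn = fft_calc_3dnow;
#endif
    if (cpu_flags & CPU_FLAG_SSE)
        fn = fft_calc_sse;
    return fn;
}

With ARCH_X86_32 defined as 0, the preprocessor drops the 3DNow! branch entirely, which is what makes the binaries smaller.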
@@ -38,8 +38,6 @@ YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
YASM-OBJS-$(CONFIG_DCT) += x86/dct32_sse.o
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_yasm.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOW) += x86/fft_3dn.o
YASM-OBJS-FFT-$(HAVE_AMD3DNOWEXT) += x86/fft_3dn2.o
YASM-OBJS-$(CONFIG_FFT) += x86/fft_mmx.o \
                           $(YASM-OBJS-FFT-yes)
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \

@@ -25,6 +25,7 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
{
#if HAVE_YASM
    int has_vectors = av_get_cpu_flags();
#if ARCH_X86_32
    if (has_vectors & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
        /* 3DNow! for K6-2/3 */
        s->imdct_calc = ff_imdct_calc_3dnow;
@@ -37,6 +38,7 @@ av_cold void ff_fft_init_mmx(FFTContext *s)
        s->imdct_half = ff_imdct_half_3dnow2;
        s->fft_calc = ff_fft_calc_3dnow2;
    }
#endif
    if (has_vectors & AV_CPU_FLAG_SSE && HAVE_SSE) {
        /* SSE for P3/P4/K8 */
        s->imdct_calc = ff_imdct_calc_sse;

@@ -1,23 +0,0 @@
/*
 * FFT/MDCT transform with 3DNow! optimizations
 * Copyright (c) 2008 Loren Merritt
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define EMULATE_3DNOWEXT
#include "fft_3dn2.c"
@@ -1,173 +0,0 @@
/*
 * FFT/MDCT transform with Extended 3DNow! optimizations
 * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "fft.h"

DECLARE_ALIGNED(8, static const unsigned int, m1m1)[2] = { 1U<<31, 1U<<31 };

#ifdef EMULATE_3DNOWEXT
#define PSWAPD(s,d)\
    "movq "#s","#d"\n"\
    "psrlq $32,"#d"\n"\
    "punpckldq "#s","#d"\n"
#define ff_fft_calc_3dnow2 ff_fft_calc_3dnow
#define ff_fft_dispatch_3dnow2 ff_fft_dispatch_3dnow
#define ff_fft_dispatch_interleave_3dnow2 ff_fft_dispatch_interleave_3dnow
#define ff_imdct_calc_3dnow2 ff_imdct_calc_3dnow
#define ff_imdct_half_3dnow2 ff_imdct_half_3dnow
#else
#define PSWAPD(s,d) "pswapd "#s","#d"\n"
#endif

void ff_fft_dispatch_3dnow2(FFTComplex *z, int nbits);
void ff_fft_dispatch_interleave_3dnow2(FFTComplex *z, int nbits);

void ff_fft_calc_3dnow2(FFTContext *s, FFTComplex *z)
{
    int n = 1<<s->nbits;
    int i;
    ff_fft_dispatch_interleave_3dnow2(z, s->nbits);
    __asm__ volatile("femms");
    if(n <= 8)
        for(i=0; i<n; i+=2)
            FFSWAP(FFTSample, z[i].im, z[i+1].re);
}

void ff_imdct_half_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n = s->mdct_size;
    long n2 = n >> 1;
    long n4 = n >> 2;
    long n8 = n >> 3;
    const uint16_t *revtab = s->revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    const FFTSample *in1, *in2;
    FFTComplex *z = (FFTComplex *)output;

    /* pre rotation */
    in1 = input;
    in2 = input + n2 - 1;
#ifdef EMULATE_3DNOWEXT
    __asm__ volatile("movd %0, %%mm7" ::"r"(1U<<31));
#endif
    for(k = 0; k < n4; k++) {
        // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
        __asm__ volatile(
            "movd %0, %%mm0 \n"
            "movd %2, %%mm1 \n"
            "punpckldq %1, %%mm0 \n"
            "punpckldq %3, %%mm1 \n"
            "movq %%mm0, %%mm2 \n"
            PSWAPD( %%mm1, %%mm3 )
            "pfmul %%mm1, %%mm0 \n"
            "pfmul %%mm3, %%mm2 \n"
#ifdef EMULATE_3DNOWEXT
            "movq %%mm0, %%mm1 \n"
            "punpckhdq %%mm2, %%mm0 \n"
            "punpckldq %%mm2, %%mm1 \n"
            "pxor %%mm7, %%mm0 \n"
            "pfadd %%mm1, %%mm0 \n"
#else
            "pfpnacc %%mm2, %%mm0 \n"
#endif
            ::"m"(in2[-2*k]), "m"(in1[2*k]),
              "m"(tcos[k]), "m"(tsin[k])
        );
        __asm__ volatile(
            "movq %%mm0, %0 \n\t"
            :"=m"(z[revtab[k]])
        );
    }

    ff_fft_dispatch_3dnow2(z, s->nbits);

#define CMUL(j,mm0,mm1)\
    "movq (%2,"#j",2), %%mm6 \n"\
    "movq 8(%2,"#j",2), "#mm0"\n"\
    "movq %%mm6, "#mm1"\n"\
    "movq "#mm0",%%mm7 \n"\
    "pfmul (%3,"#j"), %%mm6 \n"\
    "pfmul (%4,"#j"), "#mm0"\n"\
    "pfmul (%4,"#j"), "#mm1"\n"\
    "pfmul (%3,"#j"), %%mm7 \n"\
    "pfsub %%mm6, "#mm0"\n"\
    "pfadd %%mm7, "#mm1"\n"

    /* post rotation */
    j = -n2;
    k = n2-8;
    __asm__ volatile(
        "1: \n"
        CMUL(%0, %%mm0, %%mm1)
        CMUL(%1, %%mm2, %%mm3)
        "movd %%mm0, (%2,%0,2) \n"
        "movd %%mm1,12(%2,%1,2) \n"
        "movd %%mm2, (%2,%1,2) \n"
        "movd %%mm3,12(%2,%0,2) \n"
        "psrlq $32, %%mm0 \n"
        "psrlq $32, %%mm1 \n"
        "psrlq $32, %%mm2 \n"
        "psrlq $32, %%mm3 \n"
        "movd %%mm0, 8(%2,%0,2) \n"
        "movd %%mm1, 4(%2,%1,2) \n"
        "movd %%mm2, 8(%2,%1,2) \n"
        "movd %%mm3, 4(%2,%0,2) \n"
        "sub $8, %1 \n"
        "add $8, %0 \n"
        "jl 1b \n"
        :"+r"(j), "+r"(k)
        :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
        :"memory"
    );
    __asm__ volatile("femms");
}

void ff_imdct_calc_3dnow2(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    x86_reg j, k;
    long n = s->mdct_size;
    long n4 = n >> 2;

    ff_imdct_half_3dnow2(s, output+n4, input);

    j = -n;
    k = n-8;
    __asm__ volatile(
        "movq %4, %%mm7 \n"
        "1: \n"
        PSWAPD((%2,%1), %%mm0)
        PSWAPD((%3,%0), %%mm1)
        "pxor %%mm7, %%mm0 \n"
        "movq %%mm1, (%3,%1) \n"
        "movq %%mm0, (%2,%0) \n"
        "sub $8, %1 \n"
        "add $8, %0 \n"
        "jl 1b \n"
        :"+r"(j), "+r"(k)
        :"r"(output+n4), "r"(output+n4*3),
         "m"(*m1m1)
    );
    __asm__ volatile("femms");
}
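For readers unfamiliar with the instruction being emulated above: pswapd (3DNow!Ext) swaps the two 32-bit halves of a 64-bit MMX register, and the EMULATE_3DNOWEXT build of the deleted file replaced it with the movq/psrlq/punpckldq sequence in its PSWAPD macro. A small self-contained check of that equivalence, modeled on uint64_t (a hypothetical helper, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* Models the plain-3DNow! emulation of pswapd: swap the two 32-bit
 * halves of a 64-bit value using only baseline operations. */
static uint64_t pswapd_emulated(uint64_t s)
{
    uint64_t d = s;   /* movq      s, d                               */
    d >>= 32;         /* psrlq     $32, d: d.low = s.high, d.high = 0 */
    d |= s << 32;     /* punpckldq s, d:   d.high = s.low             */
    return d;
}

int main(void)
{
    assert(pswapd_emulated(0x1111222233334444ULL) == 0x3333444411112222ULL);
    return 0;
}

The same trick survives in the yasm code below as the PSWAPD macro, which expands to the real pswapd when cpuflag(3dnow2) is set and to a fallback sequence otherwise.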
@@ -29,6 +29,7 @@
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
%include "x86inc.asm"
%include "x86util.asm"
%if ARCH_X86_64
%define pointer resq
@@ -105,7 +106,7 @@ SECTION_TEXT
    pfadd %5, %4 ; {t6,t5}
    pxor %3, [ps_m1p1] ; {t8,t7}
    mova %6, %1
    pswapd %3, %3
    PSWAPD %3, %3
    pfadd %1, %5 ; {r0,i0}
    pfsub %6, %5 ; {r2,i2}
    mova %4, %2
@@ -396,7 +397,6 @@ fft32_interleave_avx:
%endif
INIT_XMM sse
%define movdqa movaps
align 16
fft4_avx:
@@ -469,8 +469,8 @@ fft8 %+ SUFFIX:
    mova Z(2), m2
    T2_3DN m4, m5, Z(4), Z(5)
    T2_3DN m6, m7, Z2(6), Z2(7)
    pswapd m0, m5
    pswapd m2, m7
    PSWAPD m0, m5
    PSWAPD m2, m7
    pxor m0, [ps_m1p1]
    pxor m2, [ps_m1p1]
    pfsub m5, m0
@@ -498,11 +498,11 @@ fft8 %+ SUFFIX:
    ret
%endmacro
INIT_MMX 3dnow2
FFT48_3DN
%macro pswapd 2
%ifidn %1, %2
%if ARCH_X86_32
%macro PSWAPD 2
%if cpuflag(3dnow2)
    pswapd %1, %2
%elifidn %1, %2
    movd [r0+12], %1
    punpckhdq %1, [r0+8]
%else
@@ -512,9 +512,12 @@ FFT48_3DN
%endif
%endmacro
INIT_MMX 3dnow
INIT_MMX 3dnow2
FFT48_3DN
INIT_MMX 3dnow
FFT48_3DN
%endif
%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
@@ -588,6 +591,7 @@ INIT_XMM sse
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0
%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
    mov r3d, [r0 + FFTContext.nbits]
    PUSH r1
@@ -597,23 +601,43 @@ cglobal fft_calc, 2,5,8
    FFT_DISPATCH _interleave %+ SUFFIX, r1
    POP rcx
    POP r4
    cmp rcx, 4
    cmp rcx, 3+(mmsize/16)
    jg .end
    mov r2, -1
    add rcx, 3
    shl r2, cl
    sub r4, r2
.loop
%if mmsize == 8
    PSWAPD m0, [r4 + r2 + 4]
    mova [r4 + r2 + 4], m0
%else
    movaps xmm0, [r4 + r2]
    movaps xmm1, xmm0
    unpcklps xmm0, [r4 + r2 + 16]
    unpckhps xmm1, [r4 + r2 + 16]
    movaps [r4 + r2], xmm0
    movaps [r4 + r2 + 16], xmm1
    add r2, 32
%endif
    add r2, mmsize*2
    jl .loop
.end:
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro
%if ARCH_X86_32
INIT_MMX 3dnow
FFT_CALC_FUNC
INIT_MMX 3dnow2
FFT_CALC_FUNC
%endif
INIT_XMM sse
FFT_CALC_FUNC
cglobal fft_permute, 2,7,1
    mov r4, [r0 + FFTContext.revtab]
@@ -648,6 +672,7 @@ cglobal fft_permute, 2,7,1
    jl .loopcopy
    REP_RET
%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
    mov r3d, [r0 + FFTContext.mdctsize]
    mov r4, [r0 + FFTContext.imdcthalf]
@@ -671,22 +696,45 @@ cglobal imdct_calc, 3,5,3
    POP r3
    lea r0, [r1 + 2*r3]
    mov r2, r3
    sub r3, 16
    sub r3, mmsize
    neg r2
    movaps xmm2, [ps_m1m1m1m1]
    mova m2, [ps_m1m1m1m1]
.loop:
    movaps xmm0, [r1 + r3]
    movaps xmm1, [r0 + r2]
    shufps xmm0, xmm0, 0x1b
    shufps xmm1, xmm1, 0x1b
    xorps xmm0, xmm2
    movaps [r0 + r3], xmm1
    movaps [r1 + r2], xmm0
    sub r3, 16
    add r2, 16
%if mmsize == 8
    PSWAPD m0, [r1 + r3]
    PSWAPD m1, [r0 + r2]
    pxor m0, m2
%else
    mova m0, [r1 + r3]
    mova m1, [r0 + r2]
    shufps m0, m0, 0x1b
    shufps m1, m1, 0x1b
    xorps m0, m2
%endif
    mova [r0 + r3], m1
    mova [r1 + r2], m0
    sub r3, mmsize
    add r2, mmsize
    jl .loop
%if cpuflag(3dnow)
    femms
    RET
%else
    REP_RET
%endif
%endmacro
%if ARCH_X86_32
INIT_MMX 3dnow
IMDCT_CALC_FUNC
INIT_MMX 3dnow2
IMDCT_CALC_FUNC
%endif
INIT_XMM sse
IMDCT_CALC_FUNC
%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
@@ -697,6 +745,7 @@ DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnow2 pass_3dnow
%define pass_interleave_3dnow2 pass_interleave_3dnow
%endif
%ifdef PIC
%define SECTION_REL - $$
@@ -760,12 +809,14 @@ DECL_FFT 6, _interleave
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_FFT 4
DECL_FFT 4, _interleave
INIT_MMX 3dnow2
DECL_FFT 4
DECL_FFT 4, _interleave
%endif
INIT_XMM sse
%undef mulps
@@ -775,6 +826,37 @@ INIT_XMM sse
%undef unpckhps
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
    PSWAPD m0, [%3+%2*4]
    movq m2, [%3+%1*4-8]
    movq m3, m0
    punpckldq m0, m2
    punpckhdq m2, m3
    movd m1, [%4+%1*2-4] ; tcos[j]
    movd m3, [%4+%2*2] ; tcos[n4-j-1]
    punpckldq m1, [%5+%1*2-4] ; tsin[j]
    punpckldq m3, [%5+%2*2] ; tsin[n4-j-1]
    mova m4, m0
    PSWAPD m5, m1
    pfmul m0, m1
    pfmul m4, m5
    mova m6, m2
    PSWAPD m5, m3
    pfmul m2, m3
    pfmul m6, m5
%if cpuflag(3dnow2)
    pfpnacc m0, m4
    pfpnacc m2, m6
%else
    SBUTTERFLY dq, 0, 4, 1
    SBUTTERFLY dq, 2, 6, 3
    pxor m4, m7
    pxor m6, m7
    pfadd m0, m4
    pfadd m2, m6
%endif
%else
    movaps xmm0, [%3+%2*4]
    movaps xmm1, [%3+%1*4-0x10]
    movaps xmm2, xmm0
@@ -795,6 +877,7 @@ INIT_XMM sse
    movaps xmm0, xmm1
    unpcklps xmm1, xmm2
    unpckhps xmm0, xmm2
%endif
%endmacro
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
@@ -863,6 +946,40 @@ INIT_XMM sse
    jl .post
%endmacro
%macro CMUL_3DNOW 6
    mova m6, [%1+%2*2]
    mova %3, [%1+%2*2+8]
    mova %4, m6
    mova m7, %3
    pfmul m6, [%5+%2]
    pfmul %3, [%6+%2]
    pfmul %4, [%6+%2]
    pfmul m7, [%5+%2]
    pfsub %3, m6
    pfadd %4, m7
%endmacro
%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
    CMUL_3DNOW %3, %1, m0, m1, %4, %5
    CMUL_3DNOW %3, %2, m2, m3, %4, %5
    movd [%3+%1*2+ 0], m0
    movd [%3+%2*2+12], m1
    movd [%3+%2*2+ 0], m2
    movd [%3+%1*2+12], m3
    psrlq m0, 32
    psrlq m1, 32
    psrlq m2, 32
    psrlq m3, 32
    movd [%3+%1*2+ 8], m0
    movd [%3+%2*2+ 4], m1
    movd [%3+%2*2+ 8], m2
    movd [%3+%1*2+ 4], m3
    sub %2, 8
    add %1, 8
    jl .post
%endmacro
%macro DECL_IMDCT 1
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
@@ -892,22 +1009,34 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
    push rrevtab
%endif
    sub r3, 4
%if ARCH_X86_64
    sub r3, mmsize/4
%if ARCH_X86_64 || mmsize == 8
    xor r4, r4
    sub r4, r3
%endif
%if notcpuflag(3dnow2) && mmsize == 8
    movd m7, [ps_m1m1m1m1]
%endif
.pre:
%if ARCH_X86_64 == 0
    ;unspill
%if mmsize != 8
    xor r4, r4
    sub r4, r3
    mov rtsin, [esp+4]
%endif
    mov rtcos, [esp+8]
    mov rtsin, [esp+4]
%endif
    PREROTATER r4, r3, r2, rtcos, rtsin
%if ARCH_X86_64
%if mmsize == 8
    mov r6, [esp] ; rrevtab = ptr+n8
    movzx r5, word [rrevtab+r4-2] ; rrevtab[j]
    movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1]
    mova [r1+r5*8], m0
    mova [r1+r6*8], m2
    add r4, 2
%elif ARCH_X86_64
    movzx r5, word [rrevtab+r4-4]
    movzx r6, word [rrevtab+r4-2]
    movzx r10, word [rrevtab+r3]
@@ -928,7 +1057,7 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
    movlps [r1+r5*8], xmm1
    movhps [r1+r4*8], xmm1
%endif
    sub r3, 4
    sub r3, mmsize/4
    jns .pre
    mov r5, r0
@@ -953,12 +1082,23 @@ cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *i
    %1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
    add esp, 12
%endif
%if mmsize == 8
    femms
%endif
    RET
%endmacro
DECL_IMDCT POSROTATESHUF
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_IMDCT POSROTATESHUF_3DNOW
INIT_MMX 3dnow2
DECL_IMDCT POSROTATESHUF_3DNOW
%endif
INIT_YMM avx
%if HAVE_AVX