--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -62,6 +62,7 @@ YASM-OBJS-$(CONFIG_RV40_DECODER)       += x86/rv34dsp.o \
                                           x86/rv40dsp.o
 YASM-OBJS-$(CONFIG_VC1_DECODER)        += x86/vc1dsp.o
 YASM-OBJS-$(CONFIG_VIDEODSP)           += x86/videodsp.o
+YASM-OBJS-$(CONFIG_VORBIS_DECODER)     += x86/vorbisdsp.o
 YASM-OBJS-$(CONFIG_VP3DSP)             += x86/vp3dsp.o
 YASM-OBJS-$(CONFIG_VP6_DECODER)        += x86/vp56dsp.o
 YASM-OBJS-$(CONFIG_VP8_DECODER)        += x86/vp8dsp.o
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -38,9 +38,6 @@
 DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
 DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
 
-DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
-    { 0x8000000080000000ULL, 0x8000000080000000ULL };
-
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)  = { 0x0001000100010001ULL, 0x0001000100010001ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)  = { 0x0002000200020002ULL, 0x0002000200020002ULL };
 DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)  = { 0x0003000300030003ULL, 0x0003000300030003ULL };
--- a/libavcodec/x86/dsputil_mmx.h
+++ b/libavcodec/x86/dsputil_mmx.h
@@ -31,8 +31,6 @@ typedef struct xmm_reg { uint64_t a, b; } xmm_reg;
 extern const uint64_t ff_bone;
 extern const uint64_t ff_wtwo;
 
-extern const uint64_t ff_pdw_80000000[2];
-
 extern const xmm_reg  ff_pw_3;
 extern const xmm_reg  ff_pw_4;
 extern const xmm_reg  ff_pw_5;
--- /dev/null
+++ b/libavcodec/x86/vorbisdsp.asm
@@ -0,0 +1,83 @@
+;******************************************************************************
+;* Vorbis x86 optimizations
+;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pdw_80000000: times 4 dd 0x80000000
+
+SECTION .text
+
+%if ARCH_X86_32
+INIT_MMX 3dnow
+cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
+    pxor                m7, m7
+    lea               magq, [magq+block_sizeq*4]
+    lea               angq, [angq+block_sizeq*4]
+    neg        block_sizeq
+.loop:
+    mova                m0, [magq+block_sizeq*4]
+    mova                m1, [angq+block_sizeq*4]
+    mova                m2, m0
+    mova                m3, m1
+    pfcmpge             m2, m7     ; m <= 0.0
+    pfcmpge             m3, m7     ; a <= 0.0
+    pslld               m2, 31     ; keep only the sign bit
+    pxor                m1, m2
+    mova                m4, m3
+    pand                m3, m1
+    pandn               m4, m1
+    pfadd               m3, m0     ; a = m + ((a < 0) & (a ^ sign(m)))
+    pfsub               m0, m4     ; m = m + ((a > 0) & (a ^ sign(m)))
+    mova [angq+block_sizeq*4], m3
+    mova [magq+block_sizeq*4], m0
+    add        block_sizeq, 2
+    jl .loop
+    femms
+    RET
+%endif
+
+INIT_XMM sse
+cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr
+    mova                m5, [pdw_80000000]
+    xor              cntrq, cntrq
+align 16
+.loop:
+    mova                m0, [magq+cntrq*4]
+    mova                m1, [angq+cntrq*4]
+    xorps               m2, m2
+    xorps               m3, m3
+    cmpleps             m2, m0     ; m <= 0.0
+    cmpleps             m3, m1     ; a <= 0.0
+    andps               m2, m5     ; keep only the sign bit
+    xorps               m1, m2
+    mova                m4, m3
+    andps               m3, m1
+    andnps              m4, m1
+    addps               m3, m0     ; a = m + ((a < 0) & (a ^ sign(m)))
+    subps               m0, m4     ; m = m + ((a > 0) & (a ^ sign(m)))
+    mova  [angq+cntrq*4], m3
+    mova  [magq+cntrq*4], m0
+    add              cntrq, 4
+    cmp              cntrq, block_sizeq
+    jl .loop
+    RET
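
Review aid, not part of the patch: a scalar C sketch of the transform both SIMD versions implement. The function name is illustrative; the branchy form mirrors the C fallback in libavcodec/vorbisdsp.c. One subtlety worth flagging: cmpleps/pfcmpge against a zeroed register actually produces a "value >= 0.0" mask, so the "m <= 0.0" comments above are inverted relative to the operation performed; the sketch follows the actual behavior and matches the SIMD output except possibly when mag[i] is exactly 0.0.

#include <stdint.h>

/* Scalar reference for vorbis_inverse_coupling(): per element, reconstruct
 * (mag, ang) from the magnitude/angle coupling, branching on the sign
 * quadrant.  Equivalent to the SIMD paths above for mag[i] != 0.0. */
static void vorbis_inverse_coupling_scalar(float *mag, float *ang,
                                           intptr_t blocksize)
{
    for (intptr_t i = 0; i < blocksize; i++) {
        if (mag[i] > 0.0f) {
            if (ang[i] > 0.0f) {
                ang[i]  = mag[i] - ang[i];   /* m > 0, a > 0  */
            } else {
                float t = ang[i];
                ang[i]  = mag[i];            /* m > 0, a <= 0 */
                mag[i] += t;
            }
        } else {
            if (ang[i] > 0.0f) {
                ang[i] += mag[i];            /* m <= 0, a > 0  */
            } else {
                float t = ang[i];
                ang[i]  = mag[i];            /* m <= 0, a <= 0 */
                mag[i] -= t;
            }
        }
    }
}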
--- a/libavcodec/x86/vorbisdsp_init.c
+++ b/libavcodec/x86/vorbisdsp_init.c
@@ -21,83 +21,22 @@
 #include "config.h"
 #include "libavutil/cpu.h"
 #include "libavcodec/vorbisdsp.h"
-#include "dsputil_mmx.h" // for ff_pdw_80000000
 
-#if HAVE_INLINE_ASM
-#if ARCH_X86_32
-static void vorbis_inverse_coupling_3dnow(float *mag, float *ang,
-                                          intptr_t blocksize)
-{
-    int i;
-    __asm__ volatile ("pxor %%mm7, %%mm7":);
-    for (i = 0; i < blocksize; i += 2) {
-        __asm__ volatile (
-            "movq       %0, %%mm0   \n\t"
-            "movq       %1, %%mm1   \n\t"
-            "movq    %%mm0, %%mm2   \n\t"
-            "movq    %%mm1, %%mm3   \n\t"
-            "pfcmpge %%mm7, %%mm2   \n\t" // m <= 0.0
-            "pfcmpge %%mm7, %%mm3   \n\t" // a <= 0.0
-            "pslld     $31, %%mm2   \n\t" // keep only the sign bit
-            "pxor    %%mm2, %%mm1   \n\t"
-            "movq    %%mm3, %%mm4   \n\t"
-            "pand    %%mm1, %%mm3   \n\t"
-            "pandn   %%mm1, %%mm4   \n\t"
-            "pfadd   %%mm0, %%mm3   \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
-            "pfsub   %%mm4, %%mm0   \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
-            "movq    %%mm3, %1      \n\t"
-            "movq    %%mm0, %0      \n\t"
-            : "+m"(mag[i]), "+m"(ang[i])
-            :: "memory"
-        );
-    }
-    __asm__ volatile ("femms");
-}
-#endif
-
-static void vorbis_inverse_coupling_sse(float *mag, float *ang,
-                                        intptr_t blocksize)
-{
-    int i;
-
-    __asm__ volatile (
-        "movaps  %0, %%xmm5 \n\t"
-        :: "m"(ff_pdw_80000000[0])
-    );
-    for (i = 0; i < blocksize; i += 4) {
-        __asm__ volatile (
-            "movaps      %0, %%xmm0  \n\t"
-            "movaps      %1, %%xmm1  \n\t"
-            "xorps   %%xmm2, %%xmm2  \n\t"
-            "xorps   %%xmm3, %%xmm3  \n\t"
-            "cmpleps %%xmm0, %%xmm2  \n\t" // m <= 0.0
-            "cmpleps %%xmm1, %%xmm3  \n\t" // a <= 0.0
-            "andps   %%xmm5, %%xmm2  \n\t" // keep only the sign bit
-            "xorps   %%xmm2, %%xmm1  \n\t"
-            "movaps  %%xmm3, %%xmm4  \n\t"
-            "andps   %%xmm1, %%xmm3  \n\t"
-            "andnps  %%xmm1, %%xmm4  \n\t"
-            "addps   %%xmm0, %%xmm3  \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
-            "subps   %%xmm4, %%xmm0  \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
-            "movaps  %%xmm3, %1      \n\t"
-            "movaps  %%xmm0, %0      \n\t"
-            : "+m"(mag[i]), "+m"(ang[i])
-            :: "memory"
-        );
-    }
-}
-#endif
+void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang,
+                                      intptr_t blocksize);
+void ff_vorbis_inverse_coupling_sse(float *mag, float *ang,
+                                    intptr_t blocksize);
 
 void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
 {
-#if HAVE_INLINE_ASM
+#if HAVE_YASM
     int mm_flags = av_get_cpu_flags();
 
 #if ARCH_X86_32
     if (mm_flags & AV_CPU_FLAG_3DNOW)
-        dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+        dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow;
 #endif /* ARCH_X86_32 */
     if (mm_flags & AV_CPU_FLAG_SSE)
-        dsp->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
-#endif /* HAVE_INLINE_ASM */
+        dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse;
+#endif /* HAVE_YASM */
 }
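
For context, a minimal sketch of how this dispatch is consumed. The wiring below is an assumption from libavcodec/vorbisdsp.c, where ff_vorbisdsp_init() installs the C version before handing off to the per-arch init; the caller function here is hypothetical.

#include "libavcodec/vorbisdsp.h"

/* Hypothetical call site: the decoder never names the SIMD symbols directly;
 * it always goes through the function pointer selected at init time. */
static void apply_inverse_coupling(float *mag, float *ang, intptr_t blocksize)
{
    VorbisDSPContext dsp;
    ff_vorbisdsp_init(&dsp);  /* C fallback, then ff_vorbisdsp_init_x86() may override */
    dsp.vorbis_inverse_coupling(mag, ang, blocksize);
}

Note that the SSE version loads with mova and steps four floats per iteration, so callers are expected to supply 16-byte-aligned buffers and a blocksize that is a multiple of four; Vorbis block sizes are powers of two, which satisfies this.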