help in fixing the Win64 fate failures. Originally committed as revision 24922 to svn://svn.ffmpeg.org/ffmpeg/trunktags/n0.8
| @@ -31,10 +31,9 @@ MMX-OBJS-$(CONFIG_VP3_DECODER) += x86/vp3dsp_mmx.o \ | |||
| MMX-OBJS-$(CONFIG_VP5_DECODER) += x86/vp3dsp_mmx.o \ | |||
| x86/vp3dsp_sse2.o \ | |||
| x86/vp56dsp_init.o | |||
| YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp56dsp.o | |||
| MMX-OBJS-$(CONFIG_VP6_DECODER) += x86/vp3dsp_mmx.o \ | |||
| x86/vp3dsp_sse2.o \ | |||
| x86/vp6dsp_mmx.o \ | |||
| x86/vp6dsp_sse2.o \ | |||
| x86/vp56dsp_init.o | |||
| YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o | |||
| MMX-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp-init.o | |||
| @@ -0,0 +1,170 @@ | |||
| ;****************************************************************************** | |||
| ;* MMX/SSE2-optimized functions for the VP6 decoder | |||
| ;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> | |||
| ;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> | |||
| ;* | |||
| ;* This file is part of FFmpeg. | |||
| ;* | |||
| ;* FFmpeg is free software; you can redistribute it and/or | |||
| ;* modify it under the terms of the GNU Lesser General Public | |||
| ;* License as published by the Free Software Foundation; either | |||
| ;* version 2.1 of the License, or (at your option) any later version. | |||
| ;* | |||
| ;* FFmpeg is distributed in the hope that it will be useful, | |||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| ;* Lesser General Public License for more details. | |||
| ;* | |||
| ;* You should have received a copy of the GNU Lesser General Public | |||
| ;* License along with FFmpeg; if not, write to the Free Software | |||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| ;****************************************************************************** | |||
| %include "x86inc.asm" | |||
| %include "x86util.asm" | |||
| cextern pw_64 | |||
| SECTION .text | |||
| %macro DIAG4_MMX 6 | |||
| ; DIAG4_MMX base, off0, off1, off2, off3, dst | |||
| ; 4-tap filter over 8 pixels: dst[0..7] = clip((sum of tap[i] * weight[i]) + 64) >> 7), | |||
| ; where tap[i] is the 8 bytes at [base+off<i>]. | |||
| ; Expects m7 = 0 (used to widen bytes to words), m6 = the pw_64 rounding | |||
| ; constant, and the four splatted weights at [rsp+8*11] .. [rsp+8*14] | |||
| ; (as stored by SPLAT4REGS_MMX). | |||
| ; m0/m1/m2 accumulate pixels 0..3 (low halves), m3/m4/m5 pixels 4..7 (high halves). | |||
| movq m0, [%1+%2] | |||
| movq m1, [%1+%3] | |||
| movq m3, m0 | |||
| movq m4, m1 | |||
| punpcklbw m0, m7 | |||
| punpcklbw m1, m7 | |||
| punpckhbw m3, m7 | |||
| punpckhbw m4, m7 | |||
| pmullw m0, [rsp+8*11] ; src[x-8 ] * biweight [0] | |||
| pmullw m1, [rsp+8*12] ; src[x ] * biweight [1] | |||
| pmullw m3, [rsp+8*11] ; src[x-8 ] * biweight [0] | |||
| pmullw m4, [rsp+8*12] ; src[x ] * biweight [1] | |||
| paddw m0, m1 | |||
| paddw m3, m4 | |||
| movq m1, [%1+%4] | |||
| movq m2, [%1+%5] | |||
| movq m4, m1 | |||
| movq m5, m2 | |||
| punpcklbw m1, m7 | |||
| punpcklbw m2, m7 | |||
| punpckhbw m4, m7 ; high 4 pixels: must unpack the HIGH bytes, like m3/m4 above | |||
| punpckhbw m5, m7 ; high 4 pixels | |||
| pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2] | |||
| pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3] | |||
| pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2] | |||
| pmullw m5, [rsp+8*14] ; src[x+16] * biweight [3] | |||
| paddw m1, m2 | |||
| paddw m4, m5 | |||
| paddsw m0, m1 | |||
| paddsw m3, m4 | |||
| paddsw m0, m6 ; Add 64 | |||
| paddsw m3, m6 ; Add 64 | |||
| psraw m0, 7 | |||
| psraw m3, 7 | |||
| packuswb m0, m3 ; low 4 results from m0, high 4 results from m3 | |||
| movq [%6], m0 | |||
| %endmacro | |||
| %macro DIAG4_SSE2 6 | |||
| ; DIAG4_SSE2 base, off0, off1, off2, off3, dst | |||
| ; 4-tap filter over 8 pixels in one pass: each tap loads 8 bytes from | |||
| ; [base+off<i>], widens them to 8 words against m7 and multiplies by a | |||
| ; weight held in a register (tap 0: m4, tap 1: m5, tap 2: m6, tap 3: m3, | |||
| ; the layout produced by SPLAT4REGS_SSE2). | |||
| ; The sum gets [pw_64] added, is shifted right by 7, packed to bytes with | |||
| ; unsigned saturation and the low 8 bytes are stored at [dst]. | |||
| ; Expects m7 = 0; clobbers m0, m1, m2. | |||
| movq m0, [%1+%2] | |||
| movq m1, [%1+%3] | |||
| punpcklbw m0, m7 | |||
| punpcklbw m1, m7 | |||
| pmullw m0, m4 ; src[x-8 ] * biweight [0] | |||
| pmullw m1, m5 ; src[x ] * biweight [1] | |||
| paddw m0, m1 | |||
| movq m1, [%1+%4] | |||
| movq m2, [%1+%5] | |||
| punpcklbw m1, m7 | |||
| punpcklbw m2, m7 | |||
| pmullw m1, m6 ; src[x+8 ] * biweight [2] | |||
| pmullw m2, m3 ; src[x+16] * biweight [3] | |||
| paddw m1, m2 | |||
| paddsw m0, m1 | |||
| paddsw m0, [pw_64] ; Add 64 | |||
| psraw m0, 7 | |||
| packuswb m0, m0 | |||
| movq [%6], m0 | |||
| %endmacro | |||
| %macro SPLAT4REGS_MMX 0 | |||
| ; Input: m3 = four 16-bit weights w0 w1 w2 w3 (lowest word first). | |||
| ; Broadcasts each weight across all four words of a register and spills | |||
| ; the results to the stack slots that DIAG4_MMX multiplies by: | |||
| ; [rsp+8*11] = w0 x4, [rsp+8*12] = w1 x4, [rsp+8*13] = w2 x4, [rsp+8*14] = w3 x4. | |||
| ; Clobbers m3, m4, m5, m6. | |||
| movq m5, m3 | |||
| punpcklwd m3, m3 ; m3 = w0 w0 w1 w1 | |||
| movq m4, m3 | |||
| punpckldq m3, m3 ; m3 = w0 x4 | |||
| punpckhdq m4, m4 ; m4 = w1 x4 | |||
| punpckhwd m5, m5 ; m5 = w2 w2 w3 w3 | |||
| movq m6, m5 | |||
| punpckhdq m6, m6 ; m6 = w3 x4 | |||
| punpckldq m5, m5 ; m5 = w2 x4 | |||
| movq [rsp+8*11], m3 | |||
| movq [rsp+8*12], m4 | |||
| movq [rsp+8*13], m5 | |||
| movq [rsp+8*14], m6 | |||
| %endmacro | |||
| %macro SPLAT4REGS_SSE2 0 | |||
| ; Input: low 64 bits of m3 = four 16-bit weights w0 w1 w2 w3. | |||
| ; Broadcasts each weight across all eight words of its own register, | |||
| ; in the layout DIAG4_SSE2 multiplies by: | |||
| ; m4 = w0, m5 = w1, m6 = w2, m3 = w3. | |||
| ; pshuflw replicates the selected word across the low quadword; | |||
| ; punpcklqdq then copies that low quadword into the high quadword. | |||
| pshuflw m4, m3, 0x0 | |||
| pshuflw m5, m3, 0x55 | |||
| pshuflw m6, m3, 0xAA | |||
| pshuflw m3, m3, 0xFF | |||
| punpcklqdq m4, m4 | |||
| punpcklqdq m5, m5 | |||
| punpcklqdq m6, m6 | |||
| punpcklqdq m3, m3 | |||
| %endmacro | |||
| %macro vp6_filter_diag4 2 | |||
| ; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride, | |||
| ; const int16_t h_weights[4], const int16_t v_weights[4]) | |||
| ; | |||
| ; Macro arguments: %1 = function name suffix (mmx / sse2), | |||
| ; %2 = XMM register count handed to cglobal. | |||
| ; Register use: r0 = dst, r1 = src, r2 = stride, r3 = h_weights, | |||
| ; r4 = v_weights, r5 = saved rsp, r6 = loop counter. | |||
| ; | |||
| ; Two passes over an 8-byte-per-row temporary buffer on the stack: | |||
| ; 1. horizontal: 11 rows starting at src - stride, taps at byte offsets | |||
| ; -1, 0, 1, 2, written to [rsp + 8*row]; | |||
| ; 2. vertical: 8 rows starting at the second temporary row, taps at | |||
| ; offsets -8, 0, 8, 16 within the buffer, written to dst. | |||
| ; The MMX build reserves 4 extra 8-byte slots ([rsp+8*11] .. [rsp+8*14]) | |||
| ; for the splatted weights and keeps pw_64 in m6. | |||
| cglobal vp6_filter_diag4_%1, 5, 7, %2 | |||
| mov r5, rsp ; backup stack pointer | |||
| and rsp, ~(mmsize-1) ; align stack | |||
| %ifidn %1, sse2 | |||
| sub rsp, 8*11 | |||
| %else | |||
| sub rsp, 8*15 | |||
| movq m6, [pw_64] | |||
| %endif | |||
| %ifdef ARCH_X86_64 | |||
| ; stride is a 32-bit int: the upper half of r2 is undefined on x86-64, | |||
| ; so sign-extend it before it is used in 64-bit pointer arithmetic. | |||
| movsxd r2, r2d | |||
| %endif | |||
| sub r1, r2 ; start one row above src | |||
| pxor m7, m7 ; zero register used to widen bytes to words | |||
| movq m3, [r3] ; horizontal weights | |||
| SPLAT4REGS | |||
| mov r3, rsp ; r3 = write pointer into the temporary buffer | |||
| mov r6, 11 | |||
| .nextrow: | |||
| DIAG4 r1, -1, 0, 1, 2, r3 | |||
| add r3, 8 | |||
| add r1, r2 | |||
| dec r6 | |||
| jnz .nextrow | |||
| movq m3, [r4] ; vertical weights | |||
| SPLAT4REGS | |||
| lea r3, [rsp+8] ; r3 = second temporary row (tap -8 reads the first) | |||
| mov r6, 8 | |||
| .nextcol: | |||
| DIAG4 r3, -8, 0, 8, 16, r0 | |||
| add r3, 8 | |||
| add r0, r2 | |||
| dec r6 | |||
| jnz .nextcol | |||
| mov rsp, r5 ; restore stack pointer | |||
| RET | |||
| %endmacro | |||
| ; Instantiate the filter twice: DIAG4 / SPLAT4REGS are re-pointed at the | |||
| ; matching implementation before each expansion of vp6_filter_diag4. | |||
| ; MMX version (second macro argument 0: no XMM registers requested). | |||
| INIT_MMX | |||
| %define DIAG4 DIAG4_MMX | |||
| %define SPLAT4REGS SPLAT4REGS_MMX | |||
| vp6_filter_diag4 mmx, 0 | |||
| ; SSE2 version (second macro argument 8: XMM register count passed to cglobal). | |||
| INIT_XMM | |||
| %define DIAG4 DIAG4_SSE2 | |||
| %define SPLAT4REGS SPLAT4REGS_SSE2 | |||
| vp6_filter_diag4 sse2, 8 | |||
| @@ -23,11 +23,15 @@ | |||
| #include "libavutil/x86_cpu.h" | |||
| #include "libavcodec/dsputil.h" | |||
| #include "libavcodec/vp56dsp.h" | |||
| #include "vp6dsp_mmx.h" | |||
| #include "vp6dsp_sse2.h" | |||
| void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride, | |||
| const int16_t *h_weights,const int16_t *v_weights); | |||
| void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride, | |||
| const int16_t *h_weights,const int16_t *v_weights); | |||
| av_cold void ff_vp56dsp_init_x86(VP56DSPContext* c, enum CodecID codec) | |||
| { | |||
| #if HAVE_YASM | |||
| int mm_flags = mm_support(); | |||
| if (CONFIG_VP6_DECODER && codec == CODEC_ID_VP6) { | |||
| @@ -39,4 +43,5 @@ av_cold void ff_vp56dsp_init_x86(VP56DSPContext* c, enum CodecID codec) | |||
| c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2; | |||
| } | |||
| } | |||
| #endif | |||
| } | |||
| @@ -1,108 +0,0 @@ | |||
| /** | |||
| * @file | |||
| * MMX-optimized functions for the VP6 decoder | |||
| * | |||
| * Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #include "libavutil/x86_cpu.h" | |||
| #include "libavcodec/dsputil.h" | |||
| #include "dsputil_mmx.h" | |||
| #include "vp6dsp_mmx.h" | |||
| /* | |||
| * One 8-pixel step of the 4-tap filter, as an inline-asm fragment. | |||
| * Operands expected from the enclosing asm statement: | |||
| * %0 = source pointer, %1 = destination pointer, | |||
| * %3 = table of four 8-byte entries, each one weight repeated four times. | |||
| * (%2 is reserved for the loop counter of the enclosing statement.) | |||
| * in1..in4 are the byte offsets of the four taps relative to %0. | |||
| * Expects mm7 = 0 and mm6 = ff_pw_64. | |||
| */ | |||
| #define DIAG4_MMX(in1,in2,in3,in4) \ | |||
| "movq "#in1"(%0), %%mm0 \n\t" \ | |||
| "movq "#in2"(%0), %%mm1 \n\t" \ | |||
| "movq %%mm0, %%mm3 \n\t" \ | |||
| "movq %%mm1, %%mm4 \n\t" \ | |||
| "punpcklbw %%mm7, %%mm0 \n\t" \ | |||
| "punpcklbw %%mm7, %%mm1 \n\t" \ | |||
| "punpckhbw %%mm7, %%mm3 \n\t" \ | |||
| "punpckhbw %%mm7, %%mm4 \n\t" \ | |||
| "pmullw 0(%3), %%mm0 \n\t" /* src[x-8 ] * biweight [0] */ \ | |||
| "pmullw 8(%3), %%mm1 \n\t" /* src[x ] * biweight [1] */ \ | |||
| "pmullw 0(%3), %%mm3 \n\t" /* src[x-8 ] * biweight [0] */ \ | |||
| "pmullw 8(%3), %%mm4 \n\t" /* src[x ] * biweight [1] */ \ | |||
| "paddw %%mm1, %%mm0 \n\t" \ | |||
| "paddw %%mm4, %%mm3 \n\t" \ | |||
| "movq "#in3"(%0), %%mm1 \n\t" \ | |||
| "movq "#in4"(%0), %%mm2 \n\t" \ | |||
| "movq %%mm1, %%mm4 \n\t" \ | |||
| "movq %%mm2, %%mm5 \n\t" \ | |||
| "punpcklbw %%mm7, %%mm1 \n\t" \ | |||
| "punpcklbw %%mm7, %%mm2 \n\t" \ | |||
| "punpckhbw %%mm7, %%mm4 \n\t" \ | |||
| "punpckhbw %%mm7, %%mm5 \n\t" \ | |||
| "pmullw 16(%3), %%mm1 \n\t" /* src[x+8 ] * biweight [2] */ \ | |||
| "pmullw 24(%3), %%mm2 \n\t" /* src[x+16] * biweight [3] */ \ | |||
| "pmullw 16(%3), %%mm4 \n\t" /* src[x+8 ] * biweight [2] */ \ | |||
| "pmullw 24(%3), %%mm5 \n\t" /* src[x+16] * biweight [3] */ \ | |||
| "paddw %%mm2, %%mm1 \n\t" \ | |||
| "paddw %%mm5, %%mm4 \n\t" \ | |||
| "paddsw %%mm1, %%mm0 \n\t" \ | |||
| "paddsw %%mm4, %%mm3 \n\t" \ | |||
| "paddsw %%mm6, %%mm0 \n\t" /* Add 64 */ \ | |||
| "paddsw %%mm6, %%mm3 \n\t" /* Add 64 */ \ | |||
| "psraw $7, %%mm0 \n\t" \ | |||
| "psraw $7, %%mm3 \n\t" \ | |||
| "packuswb %%mm3, %%mm0 \n\t" \ | |||
| "movq %%mm0, (%1) \n\t" | |||
| /** | |||
| * 8x8 two-pass 4-tap filter (MMX). | |||
| * | |||
| * Pass 1 filters 11 rows horizontally, starting one row above src, into | |||
| * an 8-byte-per-row temporary buffer; pass 2 filters that buffer | |||
| * vertically and writes 8 rows to dst. | |||
| * | |||
| * @param dst destination, advanced by stride per output row | |||
| * @param src source, read from src - stride onwards | |||
| * @param stride row stride in bytes, used for both src and dst | |||
| * @param h_weights four weights for the horizontal pass | |||
| * @param v_weights four weights for the vertical pass | |||
| */ | |||
| void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride, | |||
| const int16_t *h_weights, const int16_t *v_weights) | |||
| { | |||
| uint8_t tmp[8*11], *t = tmp; | |||
| int16_t weights[4*4]; | |||
| int i; | |||
| int count; /* loop counter; passed as "+r" because the asm decrements it */ | |||
| src -= stride; | |||
| for (i=0; i<4*4; i++) | |||
| weights[i] = h_weights[i>>2]; | |||
| count = 11; | |||
| __asm__ volatile( | |||
| "pxor %%mm7, %%mm7 \n\t" | |||
| "movq "MANGLE(ff_pw_64)", %%mm6 \n\t" | |||
| "1: \n\t" | |||
| DIAG4_MMX(-1,0,1,2) | |||
| "add $8, %1 \n\t" | |||
| "add %4, %0 \n\t" | |||
| "decl %2 \n\t" | |||
| "jnz 1b \n\t" | |||
| : "+r"(src), "+r"(t), "+r"(count) | |||
| : "r"(weights), "r"((x86_reg)stride) | |||
| : "memory"); | |||
| t = tmp + 8; | |||
| for (i=0; i<4*4; i++) | |||
| weights[i] = v_weights[i>>2]; | |||
| count = 8; | |||
| __asm__ volatile( | |||
| "pxor %%mm7, %%mm7 \n\t" | |||
| "movq "MANGLE(ff_pw_64)", %%mm6 \n\t" | |||
| "1: \n\t" | |||
| DIAG4_MMX(-8,0,8,16) | |||
| "add $8, %0 \n\t" | |||
| "add %4, %1 \n\t" | |||
| "decl %2 \n\t" | |||
| "jnz 1b \n\t" | |||
| : "+r"(t), "+r"(dst), "+r"(count) | |||
| : "r"(weights), "r"((x86_reg)stride) | |||
| : "memory"); | |||
| } | |||
| @@ -1,30 +0,0 @@ | |||
| /* | |||
| * vp6dsp MMX function declarations | |||
| * Copyright (c) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #ifndef AVCODEC_X86_VP6DSP_MMX_H | |||
| #define AVCODEC_X86_VP6DSP_MMX_H | |||
| #include <stdint.h> | |||
| void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride, | |||
| const int16_t *h_weights,const int16_t *v_weights); | |||
| #endif /* AVCODEC_X86_VP6DSP_MMX_H */ | |||
| @@ -1,98 +0,0 @@ | |||
| /** | |||
| * @file | |||
| * SSE2-optimized functions for the VP6 decoder | |||
| * | |||
| * Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #include "libavutil/x86_cpu.h" | |||
| #include "libavcodec/dsputil.h" | |||
| #include "dsputil_mmx.h" | |||
| #include "vp6dsp_sse2.h" | |||
| /* | |||
| * One 8-pixel step of the 4-tap filter, as an inline-asm fragment. | |||
| * Operands expected from the enclosing asm statement: | |||
| * %0 = source pointer, %1 = destination pointer. | |||
| * in1..in4 are the byte offsets of the four taps relative to %0. | |||
| * Expects xmm7 = 0 and the four weights broadcast in | |||
| * xmm4 (tap 0), xmm5 (tap 1), xmm6 (tap 2) and xmm3 (tap 3). | |||
| * The last line carries no continuation backslash so the macro ends here. | |||
| */ | |||
| #define DIAG4_SSE2(in1,in2,in3,in4) \ | |||
| "movq "#in1"(%0), %%xmm0 \n\t" \ | |||
| "movq "#in2"(%0), %%xmm1 \n\t" \ | |||
| "punpcklbw %%xmm7, %%xmm0 \n\t" \ | |||
| "punpcklbw %%xmm7, %%xmm1 \n\t" \ | |||
| "pmullw %%xmm4, %%xmm0 \n\t" /* src[x-8 ] * biweight [0] */ \ | |||
| "pmullw %%xmm5, %%xmm1 \n\t" /* src[x ] * biweight [1] */ \ | |||
| "paddw %%xmm1, %%xmm0 \n\t" \ | |||
| "movq "#in3"(%0), %%xmm1 \n\t" \ | |||
| "movq "#in4"(%0), %%xmm2 \n\t" \ | |||
| "punpcklbw %%xmm7, %%xmm1 \n\t" \ | |||
| "punpcklbw %%xmm7, %%xmm2 \n\t" \ | |||
| "pmullw %%xmm6, %%xmm1 \n\t" /* src[x+8 ] * biweight [2] */ \ | |||
| "pmullw %%xmm3, %%xmm2 \n\t" /* src[x+16] * biweight [3] */ \ | |||
| "paddw %%xmm2, %%xmm1 \n\t" \ | |||
| "paddsw %%xmm1, %%xmm0 \n\t" \ | |||
| "paddsw "MANGLE(ff_pw_64)", %%xmm0 \n\t" /* Add 64 */ \ | |||
| "psraw $7, %%xmm0 \n\t" \ | |||
| "packuswb %%xmm0, %%xmm0 \n\t" \ | |||
| "movq %%xmm0, (%1) \n\t" | |||
| /** | |||
| * 8x8 two-pass 4-tap filter (SSE2). | |||
| * | |||
| * Pass 1 filters 11 rows horizontally, starting one row above src, into | |||
| * an 8-byte-per-row temporary buffer; pass 2 filters that buffer | |||
| * vertically and writes 8 rows to dst. Each asm statement first | |||
| * broadcasts the four 16-bit weights into xmm4, xmm5, xmm6 and xmm3. | |||
| * | |||
| * @param dst destination, advanced by stride per output row | |||
| * @param src source, read from src - stride onwards | |||
| * @param stride row stride in bytes, used for both src and dst | |||
| * @param h_weights four weights for the horizontal pass | |||
| * @param v_weights four weights for the vertical pass | |||
| */ | |||
| void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride, | |||
| const int16_t *h_weights,const int16_t *v_weights) | |||
| { | |||
| uint8_t tmp[8*11], *t = tmp; | |||
| int count = 11; /* loop counter; passed as "+r" because the asm decrements it */ | |||
| src -= stride; | |||
| __asm__ volatile( | |||
| "pxor %%xmm7, %%xmm7 \n\t" | |||
| "movq %4, %%xmm3 \n\t" | |||
| "pshuflw $0, %%xmm3, %%xmm4 \n\t" | |||
| "punpcklqdq %%xmm4, %%xmm4 \n\t" | |||
| "pshuflw $85, %%xmm3, %%xmm5 \n\t" | |||
| "punpcklqdq %%xmm5, %%xmm5 \n\t" | |||
| "pshuflw $170, %%xmm3, %%xmm6 \n\t" | |||
| "punpcklqdq %%xmm6, %%xmm6 \n\t" | |||
| "pshuflw $255, %%xmm3, %%xmm3 \n\t" | |||
| "punpcklqdq %%xmm3, %%xmm3 \n\t" | |||
| "1: \n\t" | |||
| DIAG4_SSE2(-1,0,1,2) | |||
| "add $8, %1 \n\t" | |||
| "add %3, %0 \n\t" | |||
| "decl %2 \n\t" | |||
| "jnz 1b \n\t" | |||
| : "+r"(src), "+r"(t), "+r"(count) | |||
| : "g"((x86_reg)stride), "m"(*(const int64_t*)h_weights) | |||
| : "memory"); | |||
| t = tmp + 8; | |||
| count = 8; | |||
| __asm__ volatile( | |||
| "movq %4, %%xmm3 \n\t" | |||
| "pshuflw $0, %%xmm3, %%xmm4 \n\t" | |||
| "punpcklqdq %%xmm4, %%xmm4 \n\t" | |||
| "pshuflw $85, %%xmm3, %%xmm5 \n\t" | |||
| "punpcklqdq %%xmm5, %%xmm5 \n\t" | |||
| "pshuflw $170, %%xmm3, %%xmm6 \n\t" | |||
| "punpcklqdq %%xmm6, %%xmm6 \n\t" | |||
| "pshuflw $255, %%xmm3, %%xmm3 \n\t" | |||
| "punpcklqdq %%xmm3, %%xmm3 \n\t" | |||
| "1: \n\t" | |||
| DIAG4_SSE2(-8,0,8,16) | |||
| "add $8, %0 \n\t" | |||
| "add %3, %1 \n\t" | |||
| "decl %2 \n\t" | |||
| "jnz 1b \n\t" | |||
| : "+r"(t), "+r"(dst), "+r"(count) | |||
| : "g"((x86_reg)stride), "m"(*(const int64_t*)v_weights) | |||
| : "memory"); | |||
| } | |||
| @@ -1,30 +0,0 @@ | |||
| /* | |||
| * vp6dsp SSE2 function declarations | |||
| * Copyright (c) 2009 Zuxy Meng <zuxy.meng@gmail.com> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #ifndef AVCODEC_X86_VP6DSP_SSE2_H | |||
| #define AVCODEC_X86_VP6DSP_SSE2_H | |||
| #include <stdint.h> | |||
| void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride, | |||
| const int16_t *h_weights,const int16_t *v_weights); | |||
| #endif /* AVCODEC_X86_VP6DSP_SSE2_H */ | |||