These smaller samples do not need to be unpacked to double words, which allows the code to process more pixels per iteration (still 2 in MMX, but 6 in SSE2). It also avoids emulating the missing double-word instructions on older instruction sets.

As with the previous code for 16-bit samples, this has been tested on an Athlon64 and a Core2Quad.

Athlon64:
1809275 decicycles in C, 32718 runs, 50 skips
911675 decicycles in mmx, 32727 runs, 41 skips, 2.0x faster
495284 decicycles in sse2, 32747 runs, 21 skips, 3.7x faster

Core2Quad:
921363 decicycles in C, 32756 runs, 12 skips
486537 decicycles in mmx, 32764 runs, 4 skips, 1.9x faster
293296 decicycles in sse2, 32759 runs, 9 skips, 3.1x faster
284910 decicycles in ssse3, 32759 runs, 9 skips, 3.2x faster

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>

tags/n2.0
| @@ -5,4 +5,4 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o | |||||
| YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o | YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o | ||||
| YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o | YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o | ||||
| YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o | |||||
| YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o | |||||
| @@ -49,6 +49,16 @@ void ff_yadif_filter_line_16bit_sse4(void *dst, void *prev, void *cur, | |||||
| void *next, int w, int prefs, | void *next, int w, int prefs, | ||||
| int mrefs, int parity, int mode); | int mrefs, int parity, int mode); | ||||
| void ff_yadif_filter_line_10bit_mmxext(void *dst, void *prev, void *cur, | |||||
| void *next, int w, int prefs, | |||||
| int mrefs, int parity, int mode); | |||||
| void ff_yadif_filter_line_10bit_sse2(void *dst, void *prev, void *cur, | |||||
| void *next, int w, int prefs, | |||||
| int mrefs, int parity, int mode); | |||||
| void ff_yadif_filter_line_10bit_ssse3(void *dst, void *prev, void *cur, | |||||
| void *next, int w, int prefs, | |||||
| int mrefs, int parity, int mode); | |||||
| av_cold void ff_yadif_init_x86(YADIFContext *yadif) | av_cold void ff_yadif_init_x86(YADIFContext *yadif) | ||||
| { | { | ||||
| int cpu_flags = av_get_cpu_flags(); | int cpu_flags = av_get_cpu_flags(); | ||||
| @@ -56,7 +66,7 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif) | |||||
| : yadif->csp->comp[0].depth_minus1 + 1; | : yadif->csp->comp[0].depth_minus1 + 1; | ||||
| #if HAVE_YASM | #if HAVE_YASM | ||||
| if (bit_depth > 8) { | |||||
| if (bit_depth >= 15) { | |||||
| #if ARCH_X86_32 | #if ARCH_X86_32 | ||||
| if (EXTERNAL_MMXEXT(cpu_flags)) | if (EXTERNAL_MMXEXT(cpu_flags)) | ||||
| yadif->filter_line = ff_yadif_filter_line_16bit_mmxext; | yadif->filter_line = ff_yadif_filter_line_16bit_mmxext; | ||||
| @@ -67,6 +77,15 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif) | |||||
| yadif->filter_line = ff_yadif_filter_line_16bit_ssse3; | yadif->filter_line = ff_yadif_filter_line_16bit_ssse3; | ||||
| if (EXTERNAL_SSE4(cpu_flags)) | if (EXTERNAL_SSE4(cpu_flags)) | ||||
| yadif->filter_line = ff_yadif_filter_line_16bit_sse4; | yadif->filter_line = ff_yadif_filter_line_16bit_sse4; | ||||
| } else if ( bit_depth >= 9 && bit_depth <= 14) { | |||||
| #if ARCH_X86_32 | |||||
| if (EXTERNAL_MMXEXT(cpu_flags)) | |||||
| yadif->filter_line = ff_yadif_filter_line_10bit_mmxext; | |||||
| #endif /* ARCH_X86_32 */ | |||||
| if (EXTERNAL_SSE2(cpu_flags)) | |||||
| yadif->filter_line = ff_yadif_filter_line_10bit_sse2; | |||||
| if (EXTERNAL_SSSE3(cpu_flags)) | |||||
| yadif->filter_line = ff_yadif_filter_line_10bit_ssse3; | |||||
| } else { | } else { | ||||
| #if ARCH_X86_32 | #if ARCH_X86_32 | ||||
| if (EXTERNAL_MMXEXT(cpu_flags)) | if (EXTERNAL_MMXEXT(cpu_flags)) | ||||
| @@ -0,0 +1,284 @@ | |||||
| ;***************************************************************************** | |||||
| ;* x86-optimized functions for yadif filter | |||||
| ;* | |||||
| ;* Copyright (C) 2006 Michael Niedermayer <michaelni@gmx.at> | |||||
| ;* Copyright (c) 2013 Daniel Kang <daniel.d.kang@gmail.com> | |||||
| ;* Copyright (c) 2011-2013 James Darnley <james.darnley@gmail.com> | |||||
| ;* | |||||
| ;* This file is part of FFmpeg. | |||||
| ;* | |||||
| ;* FFmpeg is free software; you can redistribute it and/or modify | |||||
| ;* it under the terms of the GNU General Public License as published by | |||||
| ;* the Free Software Foundation; either version 2 of the License, or | |||||
| ;* (at your option) any later version. | |||||
| ;* | |||||
| ;* FFmpeg is distributed in the hope that it will be useful, | |||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||||
| ;* GNU General Public License for more details. | |||||
| ;* | |||||
| ;* You should have received a copy of the GNU General Public License along | |||||
| ;* with FFmpeg; if not, write to the Free Software Foundation, Inc., | |||||
| ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | |||||
| ;****************************************************************************** | |||||
| %include "libavutil/x86/x86util.asm" | |||||
| SECTION_RODATA | |||||
| pw_1: times 8 dw 1 | |||||
| SECTION .text | |||||
; PABS dst, tmp
; Replaces every signed 16-bit lane of %1 with its absolute value.
; With SSSE3 a single pabsw is used and %2 is left untouched; otherwise %2 is
; clobbered as a scratch register.
| %macro PABS 2 | |||||
| %if cpuflag(ssse3) | |||||
| pabsw %1, %1 | |||||
| %else | |||||
; Fallback: %2 = (0 > %1) gives an all-ones mask in negative lanes; the
; xor/subtract pair then computes (x ^ mask) - mask, i.e. -x where the mask
; is set and x elsewhere.
| pxor %2, %2 | |||||
| pcmpgtw %2, %1 | |||||
| pxor %1, %2 | |||||
| psubw %1, %2 | |||||
| %endif | |||||
| %endmacro | |||||
; PMAXUW dst, src
; Per-lane unsigned 16-bit maximum of %1 and %2, result in %1.
; NOTE(review): no sse4 variant of this file is instantiated in the visible
; code, so only the fallback path appears to be assembled here.
| %macro PMAXUW 2 | |||||
| %if cpuflag(sse4) | |||||
| pmaxuw %1, %2 | |||||
| %else | |||||
; Fallback: the saturating subtract leaves max(%1 - %2, 0); adding %2 back
; yields max(%1, %2).
| psubusw %1, %2 | |||||
| paddusw %1, %2 | |||||
| %endif | |||||
| %endmacro | |||||
; CHECK off1, off0
; Evaluates one candidate direction for the spatial prediction.
; %1 and %2 are offsets in samples (scaled by 2 bytes) applied to the
; cur+t1 row and the cur+t0 row respectively.
; On exit:
;   m5 = average of the two loaded rows, rounded down, shifted down one lane
;   m2 = per-lane sum of three horizontally adjacent absolute differences
;        between the two rows (the score of this candidate)
; Clobbers m3 and m4.
| %macro CHECK 2 | |||||
| movu m2, [curq+t1+%1*2] | |||||
| movu m3, [curq+t0+%2*2] | |||||
| mova m4, m2 | |||||
| mova m5, m2 | |||||
; pavgw rounds up; subtracting the low bit of (a ^ b) turns it into the
; rounded-down average.
| pxor m4, m3 | |||||
| pavgw m5, m3 | |||||
| pand m4, [pw_1] | |||||
| psubusw m5, m4 | |||||
; Shift the averaged candidate down by one 16-bit lane.
| %if mmsize == 16 | |||||
| psrldq m5, 2 | |||||
| %else | |||||
| psrlq m5, 16 | |||||
| %endif | |||||
; m2 = |m2 - m3| as unsigned words: one of the two saturating differences is
; zero, the other is the magnitude.
| mova m4, m2 | |||||
| psubusw m2, m3 | |||||
| psubusw m3, m4 | |||||
| PMAXUW m2, m3 | |||||
; Add the differences from lanes i, i+1 and i+2 into lane i.
| mova m3, m2 | |||||
| mova m4, m2 | |||||
| %if mmsize == 16 | |||||
| psrldq m3, 2 | |||||
| psrldq m4, 4 | |||||
| %else | |||||
| psrlq m3, 16 | |||||
| psrlq m4, 32 | |||||
| %endif | |||||
| paddw m2, m3 | |||||
| paddw m2, m4 | |||||
| %endmacro | |||||
; CHECK1
; Takes the candidate produced by CHECK (score in m2, prediction in m5) and
; keeps it wherever its score is lower (signed compare) than the best score
; so far.
;   in/out: m0 = best score, m1 = best prediction
;   out:    m6 = mask of lanes where the candidate won (consumed by CHECK2)
; Clobbers m3 and m5.
| %macro CHECK1 0 | |||||
| mova m3, m0 | |||||
| pcmpgtw m3, m2 | |||||
| pminsw m0, m2 | |||||
| mova m6, m3 | |||||
; m1 = mask ? m5 : m1
| pand m5, m3 | |||||
| pandn m3, m1 | |||||
| por m3, m5 | |||||
| mova m1, m3 | |||||
| %endmacro | |||||
| ; %macro CHECK2 0 | |||||
| ; paddw m6, [pw_1] | |||||
| ; psllw m6, 14 | |||||
| ; paddsw m2, m6 | |||||
| ; mova m3, m0 | |||||
| ; pcmpgtw m3, m2 | |||||
| ; pminsw m0, m2 | |||||
| ; pand m5, m3 | |||||
| ; pandn m3, m1 | |||||
| ; por m3, m5 | |||||
| ; mova m1, m3 | |||||
| ; %endmacro | |||||
| ; This version of CHECK2 is required for 14-bit samples. The left-shift trick | |||||
| ; in the old code is not large enough to correctly select pixels or scores. | |||||
; CHECK2
; Like CHECK1, but the candidate (score m2, prediction m5) is only accepted
; in lanes where the mask in m6 is also set, i.e. where the preceding CHECK1
; accepted its candidate.
;   in:     m6 = mask from CHECK1
;   in/out: m0 = best score, m1 = best prediction
; Clobbers m2, m3, m5 and m6.
| %macro CHECK2 0 | |||||
| mova m3, m0 | |||||
; m0 = m6 = (best score > candidate score) & previous mask
| pcmpgtw m0, m2 | |||||
| pand m0, m6 | |||||
| mova m6, m0 | |||||
; Two interleaved selects on the same mask:
;   m1 = mask ? m5 : m1     (prediction, built in m6)
;   m0 = mask ? m2 : old m0 (score, old value kept in m3)
| pand m5, m6 | |||||
| pand m2, m0 | |||||
| pandn m6, m1 | |||||
| pandn m0, m3 | |||||
| por m6, m5 | |||||
| por m0, m2 | |||||
| mova m1, m6 | |||||
| %endmacro | |||||
; LOAD n, mem
; Unaligned load of one full vector from memory operand %2 into register m<n>.
| %macro LOAD 2 | |||||
| movu m%1, %2 | |||||
| %endmacro | |||||
; FILTER suffix, ptrA, ptrB
; Main per-line loop of the filter.
;   %1      label suffix, so the macro can be expanded twice in one function
;   %2, %3  the two pointer registers whose samples are averaged to form the
;           temporal prediction (YADIF passes prevq/curq or curq/nextq)
; t0 and t1 are the byte offsets set up by YADIF from the prefs and mrefs
; arguments.
; Stack slots used:
;   [rsp+ 0] = samples at cur+t1
;   [rsp+16] = (%2 + %3) >> 1, the temporal prediction
;   [rsp+32] = samples at cur+t0
;   [rsp+48] = temporal difference limit
; Each iteration stores a full vector to dst but advances all pointers by
; only mmsize-4 bytes (mmsize/2-2 samples): the lane shifts in the score
; computation leave the top two lanes without a complete score.
| %macro FILTER 3 | |||||
| .loop%1: | |||||
; NOTE(review): m7 is not read again in this macro before being reloaded
; from [rsp+32] below; this zeroing looks unused -- confirm.
| pxor m7, m7 | |||||
| LOAD 0, [curq+t1] | |||||
| LOAD 1, [curq+t0] | |||||
| LOAD 2, [%2] | |||||
| LOAD 3, [%3] | |||||
; m3 = (%2 + %3) >> 1, m2 = |%2 - %3|
| mova m4, m3 | |||||
| paddw m3, m2 | |||||
| psraw m3, 1 | |||||
| mova [rsp+ 0], m0 | |||||
| mova [rsp+16], m3 | |||||
| mova [rsp+32], m1 | |||||
| psubw m2, m4 | |||||
| PABS m2, m4 | |||||
; m3 = (|prev[t1] - cur[t1]| + |prev[t0] - cur[t0]|) >> 1
| LOAD 3, [prevq+t1] | |||||
| LOAD 4, [prevq+t0] | |||||
| psubw m3, m0 | |||||
| psubw m4, m1 | |||||
| PABS m3, m5 | |||||
| PABS m4, m5 | |||||
| paddw m3, m4 | |||||
| psrlw m2, 1 | |||||
| psrlw m3, 1 | |||||
| pmaxsw m2, m3 | |||||
; Same measure against the next field; m2 accumulates the maximum.
| LOAD 3, [nextq+t1] | |||||
| LOAD 4, [nextq+t0] | |||||
| psubw m3, m0 | |||||
| psubw m4, m1 | |||||
| PABS m3, m5 | |||||
| PABS m4, m5 | |||||
| paddw m3, m4 | |||||
| psrlw m3, 1 | |||||
| pmaxsw m2, m3 | |||||
| mova [rsp+48], m2 | |||||
; m1 = (cur[t1] + cur[t0]) >> 1, the initial spatial prediction
; m0 = |cur[t1] - cur[t0]|, computed as |2*a - (a + b)|
| paddw m1, m0 | |||||
| paddw m0, m0 | |||||
| psubw m0, m1 | |||||
| psrlw m1, 1 | |||||
| PABS m0, m2 | |||||
; Add the absolute differences one sample to the left (lane i of this load)
; and one sample to the right (lane i+2, brought down by the shift), then
; subtract 1 to form the initial spatial score in m0.
| movu m2, [curq+t1-1*2] | |||||
| movu m3, [curq+t0-1*2] | |||||
| mova m4, m2 | |||||
| psubusw m2, m3 | |||||
| psubusw m3, m4 | |||||
| PMAXUW m2, m3 | |||||
| %if mmsize == 16 | |||||
| mova m3, m2 | |||||
| psrldq m3, 4 | |||||
| %else | |||||
| mova m3, m2 | |||||
| psrlq m3, 32 | |||||
| %endif | |||||
| paddw m0, m2 | |||||
| paddw m0, m3 | |||||
| psubw m0, [pw_1] | |||||
; Try two candidates in each diagonal direction; the second of each pair is
; only accepted in lanes where the first one was (CHECK2 uses CHECK1's mask).
| CHECK -2, 0 | |||||
| CHECK1 | |||||
| CHECK -3, 1 | |||||
| CHECK2 | |||||
| CHECK 0, -2 | |||||
| CHECK1 | |||||
| CHECK 1, -3 | |||||
| CHECK2 | |||||
; m6 = temporal difference limit. When the mode argument (r8m) is >= 2 the
; additional widening of the limit below is skipped.
| mova m6, [rsp+48] | |||||
| cmp DWORD r8m, 2 | |||||
| jge .end%1 | |||||
; m2 = (%2[2*t1] + %3[2*t1]) >> 1, m3 = (%2[2*t0] + %3[2*t0]) >> 1
| LOAD 2, [%2+t1*2] | |||||
| LOAD 4, [%3+t1*2] | |||||
| LOAD 3, [%2+t0*2] | |||||
| LOAD 5, [%3+t0*2] | |||||
| paddw m2, m4 | |||||
| paddw m3, m5 | |||||
| psrlw m2, 1 | |||||
| psrlw m3, 1 | |||||
; m4 = cur[t1], m5 = temporal prediction, m7 = cur[t0]
| mova m4, [rsp+ 0] | |||||
| mova m5, [rsp+16] | |||||
| mova m7, [rsp+32] | |||||
| psubw m2, m4 | |||||
| psubw m3, m7 | |||||
| mova m0, m5 | |||||
| psubw m5, m4 | |||||
| psubw m0, m7 | |||||
; m2 = max(min(m2, m3), m5, m0)
; m3 = min(max(m2, m3), m5, m0)
| mova m4, m2 | |||||
| pminsw m2, m3 | |||||
| pmaxsw m3, m4 | |||||
| pmaxsw m2, m5 | |||||
| pminsw m3, m5 | |||||
| pmaxsw m2, m0 | |||||
| pminsw m3, m0 | |||||
; limit = max(limit, m3, -m2)
| pxor m4, m4 | |||||
| pmaxsw m6, m3 | |||||
| psubw m4, m2 | |||||
| pmaxsw m6, m4 | |||||
| .end%1: | |||||
; Clamp the spatial prediction in m1 to [temporal - limit, temporal + limit]
; and store it.
| mova m2, [rsp+16] | |||||
| mova m3, m2 | |||||
| psubw m2, m6 | |||||
| paddw m3, m6 | |||||
| pmaxsw m1, m2 | |||||
| pminsw m1, m3 | |||||
| movu [dstq], m1 | |||||
; Advance by the number of valid lanes and loop while the remaining width
; (the w argument, r4m) is still positive.
| add dstq, mmsize-4 | |||||
| add prevq, mmsize-4 | |||||
| add curq, mmsize-4 | |||||
| add nextq, mmsize-4 | |||||
| sub DWORD r4m, mmsize/2-2 | |||||
| jg .loop%1 | |||||
| %endmacro | |||||
; YADIF
; Emits one yadif_filter_line_10bit function for the instruction set selected
; by the preceding INIT_* directive.
; Arguments, in order: dst, prev, cur, next, w, prefs, mrefs, parity, mode.
; NOTE(review): the numeric cglobal fields are presumably, per x86inc
; convention, args loaded into registers (4), GPRs used (6 or 7), vector
; registers used (8) and stack bytes (80) -- confirm against x86inc.asm.
| %macro YADIF 0 | |||||
| %if ARCH_X86_32 | |||||
| cglobal yadif_filter_line_10bit, 4, 6, 8, 80, dst, prev, cur, next, w, \ | |||||
| prefs, mrefs, parity, mode | |||||
| %else | |||||
| cglobal yadif_filter_line_10bit, 4, 7, 8, 80, dst, prev, cur, next, w, \ | |||||
| prefs, mrefs, parity, mode | |||||
| %endif | |||||
; Nothing to do for a non-positive width.
| cmp DWORD wm, 0 | |||||
| jle .ret | |||||
; Load prefs (r5m) and mrefs (r6m) into the registers that FILTER and CHECK
; address as t0 and t1; on x86-64 the 32-bit values are sign-extended.
| %if ARCH_X86_32 | |||||
| mov r4, r5mp | |||||
| mov r5, r6mp | |||||
| DECLARE_REG_TMP 4,5 | |||||
| %else | |||||
| movsxd r5, DWORD r5m | |||||
| movsxd r6, DWORD r6m | |||||
| DECLARE_REG_TMP 5,6 | |||||
| %endif | |||||
; parity selects which pair of fields is averaged for the temporal
; prediction: prev/cur when non-zero, cur/next when zero.
| cmp DWORD paritym, 0 | |||||
| je .parity0 | |||||
| FILTER 1, prevq, curq | |||||
| jmp .ret | |||||
| .parity0: | |||||
| FILTER 0, curq, nextq | |||||
| .ret: | |||||
| RET | |||||
| %endmacro | |||||
; Instantiate the function once per instruction set: SSSE3 and SSE2 with XMM
; registers on every x86 target, and an MMXEXT version on 32-bit x86 only.
| INIT_XMM ssse3 | |||||
| YADIF | |||||
| INIT_XMM sse2 | |||||
| YADIF | |||||
| %if ARCH_X86_32 | |||||
| INIT_MMX mmxext | |||||
| YADIF | |||||
| %endif | |||||