Decoding time of ped1080p.webm goes from 20.7sec to 11.3sec. [tags/n2.1]
| @@ -2044,4 +2044,6 @@ av_cold void ff_vp9dsp_init(VP9DSPContext *dsp) | |||
| vp9dsp_itxfm_init(dsp); | |||
| vp9dsp_loopfilter_init(dsp); | |||
| vp9dsp_mc_init(dsp); | |||
| if (ARCH_X86) ff_vp9dsp_init_x86(dsp); | |||
| } | |||
| @@ -112,4 +112,6 @@ typedef struct VP9DSPContext { | |||
| void ff_vp9dsp_init(VP9DSPContext *dsp); | |||
| void ff_vp9dsp_init_x86(VP9DSPContext *dsp); | |||
| #endif /* AVCODEC_VP9DSP_H */ | |||
| @@ -34,6 +34,7 @@ OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o | |||
| OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o | |||
| OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o | |||
| OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o | |||
| OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o | |||
| OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o | |||
| OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o | |||
| @@ -98,4 +99,5 @@ YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o | |||
| YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o | |||
| YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o | |||
| YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o | |||
| YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp.o | |||
| YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o | |||
| @@ -0,0 +1,221 @@ | |||
| ;****************************************************************************** | |||
| ;* VP9 SIMD optimizations | |||
| ;* | |||
| ;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> | |||
| ;* | |||
| ;* This file is part of FFmpeg. | |||
| ;* | |||
| ;* FFmpeg is free software; you can redistribute it and/or | |||
| ;* modify it under the terms of the GNU Lesser General Public | |||
| ;* License as published by the Free Software Foundation; either | |||
| ;* version 2.1 of the License, or (at your option) any later version. | |||
| ;* | |||
| ;* FFmpeg is distributed in the hope that it will be useful, | |||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| ;* Lesser General Public License for more details. | |||
| ;* | |||
| ;* You should have received a copy of the GNU Lesser General Public | |||
| ;* License along with FFmpeg; if not, write to the Free Software | |||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| ;****************************************************************************** | |||
| %include "libavutil/x86/x86util.asm" | |||
| SECTION_RODATA | |||
| ; FIXME share with vp8dsp.asm | |||
| pw_256: times 8 dw 256 | |||
| ; F8_TAPS t0, ..., t7: emit one 8-tap filter as four 16-byte rows. Each | |||
| ; row holds one pair of adjacent taps (t0/t1, t2/t3, t4/t5, t6/t7) repeated | |||
| ; 8 times, i.e. the byte layout the pmaddubsw-based kernels in this file | |||
| ; multiply interleaved pixel pairs against. | |||
| %macro F8_TAPS 8 | |||
| times 8 db %1, %2 | |||
| times 8 db %3, %4 | |||
| times 8 db %5, %6 | |||
| times 8 db %7, %8 | |||
| %endmacro | |||
| ; int8_t ff_filters_ssse3[3][15][4][16] | |||
| ; Layout: [filter type][row][tap pair][16 bytes]. The three filter types | |||
| ; are stored in the order smooth, regular, sharp, with 15 rows each; the C | |||
| ; init code selects a row with (mx - 1) or (my - 1). | |||
| const filters_ssse3 ; smooth | |||
| F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0 | |||
| F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0 | |||
| F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0 | |||
| F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0 | |||
| F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0 | |||
| F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0 | |||
| F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1 | |||
| F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1 | |||
| F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1 | |||
| F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1 | |||
| F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2 | |||
| F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2 | |||
| F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2 | |||
| F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2 | |||
| F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3 | |||
| ; regular | |||
| F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0 | |||
| F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0 | |||
| F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1 | |||
| F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1 | |||
| F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1 | |||
| F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1 | |||
| F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1 | |||
| F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1 | |||
| F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1 | |||
| F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1 | |||
| F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1 | |||
| F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1 | |||
| F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1 | |||
| F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1 | |||
| F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0 | |||
| ; sharp | |||
| F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0 | |||
| F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1 | |||
| F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2 | |||
| F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2 | |||
| F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3 | |||
| F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3 | |||
| F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4 | |||
| F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4 | |||
| F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4 | |||
| F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4 | |||
| F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4 | |||
| F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4 | |||
| F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3 | |||
| F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2 | |||
| F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1 | |||
| SECTION .text | |||
| ; filter_h_fn put|avg: define the horizontal 1-D 8-tap kernel | |||
| ; <op>_8tap_1d_h_<px>(dst, dstride, src, sstride, h, filtery), where <px> is | |||
| ; mmsize/2 pixels per row. For each of h rows it filters the pixels at | |||
| ; src[-3..+4] with the 4x16-byte tap table at filtery and either stores the | |||
| ; result (put) or averages it with the bytes already in dst (avg). | |||
| %macro filter_h_fn 1 | |||
| %assign %%px mmsize/2 | |||
| cglobal %1_8tap_1d_h_%%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery | |||
| ; m6 = words of 256, used with pmulhrsw below to round and shift by 7 | |||
| mova m6, [pw_256] | |||
| ; m7 = first tap pair; on x86-64 with registers wider than 8 bytes the | |||
| ; other three tap pairs are cached in m8-m10, otherwise they are read | |||
| ; from memory inside the loop | |||
| mova m7, [filteryq+ 0] | |||
| %if ARCH_X86_64 && mmsize > 8 | |||
| mova m8, [filteryq+16] | |||
| mova m9, [filteryq+32] | |||
| mova m10, [filteryq+48] | |||
| %endif | |||
| .loop: | |||
| ; load the row at eight byte offsets (-3..+4) and interleave neighbouring | |||
| ; offsets so every 16-bit lane holds the two pixels for one tap pair | |||
| movh m0, [srcq-3] | |||
| movh m1, [srcq-2] | |||
| movh m2, [srcq-1] | |||
| movh m3, [srcq+0] | |||
| movh m4, [srcq+1] | |||
| movh m5, [srcq+2] | |||
| punpcklbw m0, m1 | |||
| punpcklbw m2, m3 | |||
| movh m1, [srcq+3] | |||
| movh m3, [srcq+4] | |||
| add srcq, sstrideq | |||
| punpcklbw m4, m5 | |||
| punpcklbw m1, m3 | |||
| ; pmaddubsw: unsigned pixel bytes times signed tap bytes, adjacent | |||
| ; products summed into signed words | |||
| pmaddubsw m0, m7 | |||
| %if ARCH_X86_64 && mmsize > 8 | |||
| pmaddubsw m2, m8 | |||
| pmaddubsw m4, m9 | |||
| pmaddubsw m1, m10 | |||
| %else | |||
| pmaddubsw m2, [filteryq+16] | |||
| pmaddubsw m4, [filteryq+32] | |||
| pmaddubsw m1, [filteryq+48] | |||
| %endif | |||
| ; combine the four partial sums; only the final add saturates | |||
| paddw m0, m2 | |||
| paddw m4, m1 | |||
| paddsw m0, m4 | |||
| ; multiply-high-round by 256 is equivalent to (x + 64) >> 7 | |||
| pmulhrsw m0, m6 | |||
| %ifidn %1, avg | |||
| movh m1, [dstq] | |||
| %endif | |||
| ; pack words to bytes with unsigned saturation | |||
| packuswb m0, m0 | |||
| %ifidn %1, avg | |||
| ; avg variant: rounded byte average with the existing dst pixels | |||
| pavgb m0, m1 | |||
| %endif | |||
| movh [dstq], m0 | |||
| add dstq, dstrideq | |||
| ; one row done; repeat while the row counter stays above zero | |||
| dec hd | |||
| jg .loop | |||
| RET | |||
| %endmacro | |||
| ; instantiate put/avg for both register widths (the C side declares the | |||
| ; matching 4- and 8-pixel prototypes) | |||
| INIT_MMX ssse3 | |||
| filter_h_fn put | |||
| filter_h_fn avg | |||
| INIT_XMM ssse3 | |||
| filter_h_fn put | |||
| filter_h_fn avg | |||
| ; filter_v_fn put|avg: define the vertical 1-D 8-tap kernel | |||
| ; <op>_8tap_1d_v_<px>(dst, dstride, src, sstride, h, filtery), where <px> is | |||
| ; mmsize/2 pixels per row. For each of h output rows it filters the eight | |||
| ; source rows from 3 above to 4 below the current row with the 4x16-byte | |||
| ; tap table at filtery and either stores the result (put) or averages it | |||
| ; with the bytes already in dst (avg). | |||
| %macro filter_v_fn 1 | |||
| %assign %%px mmsize/2 | |||
| %if ARCH_X86_64 | |||
| cglobal %1_8tap_1d_v_%%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3 | |||
| %else | |||
| ; 32-bit: only 7 GPRs are requested, so just the first four arguments are | |||
| ; auto-loaded; filtery is fetched by hand from r5mp and the row counter is | |||
| ; used via r4mp (presumably the stack slots of args 5 and 4 per x86inc | |||
| ; conventions - confirm against x86inc.asm) | |||
| cglobal %1_8tap_1d_v_%%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3 | |||
| mov filteryq, r5mp | |||
| %define hd r4mp | |||
| %endif | |||
| ; move src up by three rows (three separate subtractions interleaved with | |||
| ; other setup), precompute 3*sstride, and point src4 four rows below the | |||
| ; adjusted src so all eight rows are addressable from two base pointers | |||
| sub srcq, sstrideq | |||
| lea sstride3q, [sstrideq*3] | |||
| sub srcq, sstrideq | |||
| mova m6, [pw_256] | |||
| sub srcq, sstrideq | |||
| mova m7, [filteryq+ 0] | |||
| lea src4q, [srcq+sstrideq*4] | |||
| %if ARCH_X86_64 && mmsize > 8 | |||
| mova m8, [filteryq+16] | |||
| mova m9, [filteryq+32] | |||
| mova m10, [filteryq+48] | |||
| %endif | |||
| .loop: | |||
| ; FIXME maybe reuse loads from previous rows, or just | |||
| ; more generally unroll this to prevent multiple loads of | |||
| ; the same data? | |||
| ; load eight consecutive rows and interleave neighbouring rows so every | |||
| ; 16-bit lane holds the two pixels for one tap pair | |||
| movh m0, [srcq] | |||
| movh m1, [srcq+sstrideq] | |||
| movh m2, [srcq+sstrideq*2] | |||
| movh m3, [srcq+sstride3q] | |||
| movh m4, [src4q] | |||
| movh m5, [src4q+sstrideq] | |||
| punpcklbw m0, m1 | |||
| punpcklbw m2, m3 | |||
| movh m1, [src4q+sstrideq*2] | |||
| movh m3, [src4q+sstride3q] | |||
| add srcq, sstrideq | |||
| add src4q, sstrideq | |||
| punpcklbw m4, m5 | |||
| punpcklbw m1, m3 | |||
| ; pmaddubsw: unsigned pixel bytes times signed tap bytes, adjacent | |||
| ; products summed into signed words | |||
| pmaddubsw m0, m7 | |||
| %if ARCH_X86_64 && mmsize > 8 | |||
| pmaddubsw m2, m8 | |||
| pmaddubsw m4, m9 | |||
| pmaddubsw m1, m10 | |||
| %else | |||
| pmaddubsw m2, [filteryq+16] | |||
| pmaddubsw m4, [filteryq+32] | |||
| pmaddubsw m1, [filteryq+48] | |||
| %endif | |||
| ; combine the four partial sums; only the final add saturates | |||
| paddw m0, m2 | |||
| paddw m4, m1 | |||
| paddsw m0, m4 | |||
| ; multiply-high-round by 256 is equivalent to (x + 64) >> 7 | |||
| pmulhrsw m0, m6 | |||
| %ifidn %1, avg | |||
| movh m1, [dstq] | |||
| %endif | |||
| ; pack words to bytes with unsigned saturation | |||
| packuswb m0, m0 | |||
| %ifidn %1, avg | |||
| ; avg variant: rounded byte average with the existing dst pixels | |||
| pavgb m0, m1 | |||
| %endif | |||
| movh [dstq], m0 | |||
| add dstq, dstrideq | |||
| ; one row done; repeat while the row counter stays above zero | |||
| dec hd | |||
| jg .loop | |||
| RET | |||
| %endmacro | |||
| ; instantiate put/avg for both register widths (the C side declares the | |||
| ; matching 4- and 8-pixel prototypes) | |||
| INIT_MMX ssse3 | |||
| filter_v_fn put | |||
| filter_v_fn avg | |||
| INIT_XMM ssse3 | |||
| filter_v_fn put | |||
| filter_v_fn avg | |||
| @@ -0,0 +1,171 @@ | |||
| /* | |||
| * VP9 SIMD optimizations | |||
| * | |||
| * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> | |||
| * | |||
| * This file is part of FFmpeg. | |||
| * | |||
| * FFmpeg is free software; you can redistribute it and/or | |||
| * modify it under the terms of the GNU Lesser General Public | |||
| * License as published by the Free Software Foundation; either | |||
| * version 2.1 of the License, or (at your option) any later version. | |||
| * | |||
| * FFmpeg is distributed in the hope that it will be useful, | |||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
| * Lesser General Public License for more details. | |||
| * | |||
| * You should have received a copy of the GNU Lesser General Public | |||
| * License along with FFmpeg; if not, write to the Free Software | |||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
| */ | |||
| #include "libavutil/cpu.h" | |||
| #include "libavutil/mem.h" | |||
| #include "libavutil/x86/asm.h" | |||
| #include "libavcodec/vp9dsp.h" | |||
| #if HAVE_YASM | |||
| #define mc_func(avg, sz, dir, opt) /* prototype of one 1-D 8-tap kernel; no C body here, presumably the cglobal asm kernel */ \ | |||
| void ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ | |||
| const uint8_t *src, ptrdiff_t src_stride, \ | |||
| int h, const int8_t (*filter)[16]) | |||
| #define mc_funcs(sz) /* declare put/avg x h/v SSSE3 kernels for one width */ \ | |||
| mc_func(put, sz, h, ssse3); \ | |||
| mc_func(avg, sz, h, ssse3); \ | |||
| mc_func(put, sz, v, ssse3); \ | |||
| mc_func(avg, sz, v, ssse3) | |||
| mc_funcs(4); /* 4-pixel-wide kernels */ | |||
| mc_funcs(8); /* 8-pixel-wide kernels */ | |||
| #undef mc_funcs | |||
| #undef mc_func | |||
| #define mc_rep_func(avg, sz, hsz, dir, opt) /* build a sz-wide 1-D kernel from two calls to the hsz-wide one */ \ | |||
| static av_always_inline void \ | |||
| ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ | |||
| const uint8_t *src, ptrdiff_t src_stride, \ | |||
| int h, const int8_t (*filter)[16]) \ | |||
| { \ | |||
| ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst, dst_stride, src, /* left half: columns 0..hsz-1 */ \ | |||
| src_stride, h, filter); \ | |||
| ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst + hsz, dst_stride, src + hsz, /* right half: columns hsz..sz-1 */ \ | |||
| src_stride, h, filter); \ | |||
| } | |||
| #define mc_rep_funcs(sz, hsz) /* instantiate put/avg x h/v wrappers for one width */ \ | |||
| mc_rep_func(put, sz, hsz, h, ssse3); \ | |||
| mc_rep_func(avg, sz, hsz, h, ssse3); \ | |||
| mc_rep_func(put, sz, hsz, v, ssse3); \ | |||
| mc_rep_func(avg, sz, hsz, v, ssse3) | |||
| mc_rep_funcs(16, 8); /* 16-wide from the declared 8-wide kernels */ | |||
| mc_rep_funcs(32, 16); /* 32-wide from the 16-wide wrappers above */ | |||
| mc_rep_funcs(64, 32); /* 64-wide from the 32-wide wrappers above */ | |||
| #undef mc_rep_funcs | |||
| #undef mc_rep_func | |||
| extern const int8_t ff_filters_ssse3[3][15][4][16]; /* tap tables: [filter type][row][tap pair][16 bytes]; defined outside this file */ | |||
| #define filter_8tap_2d_fn(op, sz, f, fname) /* 2-D (h then v) subpel function: horizontal pass into temp, vertical pass into dst */ \ | |||
| static void op##_8tap_##fname##_##sz##hv_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \ | |||
| const uint8_t *src, ptrdiff_t src_stride, \ | |||
| int h, int mx, int my) \ | |||
| { \ | |||
| LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]); /* 71 rows of stride 64; presumably 64 max rows + 7 extra filter rows - confirm */ \ | |||
| ff_put_8tap_1d_h_##sz##_ssse3(temp, 64, src - 3 * src_stride, src_stride, /* start 3 rows above src, always "put" */ \ | |||
| h + 7, ff_filters_ssse3[f][mx - 1]); /* h + 7 rows so the vertical pass has 3 rows above and 4 below */ \ | |||
| ff_##op##_8tap_1d_v_##sz##_ssse3(dst, dst_stride, temp + 3 * 64, 64, /* skip the 3 leading rows; op (put/avg) applies only here */ \ | |||
| h, ff_filters_ssse3[f][my - 1]); \ | |||
| } | |||
| #define filters_8tap_2d_fn(op, sz) /* one hv function per filter type */ \ | |||
| filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \ | |||
| filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp) \ | |||
| filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth) | |||
| #define filters_8tap_2d_fn2(op) /* all five block widths */ \ | |||
| filters_8tap_2d_fn(op, 64) \ | |||
| filters_8tap_2d_fn(op, 32) \ | |||
| filters_8tap_2d_fn(op, 16) \ | |||
| filters_8tap_2d_fn(op, 8) \ | |||
| filters_8tap_2d_fn(op, 4) | |||
| filters_8tap_2d_fn2(put) | |||
| filters_8tap_2d_fn2(avg) | |||
| #undef filters_8tap_2d_fn2 | |||
| #undef filters_8tap_2d_fn | |||
| #undef filter_8tap_2d_fn | |||
| #define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar) /* 1-D subpel function: forwards to the dir kernel with the tap row chosen by dvar */ \ | |||
| static void op##_8tap_##fname##_##sz##dir##_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \ | |||
| const uint8_t *src, ptrdiff_t src_stride, \ | |||
| int h, int mx, int my) \ | |||
| { \ | |||
| ff_##op##_8tap_1d_##dir##_##sz##_ssse3(dst, dst_stride, src, src_stride, \ | |||
| h, ff_filters_ssse3[f][dvar - 1]); /* dvar is mx or my; the table has 15 rows, so presumably 1..15 - confirm */ \ | |||
| } | |||
| #define filters_8tap_1d_fn(op, sz, dir, dvar) /* one function per filter type */ \ | |||
| filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \ | |||
| filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar) \ | |||
| filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar) | |||
| #define filters_8tap_1d_fn2(op, sz) /* horizontal uses mx, vertical uses my */ \ | |||
| filters_8tap_1d_fn(op, sz, h, mx) \ | |||
| filters_8tap_1d_fn(op, sz, v, my) | |||
| #define filters_8tap_1d_fn3(op) /* all five block widths */ \ | |||
| filters_8tap_1d_fn2(op, 64) \ | |||
| filters_8tap_1d_fn2(op, 32) \ | |||
| filters_8tap_1d_fn2(op, 16) \ | |||
| filters_8tap_1d_fn2(op, 8) \ | |||
| filters_8tap_1d_fn2(op, 4) | |||
| filters_8tap_1d_fn3(put) | |||
| filters_8tap_1d_fn3(avg) | |||
| #undef filters_8tap_1d_fn | |||
| #undef filters_8tap_1d_fn2 | |||
| #undef filters_8tap_1d_fn3 | |||
| #undef filter_8tap_1d_fn | |||
| #endif /* HAVE_YASM */ | |||
| av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) /* install the SSSE3 8-tap subpel functions into dsp->mc; does nothing without HAVE_YASM */ | |||
| { | |||
| #if HAVE_YASM | |||
| int cpu_flags = av_get_cpu_flags(); | |||
| #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) /* fill the smooth/regular/sharp slots for one size and direction */ \ | |||
| dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \ | |||
| dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \ | |||
| dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_##opt | |||
| #define init_subpel2(idx, idxh, idxv, dir, type, opt) /* first mc index 0..4 maps to widths 64, 32, 16, 8, 4 */ \ | |||
| init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \ | |||
| init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \ | |||
| init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \ | |||
| init_subpel1(3, idx, idxh, idxv, 8, dir, type, opt); \ | |||
| init_subpel1(4, idx, idxh, idxv, 4, dir, type, opt) | |||
| #define init_subpel3(idx, type, opt) /* [idxh][idxv]: [1][1] = hv, [0][1] = v, [1][0] = h; [0][0] is left untouched */ \ | |||
| init_subpel2(idx, 1, 1, hv, type, opt); \ | |||
| init_subpel2(idx, 0, 1, v, type, opt); \ | |||
| init_subpel2(idx, 1, 0, h, type, opt) | |||
| if (cpu_flags & AV_CPU_FLAG_SSSE3) { /* only override when the CPU reports SSSE3 */ | |||
| init_subpel3(0, put, ssse3); /* third mc index 0: put */ | |||
| init_subpel3(1, avg, ssse3); /* third mc index 1: avg */ | |||
| } | |||
| #undef init_subpel1 | |||
| #undef init_subpel2 | |||
| #undef init_subpel3 | |||
| #endif /* HAVE_YASM */ | |||
| } | |||
| @@ -700,7 +700,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 | |||
| extern %1 | |||
| %endmacro | |||
| %macro const 2+ | |||
| %macro const 1-2+ | |||
| %xdefine %1 mangle(private_prefix %+ _ %+ %1) | |||
| global %1 | |||
| %1: %2 | |||