vp9_diag_downleft_16x16_10bpp_c: 263.0
vp9_diag_downleft_16x16_10bpp_sse2: 44.7
vp9_diag_downleft_16x16_10bpp_ssse3: 32.5
vp9_diag_downleft_16x16_10bpp_avx: 31.9
vp9_diag_downleft_16x16_10bpp_avx2: 25.7
vp9_diag_downleft_16x16_12bpp_c: 264.7
vp9_diag_downleft_16x16_12bpp_sse2: 44.4
vp9_diag_downleft_16x16_12bpp_ssse3: 32.0
vp9_diag_downleft_16x16_12bpp_avx: 32.4
vp9_diag_downleft_16x16_12bpp_avx2: 25.5

Benchmarked with 10000 runs

Signed-off-by: Ilia <zakne0ne@gmail.com>
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>

tags/n3.3
| @@ -51,6 +51,7 @@ decl_ipred_fns(h, 16, mmxext, sse2); | |||||
| decl_ipred_fns(dc, 16, mmxext, sse2); | decl_ipred_fns(dc, 16, mmxext, sse2); | ||||
| decl_ipred_fns(dc_top, 16, mmxext, sse2); | decl_ipred_fns(dc_top, 16, mmxext, sse2); | ||||
| decl_ipred_fns(dc_left, 16, mmxext, sse2); | decl_ipred_fns(dc_left, 16, mmxext, sse2); | ||||
| decl_ipred_fn(dl, 16, 16, avx2); | |||||
| #define decl_ipred_dir_funcs(type) \ | #define decl_ipred_dir_funcs(type) \ | ||||
| decl_ipred_fns(type, 16, sse2, sse2); \ | decl_ipred_fns(type, 16, sse2, sse2); \ | ||||
| @@ -133,6 +134,7 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp) | |||||
| init_fpel_func(2, 1, 32, avg, _16, avx2); | init_fpel_func(2, 1, 32, avg, _16, avx2); | ||||
| init_fpel_func(1, 1, 64, avg, _16, avx2); | init_fpel_func(1, 1, 64, avg, _16, avx2); | ||||
| init_fpel_func(0, 1, 128, avg, _16, avx2); | init_fpel_func(0, 1, 128, avg, _16, avx2); | ||||
| init_ipred_func(dl, DIAG_DOWN_LEFT, 16, 16, avx2); | |||||
| } | } | ||||
| #endif /* HAVE_YASM */ | #endif /* HAVE_YASM */ | ||||
| @@ -847,6 +847,45 @@ DL_FUNCS | |||||
| INIT_XMM avx | INIT_XMM avx | ||||
| DL_FUNCS | DL_FUNCS | ||||
| %if HAVE_AVX2_EXTERNAL | |||||
| INIT_YMM avx2 | |||||
;-----------------------------------------------------------------------
; vp9_ipred_dl_16x16_16 (AVX2)
; Diagonal down-left intra prediction of a 16x16 block; the trailing _16
; and the 2-byte pixel offsets below indicate 16-bit samples.
;
; Named args: dst, stride, l, a. Only `a` is read as input data here
; (presumably the row of pixels above the block -- confirm with callers);
; `l` is never read, and its register is reused as stride3 further down.
; NOTE(review): per the x86inc cglobal convention the numbers 2, 4, 5
; should mean 2 args auto-loaded, 4 GPRs, 5 vector registers (m0-m4).
;
; Row 0 is the low-pass-filtered edge; every following row is the previous
; one shifted left by one pixel, with the last edge pixel `p` shifted in.
;-----------------------------------------------------------------------
| cglobal vp9_ipred_dl_16x16_16, 2, 4, 5, dst, stride, l, a | |||||
; Fetch `a` into its register on targets where it was not passed in one.
| movifnidn aq, amp | |||||
; NOTE(review): mova is an aligned move in x86inc -- assumes aq and each
; dst row are 32-byte aligned; confirm against the callers.
| mova m0, [aq] ; abcdefghijklmnop | |||||
; Byte offset 30 = pixel 15, the last edge pixel, replicated as padding.
| vpbroadcastw xm1, [aq+30] ; pppppppp | |||||
; vpalignr shifts inside each 128-bit lane independently, so m2 is built
; as "m0 advanced by one whole lane (8 pixels), padded with p". Pairing
; m2:m0 in vpalignr then gives a correct shift across the full 256 bits.
| vperm2i128 m2, m0, m1, q0201 ; ijklmnoppppppppp | |||||
| vpalignr m3, m2, m0, 2 ; bcdefghijklmnopp | |||||
| vpalignr m4, m2, m0, 4 ; cdefghijklmnoppp | |||||
; LOWPASS is defined elsewhere in the file; presumably the 3-tap smoothing
; filter over (m0, m3, m4) with the result written to m0 -- TODO confirm.
| LOWPASS 0, 3, 4 ; BCDEFGHIJKLMNOPp | |||||
; Rebuild the lane-advanced companion, this time from the filtered edge.
| vperm2i128 m2, m0, m1, q0201 ; JKLMNOPppppppppp | |||||
; Rename registers: the slot that held `l` becomes stride3, `a` becomes cnt.
| DEFINE_ARGS dst, stride, stride3, cnt | |||||
| mov cntd, 2 | |||||
| lea stride3q, [strideq*3] | |||||
; Each pass writes 8 rows; row k is m2:m0 shifted by 2*k bytes (k pixels).
| .loop: | |||||
| mova [dstq+strideq*0], m0 | |||||
| vpalignr m3, m2, m0, 2 | |||||
| vpalignr m4, m2, m0, 4 | |||||
| mova [dstq+strideq*1], m3 | |||||
| mova [dstq+strideq*2], m4 | |||||
| vpalignr m3, m2, m0, 6 | |||||
| vpalignr m4, m2, m0, 8 | |||||
| mova [dstq+stride3q ], m3 | |||||
; Advance dst by four rows; m4 (shift of 4 pixels) is the first row of
; the next group of four.
| lea dstq, [dstq+strideq*4] | |||||
| mova [dstq+strideq*0], m4 | |||||
| vpalignr m3, m2, m0, 10 | |||||
| vpalignr m4, m2, m0, 12 | |||||
| mova [dstq+strideq*1], m3 | |||||
| mova [dstq+strideq*2], m4 | |||||
| vpalignr m3, m2, m0, 14 | |||||
| mova [dstq+stride3q ], m3 | |||||
| lea dstq, [dstq+strideq*4] | |||||
; Next pass starts 8 pixels further along the edge: the old companion
; becomes the new base, and the new companion is pure padding.
| mova m0, m2 | |||||
| vperm2i128 m2, m2, m2, q0101 ; pppppppppppppppp | |||||
; cnt starts at 2 and jg continues only while it is still positive after
; the decrement, so the body runs exactly twice (2 x 8 = 16 rows).
| dec cntd | |||||
| jg .loop | |||||
| RET | |||||
| %endif | |||||
| %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function | %macro DR_FUNCS 1 ; stack_mem_for_32x32_32bit_function | ||||
| cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a | cglobal vp9_ipred_dr_4x4_16, 4, 4, 3, dst, stride, l, a | ||||
| movh m0, [lq] ; wxyz.... | movh m0, [lq] ; wxyz.... | ||||