| @@ -29,20 +29,20 @@ | |||
/*
 * Prototypes for the full-pel put/avg routines that ff_vp9dsp_init_x86
 * installs through init_fpel_func.
 *
 * Two spellings of the same list appear in this span. The first group uses
 * the three-argument decl_fpel_func(op, size, isa) form. The second group
 * uses the four-argument form with an extra name-suffix slot between the
 * size and the instruction-set tag: the put entries leave that slot empty,
 * the avg entries pass _8.
 */
| #if HAVE_YASM | |||
| decl_fpel_func(put, 4, mmx); | |||
| decl_fpel_func(put, 8, mmx); | |||
| decl_fpel_func(put, 16, sse); | |||
| decl_fpel_func(put, 32, sse); | |||
| decl_fpel_func(put, 64, sse); | |||
| decl_fpel_func(avg, 4, mmxext); | |||
| decl_fpel_func(avg, 8, mmxext); | |||
| decl_fpel_func(avg, 16, sse2); | |||
| decl_fpel_func(avg, 32, sse2); | |||
| decl_fpel_func(avg, 64, sse2); | |||
| decl_fpel_func(put, 32, avx); | |||
| decl_fpel_func(put, 64, avx); | |||
| decl_fpel_func(avg, 32, avx2); | |||
| decl_fpel_func(avg, 64, avx2); | |||
| decl_fpel_func(put, 4, , mmx); | |||
| decl_fpel_func(put, 8, , mmx); | |||
| decl_fpel_func(put, 16, , sse); | |||
| decl_fpel_func(put, 32, , sse); | |||
| decl_fpel_func(put, 64, , sse); | |||
| decl_fpel_func(avg, 4, _8, mmxext); | |||
| decl_fpel_func(avg, 8, _8, mmxext); | |||
| decl_fpel_func(avg, 16, _8, sse2); | |||
| decl_fpel_func(avg, 32, _8, sse2); | |||
| decl_fpel_func(avg, 64, _8, sse2); | |||
| decl_fpel_func(put, 32, , avx); | |||
| decl_fpel_func(put, 64, , avx); | |||
| decl_fpel_func(avg, 32, _8, avx2); | |||
| decl_fpel_func(avg, 64, _8, avx2); | |||
| #define mc_func(avg, sz, dir, opt, type, f_sz) \ | |||
| void ff_vp9_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ | |||
| @@ -378,8 +378,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) | |||
| } while (0) | |||
| if (EXTERNAL_MMX(cpu_flags)) { | |||
| init_fpel_func(4, 0, 4, put, mmx); | |||
| init_fpel_func(3, 0, 8, put, mmx); | |||
| init_fpel_func(4, 0, 4, put, , mmx); | |||
| init_fpel_func(3, 0, 8, put, , mmx); | |||
| if (!bitexact) { | |||
| dsp->itxfm_add[4 /* lossless */][DCT_DCT] = | |||
| dsp->itxfm_add[4 /* lossless */][ADST_DCT] = | |||
| @@ -392,8 +392,8 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) | |||
| if (EXTERNAL_MMXEXT(cpu_flags)) { | |||
| init_subpel2(4, 0, 4, put, mmxext); | |||
| init_subpel2(4, 1, 4, avg, mmxext); | |||
| init_fpel_func(4, 1, 4, avg, mmxext); | |||
| init_fpel_func(3, 1, 8, avg, mmxext); | |||
| init_fpel_func(4, 1, 4, avg, _8, mmxext); | |||
| init_fpel_func(3, 1, 8, avg, _8, mmxext); | |||
| dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; | |||
| init_dc_ipred(4, mmxext); | |||
| init_dc_ipred(8, mmxext); | |||
| @@ -401,9 +401,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) | |||
| } | |||
| if (EXTERNAL_SSE(cpu_flags)) { | |||
| init_fpel_func(2, 0, 16, put, sse); | |||
| init_fpel_func(1, 0, 32, put, sse); | |||
| init_fpel_func(0, 0, 64, put, sse); | |||
| init_fpel_func(2, 0, 16, put, , sse); | |||
| init_fpel_func(1, 0, 32, put, , sse); | |||
| init_fpel_func(0, 0, 64, put, , sse); | |||
| init_ipred(16, sse, v, VERT); | |||
| init_ipred(32, sse, v, VERT); | |||
| } | |||
| @@ -411,9 +411,9 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) | |||
| if (EXTERNAL_SSE2(cpu_flags)) { | |||
| init_subpel3_8to64(0, put, sse2); | |||
| init_subpel3_8to64(1, avg, sse2); | |||
| init_fpel_func(2, 1, 16, avg, sse2); | |||
| init_fpel_func(1, 1, 32, avg, sse2); | |||
| init_fpel_func(0, 1, 64, avg, sse2); | |||
| init_fpel_func(2, 1, 16, avg, _8, sse2); | |||
| init_fpel_func(1, 1, 32, avg, _8, sse2); | |||
| init_fpel_func(0, 1, 64, avg, _8, sse2); | |||
| init_lpf(sse2); | |||
| dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2; | |||
| dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2; | |||
| @@ -483,14 +483,14 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) | |||
| init_dir_tm_h_ipred(32, avx); | |||
| } | |||
| if (EXTERNAL_AVX_FAST(cpu_flags)) { | |||
| init_fpel_func(1, 0, 32, put, avx); | |||
| init_fpel_func(0, 0, 64, put, avx); | |||
| init_fpel_func(1, 0, 32, put, , avx); | |||
| init_fpel_func(0, 0, 64, put, , avx); | |||
| init_ipred(32, avx, v, VERT); | |||
| } | |||
| if (EXTERNAL_AVX2(cpu_flags)) { | |||
| init_fpel_func(1, 1, 32, avg, avx2); | |||
| init_fpel_func(0, 1, 64, avg, avx2); | |||
| init_fpel_func(1, 1, 32, avg, _8, avx2); | |||
| init_fpel_func(0, 1, 64, avg, _8, avx2); | |||
| if (ARCH_X86_64) { | |||
| #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL | |||
| init_subpel3_32_64(0, put, avx2); | |||
| @@ -23,16 +23,16 @@ | |||
| #ifndef AVCODEC_X86_VP9DSP_INIT_H | |||
| #define AVCODEC_X86_VP9DSP_INIT_H | |||
/*
 * decl_fpel_func: declares the prototype of one full-pel function.
 *
 * Two forms of the macro appear in this span:
 *   - three parameters (avg, sz, opt): names the function
 *     ff_vp9_<avg><sz>_<opt>;
 *   - four parameters (avg, sz, bpp, opt): names the function
 *     ff_vp9_<avg><sz><bpp>_<opt>, i.e. <bpp> is pasted between the size
 *     and the _<opt> tag. Callers may pass an empty <bpp> argument, which
 *     yields the same name as the three-parameter form.
 *
 * In both forms the declared function returns void and takes
 * (dst, dst_stride, src, src_stride, h, mx, my).
 */
| #define decl_fpel_func(avg, sz, opt) \ | |||
| void ff_vp9_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ | |||
| const uint8_t *src, ptrdiff_t src_stride, \ | |||
| int h, int mx, int my) | |||
| #define decl_fpel_func(avg, sz, bpp, opt) \ | |||
| void ff_vp9_##avg##sz##bpp##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ | |||
| const uint8_t *src, ptrdiff_t src_stride, \ | |||
| int h, int mx, int my) | |||
/*
 * init_fpel_func: installs one full-pel function into the dsp->mc table.
 *
 * A single chained assignment stores the same function pointer at
 * dsp->mc[idx1][filter][idx2][0][0] for all four filter indices
 * (FILTER_8TAP_SMOOTH, FILTER_8TAP_REGULAR, FILTER_8TAP_SHARP and
 * FILTER_BILINEAR).
 *
 * Two forms of the header line and of the final line appear in this span:
 *   - five parameters (idx1, idx2, sz, type, opt): the stored function is
 *     ff_vp9_<type><sz>_<opt>;
 *   - six parameters (idx1, idx2, sz, type, bpp, opt): the stored function
 *     is ff_vp9_<type><sz><bpp>_<opt>; <bpp> may be empty.
 *
 * The expansion refers to a variable named dsp, which must be in scope at
 * the point of use.
 */
| #define init_fpel_func(idx1, idx2, sz, type, opt) \ | |||
| #define init_fpel_func(idx1, idx2, sz, type, bpp, opt) \ | |||
| dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ | |||
| dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ | |||
| dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ | |||
| dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##_##opt | |||
| dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_vp9_##type##sz##bpp##_##opt | |||
| void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp); | |||
| @@ -29,14 +29,22 @@ | |||
/*
 * Prototypes for the full-pel put/avg routines that
 * ff_vp9dsp_init_16bpp_x86 installs through init_fpel_func.
 *
 * Two spellings of the list appear in this span. The first group uses the
 * three-argument decl_fpel_func form and contains put entries only. The
 * second group uses the four-argument form: put entries leave the
 * name-suffix slot empty, avg entries pass _16.
 *
 * NOTE(review): the size argument runs from 8 to 128 here; presumably it is
 * the row width in bytes rather than in pixels -- confirm against the asm.
 */
| #if HAVE_YASM | |||
| decl_fpel_func(put, 8, mmx); | |||
| decl_fpel_func(put, 16, sse); | |||
| decl_fpel_func(put, 32, sse); | |||
| decl_fpel_func(put, 64, sse); | |||
| decl_fpel_func(put, 128, sse); | |||
| decl_fpel_func(put, 32, avx); | |||
| decl_fpel_func(put, 64, avx); | |||
| decl_fpel_func(put, 128, avx); | |||
| decl_fpel_func(put, 8, , mmx); | |||
| decl_fpel_func(avg, 8, _16, mmxext); | |||
| decl_fpel_func(put, 16, , sse); | |||
| decl_fpel_func(put, 32, , sse); | |||
| decl_fpel_func(put, 64, , sse); | |||
| decl_fpel_func(put, 128, , sse); | |||
| decl_fpel_func(avg, 16, _16, sse2); | |||
| decl_fpel_func(avg, 32, _16, sse2); | |||
| decl_fpel_func(avg, 64, _16, sse2); | |||
| decl_fpel_func(avg, 128, _16, sse2); | |||
| decl_fpel_func(put, 32, , avx); | |||
| decl_fpel_func(put, 64, , avx); | |||
| decl_fpel_func(put, 128, , avx); | |||
| decl_fpel_func(avg, 32, _16, avx2); | |||
| decl_fpel_func(avg, 64, _16, avx2); | |||
| decl_fpel_func(avg, 128, _16, avx2); | |||
| #endif /* HAVE_YASM */ | |||
| @@ -46,19 +54,37 @@ av_cold void ff_vp9dsp_init_16bpp_x86(VP9DSPContext *dsp, int bpp) | |||
| int cpu_flags = av_get_cpu_flags(); | |||
| if (EXTERNAL_MMX(cpu_flags)) { | |||
| init_fpel_func(4, 0, 8, put, mmx); | |||
| init_fpel_func(4, 0, 8, put, , mmx); | |||
| } | |||
| if (EXTERNAL_MMXEXT(cpu_flags)) { | |||
| init_fpel_func(4, 1, 8, avg, _16, mmxext); | |||
| } | |||
| if (EXTERNAL_SSE(cpu_flags)) { | |||
| init_fpel_func(3, 0, 16, put, sse); | |||
| init_fpel_func(2, 0, 32, put, sse); | |||
| init_fpel_func(1, 0, 64, put, sse); | |||
| init_fpel_func(0, 0, 128, put, sse); | |||
| init_fpel_func(3, 0, 16, put, , sse); | |||
| init_fpel_func(2, 0, 32, put, , sse); | |||
| init_fpel_func(1, 0, 64, put, , sse); | |||
| init_fpel_func(0, 0, 128, put, , sse); | |||
| } | |||
| if (EXTERNAL_SSE2(cpu_flags)) { | |||
| init_fpel_func(3, 1, 16, avg, _16, sse2); | |||
| init_fpel_func(2, 1, 32, avg, _16, sse2); | |||
| init_fpel_func(1, 1, 64, avg, _16, sse2); | |||
| init_fpel_func(0, 1, 128, avg, _16, sse2); | |||
| } | |||
| if (EXTERNAL_AVX_FAST(cpu_flags)) { | |||
| init_fpel_func(2, 0, 32, put, avx); | |||
| init_fpel_func(1, 0, 64, put, avx); | |||
| init_fpel_func(0, 0, 128, put, avx); | |||
| init_fpel_func(2, 0, 32, put, , avx); | |||
| init_fpel_func(1, 0, 64, put, , avx); | |||
| init_fpel_func(0, 0, 128, put, , avx); | |||
| } | |||
| if (EXTERNAL_AVX2(cpu_flags)) { | |||
| init_fpel_func(2, 1, 32, avg, _16, avx2); | |||
| init_fpel_func(1, 1, 64, avg, _16, avx2); | |||
| init_fpel_func(0, 1, 128, avg, _16, avx2); | |||
| } | |||
| #endif /* HAVE_YASM */ | |||
| @@ -553,7 +553,7 @@ filter_vx2_fn avg | |||
| %endif ; ARCH_X86_64 | |||
| %macro fpel_fn 6-7 4 | |||
| %macro fpel_fn 6-8 0, 4 | |||
| %if %2 == 4 | |||
| %define %%srcfn movh | |||
| %define %%dstfn movh | |||
| @@ -562,12 +562,22 @@ filter_vx2_fn avg | |||
| %define %%dstfn mova | |||
| %endif | |||
| %if %7 == 8 | |||
| %define %%pavg pavgb | |||
| %define %%szsuf _8 | |||
| %elif %7 == 16 | |||
| %define %%pavg pavgw | |||
| %define %%szsuf _16 | |||
| %else | |||
| %define %%szsuf | |||
| %endif | |||
| %if %2 <= mmsize | |||
| cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3 | |||
| cglobal vp9_%1%2%%szsuf, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3 | |||
| lea sstride3q, [sstrideq*3] | |||
| lea dstride3q, [dstrideq*3] | |||
| %else | |||
| cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h | |||
| cglobal vp9_%1%2%%szsuf, 5, 5, %8, dst, dstride, src, sstride, h | |||
| %endif | |||
| .loop: | |||
| %%srcfn m0, [srcq] | |||
| @@ -582,10 +592,16 @@ cglobal vp9_%1%2, 5, 5, %7, dst, dstride, src, sstride, h | |||
| %endif | |||
| lea srcq, [srcq+sstrideq*%6] | |||
| %ifidn %1, avg | |||
| pavgb m0, [dstq] | |||
| pavgb m1, [dstq+d%3] | |||
| pavgb m2, [dstq+d%4] | |||
| pavgb m3, [dstq+d%5] | |||
| %%pavg m0, [dstq] | |||
| %%pavg m1, [dstq+d%3] | |||
| %%pavg m2, [dstq+d%4] | |||
| %%pavg m3, [dstq+d%5] | |||
| %if %2/mmsize == 8 | |||
| %%pavg m4, [dstq+mmsize*4] | |||
| %%pavg m5, [dstq+mmsize*5] | |||
| %%pavg m6, [dstq+mmsize*6] | |||
| %%pavg m7, [dstq+mmsize*7] | |||
| %endif | |||
| %endif | |||
| %%dstfn [dstq], m0 | |||
| %%dstfn [dstq+d%3], m1 | |||
| @@ -611,25 +627,38 @@ INIT_MMX mmx | |||
; fpel_fn instantiations, one group per INIT_MMX / INIT_XMM / INIT_YMM
; selection.
;
; Argument order in the 6-8 argument form of the macro:
;   %1     operation (put or avg)
;   %2     width; it is compared against mmsize inside the macro
;   %3-%5  three offset expressions, used with a d prefix for the
;          destination addresses
;   %6     multiplier applied to sstrideq when srcq advances per iteration
;   %7     optional, default 0: 8 selects pavgb and an _8 name suffix,
;          16 selects pavgw and a _16 name suffix, anything else leaves
;          the name unsuffixed
;   %8     optional, default 4: forwarded to cglobal for widths above mmsize
;          (presumably the vector register count -- confirm in x86inc)
;
; Lines written in the older 6-7 argument form also appear in this span;
; there the 7th argument is the value forwarded to cglobal.
| fpel_fn put, 4, strideq, strideq*2, stride3q, 4 | |||
| fpel_fn put, 8, strideq, strideq*2, stride3q, 4 | |||
| INIT_MMX mmxext | |||
| fpel_fn avg, 4, strideq, strideq*2, stride3q, 4 | |||
| fpel_fn avg, 8, strideq, strideq*2, stride3q, 4 | |||
| fpel_fn avg, 4, strideq, strideq*2, stride3q, 4, 8 | |||
| fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 8 | |||
| INIT_XMM sse | |||
| fpel_fn put, 16, strideq, strideq*2, stride3q, 4 | |||
| fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2 | |||
| fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1 | |||
| fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 8 | |||
| fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1, 0, 8 | |||
| INIT_XMM sse2 | |||
| fpel_fn avg, 16, strideq, strideq*2, stride3q, 4 | |||
| fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2 | |||
| fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1 | |||
| fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 8 | |||
| fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 8 | |||
| fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 8 | |||
| INIT_YMM avx | |||
| fpel_fn put, 32, strideq, strideq*2, stride3q, 4 | |||
| fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2 | |||
| fpel_fn put, 128, mmsize, mmsize*2, mmsize*3, 1 | |||
| %if HAVE_AVX2_EXTERNAL | |||
| INIT_YMM avx2 | |||
| fpel_fn avg, 32, strideq, strideq*2, stride3q, 4 | |||
| fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2 | |||
| fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 8 | |||
| fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 8 | |||
| %endif | |||
| INIT_MMX mmxext | |||
| fpel_fn avg, 8, strideq, strideq*2, stride3q, 4, 16 | |||
| INIT_XMM sse2 | |||
| fpel_fn avg, 16, strideq, strideq*2, stride3q, 4, 16 | |||
| fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2, 16 | |||
| fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1, 16 | |||
| fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16, 8 | |||
| %if HAVE_AVX2_EXTERNAL | |||
| INIT_YMM avx2 | |||
| fpel_fn avg, 32, strideq, strideq*2, stride3q, 4, 16 | |||
| fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2, 16 | |||
| fpel_fn avg, 128, mmsize, mmsize*2, mmsize*3, 1, 16 | |||
| %endif | |||
| %undef s16 | |||
| %undef d16 | |||