@@ -85,6 +85,8 @@ DECLARE_ALIGNED(32, const ymm_reg, ff_pd_16) = { 0x0000001000000010ULL, 0x000
                                                  0x0000001000000010ULL, 0x0000001000000010ULL };
 DECLARE_ALIGNED(32, const ymm_reg, ff_pd_32) = { 0x0000002000000020ULL, 0x0000002000000020ULL,
                                                  0x0000002000000020ULL, 0x0000002000000020ULL };
+DECLARE_ALIGNED(32, const ymm_reg, ff_pd_8192) = { 0x0000200000002000ULL, 0x0000200000002000ULL,
+                                                   0x0000200000002000ULL, 0x0000200000002000ULL };
 DECLARE_ALIGNED(32, const ymm_reg, ff_pd_65535)= { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
                                                    0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL };
@@ -65,6 +65,7 @@ extern const xmm_reg ff_ps_neg;
 extern const ymm_reg ff_pd_1;
 extern const ymm_reg ff_pd_16;
 extern const ymm_reg ff_pd_32;
+extern const ymm_reg ff_pd_8192;
 extern const ymm_reg ff_pd_65535;
 
 # if ARCH_X86_64
@@ -125,6 +125,10 @@ lpf_mix2_wrappers_set(BPC, avx);
 decl_ipred_fns(tm, BPC, mmxext, sse2);
 decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+decl_itxfm_func(idct, idct, 4, BPC, mmxext);
+decl_itxfm_func(idct, idct, 4, BPC, ssse3);
+#endif
 
 #endif /* HAVE_YASM */
 
 av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
@@ -170,6 +174,9 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
         init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
         if (!bitexact) {
             init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext);
+#if BPC == 10
+            init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext);
+#endif
         }
     }
@@ -182,6 +189,11 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
     if (EXTERNAL_SSSE3(cpu_flags)) {
         init_lpf_funcs(BPC, ssse3);
+#if BPC == 10
+        if (!bitexact) {
+            init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, ssse3);
+        }
+#endif
     }
 
     if (EXTERNAL_AVX(cpu_flags)) {
@@ -71,8 +71,6 @@ pw_13377x2: times 8 dw 13377*2
 pw_m13377_13377: times 4 dw -13377, 13377
 pw_13377_0: times 4 dw 13377, 0
 
-pd_8192: times 4 dd 8192
-
 cextern pw_8
 cextern pw_16
 cextern pw_32
@@ -80,38 +78,10 @@ cextern pw_512
 cextern pw_1024
 cextern pw_2048
 cextern pw_m1
-cextern pd_8192
 
 SECTION .text
 
-; (a*x + b*y + round) >> shift
-%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
-    pmaddwd             m%1, m%2, %4
-    pmaddwd             m%2,  %5
-    paddd               m%1,  %3
-    paddd               m%2,  %3
-    psrad               m%1, 14
-    psrad               m%2, 14
-%endmacro
-
-%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
-    VP9_MULSUB_2W_2X     %7,  %6,  %5, [pw_m%3_%4], [pw_%4_%3]
-    VP9_MULSUB_2W_2X     %1,  %2,  %5, [pw_m%3_%4], [pw_%4_%3]
-    packssdw            m%1, m%7
-    packssdw            m%2, m%6
-%endmacro
-
-%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
-%if %0 == 7
-    punpckhwd           m%6, m%2, m%1
-    punpcklwd           m%2, m%1
-    VP9_MULSUB_2W_4X     %1, %2, %3, %4, %5, %6, %7
-%else
-    punpckhwd           m%8, m%4, m%3
-    punpcklwd           m%2, m%4, m%3
-    VP9_MULSUB_2W_4X     %1, %2, %5, %6, %7, %8, %9
-%endif
-%endmacro
-
 %macro VP9_UNPACK_MULSUB_2D_4X 6 ; dst1 [src1], dst2 [src2], dst3, dst4, mul1, mul2
     punpckhwd           m%4, m%2, m%1
     punpcklwd           m%2, m%1
@@ -191,24 +161,6 @@ cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
 ; void vp9_idct_idct_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
 ;-------------------------------------------------------------------------------------------
 
-%macro VP9_IDCT4_1D_FINALIZE 0
-    SUMSUB_BA            w,  3,  2, 4                   ; m3=t3+t0, m2=-t3+t0
-    SUMSUB_BA            w,  1,  0, 4                   ; m1=t2+t1, m0=-t2+t1
-    SWAP                 0,  3,  2                      ; 3102 -> 0123
-%endmacro
-
-%macro VP9_IDCT4_1D 0
-%if cpuflag(ssse3)
-    SUMSUB_BA            w,  2,  0, 4                   ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
-    pmulhrsw            m2, m6                          ; m2=t0
-    pmulhrsw            m0, m6                          ; m0=t1
-%else ; <= sse2
-    VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m2=t0
-%endif
-    VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137,  6270, m7, 4, 5 ; m1=t2, m3=t3
-    VP9_IDCT4_1D_FINALIZE
-%endmacro
-
 ; 2x2 top left corner
 %macro VP9_IDCT4_2x2_1D 0
     pmulhrsw            m0, m5                          ; m0=t1
@@ -25,8 +25,18 @@
 SECTION_RODATA
 
+cextern pw_8
 cextern pw_1023
+cextern pw_2048
 cextern pw_4095
+cextern pd_8192
+
+; FIXME these should probably be shared between 8bpp and 10/12bpp
+pw_m11585_11585: times 4 dw -11585, 11585
+pw_11585_11585: times 8 dw 11585
+pw_m15137_6270: times 4 dw -15137, 6270
+pw_6270_15137: times 4 dw 6270, 15137
+pw_11585x2: times 8 dw 11585*2
 
 SECTION .text
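
For reference, the pw_* words added above appear to be the standard 14-bit fixed-point DCT cosines from libvpx (cospi_16_64, cospi_8_64 and cospi_24_64), i.e. c_k = round(2^14 * cos(k*pi/64)): c_16 = 11585, c_8 = 15137, c_24 = 6270. pw_11585x2 holds 2*c_16 because pmulhrsw shifts right by 15 rather than 14.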
@@ -118,3 +128,89 @@ INIT_MMX mmxext
 IWHT4_FN 10, 1023
 INIT_MMX mmxext
 IWHT4_FN 12, 4095
+
+; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
+; in 15+1 words without additional effort, since the coefficients are 15bpp.
+%macro IDCT4_10_FN 0
+cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
+    cmp eobd, 1
+    jg .idctfull
+
+    ; dc-only
+%if cpuflag(ssse3)
+    movd                m0, [blockq]
+    mova                m5, [pw_11585x2]
+    pmulhrsw            m0, m5
+    pmulhrsw            m0, m5
+%else
+    DEFINE_ARGS dst, stride, block, coef
+    mov              coefd, dword [blockq]
+    imul             coefd, 11585
+    add              coefd, 8192
+    sar              coefd, 14
+    imul             coefd, 11585
+    add              coefd, (8 << 14) + 8192
+    sar              coefd, 14 + 4
+    movd                m0, coefd
+%endif
+    pshufw              m0, m0, 0
+    pxor                m4, m4
+    mova                m5, [pw_1023]
+    movh          [blockq], m4
+%if cpuflag(ssse3)
+    pmulhrsw            m0, [pw_2048]               ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
+%endif
+    VP9_STORE_2X         0,  0,  6,  7,  4,  5
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         0,  0,  6,  7,  4,  5
+    RET
+
+.idctfull:
+    mova                m0, [blockq+0*16+0]
+    mova                m1, [blockq+1*16+0]
+    packssdw            m0, [blockq+0*16+8]
+    packssdw            m1, [blockq+1*16+8]
+    mova                m2, [blockq+2*16+0]
+    mova                m3, [blockq+3*16+0]
+    packssdw            m2, [blockq+2*16+8]
+    packssdw            m3, [blockq+3*16+8]
+%if cpuflag(ssse3)
+    mova                m6, [pw_11585x2]
+%endif
+    mova                m7, [pd_8192]               ; rounding
+    VP9_IDCT4_1D
+    TRANSPOSE4x4W        0,  1,  2,  3,  4
+    VP9_IDCT4_1D
+
+    pxor                m4, m4
+    ZERO_BLOCK      blockq, 16, 4, m4
+%if cpuflag(ssse3)
+    mova                m5, [pw_2048]
+    pmulhrsw            m0, m5
+    pmulhrsw            m1, m5
+    pmulhrsw            m2, m5
+    pmulhrsw            m3, m5
+%else
+    mova                m5, [pw_8]
+    paddw               m0, m5
+    paddw               m1, m5
+    paddw               m2, m5
+    paddw               m3, m5
+    psraw               m0, 4
+    psraw               m1, 4
+    psraw               m2, 4
+    psraw               m3, 4
+%endif
+    mova                m5, [pw_1023]
+    VP9_STORE_2X         0,  1,  6,  7,  4,  5
+    lea               dstq, [dstq+2*strideq]
+    VP9_STORE_2X         2,  3,  6,  7,  4,  5
+    RET
+%endmacro
+
+INIT_MMX mmxext
+IDCT4_10_FN
+INIT_MMX ssse3
+IDCT4_10_FN
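
The dc-only shortcut above is just two fixed-point multiplies plus a final rounding shift. A minimal scalar C model of the mmxext branch (function name mine, for illustration; the asm merges the second "+8192 >> 14" with the "(x+8) >> 4" into a single add/sar pair, and the ssse3 branch is bit-identical because (x*23170 + 2^14) >> 15 == (x*11585 + 8192) >> 14 and (x*2048 + 2^14) >> 15 == (x+8) >> 4):

    static int idct4_dc_residual(int coef)
    {
        int t = (coef * 11585 + 8192) >> 14;   /* row pass: dc * cos(pi/4) in Q14 */
        t     = (t    * 11585 + 8192) >> 14;   /* column pass                     */
        return (t + 8) >> 4;                   /* final idct output rounding      */
    }

Every pixel of the 4x4 block receives this same residual, clipped against pw_1023 in VP9_STORE_2X. The header comment's bound also checks out: 4x4 coefficients need at most 5 + 10 + 1 = 16 bits at 10bpp, so the packssdw to words in .idctfull loses nothing.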
@@ -35,3 +35,50 @@
     paddw               m3, m2
     SWAP                 3,  2,  1
 %endmacro
+
+; (a*x + b*y + round) >> shift
+%macro VP9_MULSUB_2W_2X 5 ; dst1, dst2/src, round, coefs1, coefs2
+    pmaddwd             m%1, m%2, %4
+    pmaddwd             m%2,  %5
+    paddd               m%1,  %3
+    paddd               m%2,  %3
+    psrad               m%1, 14
+    psrad               m%2, 14
+%endmacro
+
+%macro VP9_MULSUB_2W_4X 7 ; dst1, dst2, coef1, coef2, rnd, tmp1/src, tmp2
+    VP9_MULSUB_2W_2X     %7,  %6,  %5, [pw_m%3_%4], [pw_%4_%3]
+    VP9_MULSUB_2W_2X     %1,  %2,  %5, [pw_m%3_%4], [pw_%4_%3]
+    packssdw            m%1, m%7
+    packssdw            m%2, m%6
+%endmacro
+
+%macro VP9_UNPACK_MULSUB_2W_4X 7-9 ; dst1, dst2, (src1, src2,) coef1, coef2, rnd, tmp1, tmp2
+%if %0 == 7
+    punpckhwd           m%6, m%2, m%1
+    punpcklwd           m%2, m%1
+    VP9_MULSUB_2W_4X     %1, %2, %3, %4, %5, %6, %7
+%else
+    punpckhwd           m%8, m%4, m%3
+    punpcklwd           m%2, m%4, m%3
+    VP9_MULSUB_2W_4X     %1, %2, %5, %6, %7, %8, %9
+%endif
+%endmacro
+
+%macro VP9_IDCT4_1D_FINALIZE 0
+    SUMSUB_BA            w,  3,  2, 4                   ; m3=t3+t0, m2=-t3+t0
+    SUMSUB_BA            w,  1,  0, 4                   ; m1=t2+t1, m0=-t2+t1
+    SWAP                 0,  3,  2                      ; 3102 -> 0123
+%endmacro
+
+%macro VP9_IDCT4_1D 0
+%if cpuflag(ssse3)
+    SUMSUB_BA            w,  2,  0, 4                   ; m2=IN(0)+IN(2) m0=IN(0)-IN(2)
+    pmulhrsw            m2, m6                          ; m2=t0
+    pmulhrsw            m0, m6                          ; m0=t1
+%else ; <= sse2
+    VP9_UNPACK_MULSUB_2W_4X 0, 2, 11585, 11585, m7, 4, 5 ; m0=t1, m2=t0
+%endif
+    VP9_UNPACK_MULSUB_2W_4X 1, 3, 15137,  6270, m7, 4, 5 ; m1=t2, m3=t3
+    VP9_IDCT4_1D_FINALIZE
+%endmacro
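
The pmaddwd idiom in VP9_MULSUB_2W_2X computes one dot product per dword lane. Per pair of input words a (from src1) and b (from src2), "VP9_UNPACK_MULSUB_2W_4X dst1, dst2, coef1, coef2" behaves like this scalar C sketch (names mine; the real macro processes four lanes at once and saturates via packssdw):

    static int16_t clip16(int v)   /* packssdw-style signed saturation */
    {
        return v < -32768 ? -32768 : v > 32767 ? 32767 : (int16_t)v;
    }

    static void mulsub_2w(int16_t a, int16_t b, int coef1, int coef2,
                          int16_t *dst1, int16_t *dst2)
    {
        *dst1 = clip16((a * coef2 - b * coef1 + 8192) >> 14);  /* e.g. t2 */
        *dst2 = clip16((a * coef1 + b * coef2 + 8192) >> 14);  /* e.g. t3 */
    }

With coef1 == coef2 == 11585 this degenerates to the scaled sum/difference that the ssse3 branch of VP9_IDCT4_1D obtains from SUMSUB_BA plus pmulhrsw against pw_11585x2.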