Kaby Lake Pentium:
- ff_h264_idct_add_8_sse2: ~1.18x faster than mmxext
- ff_h264_idct_dc_add_8_sse2: ~1.07x faster than mmxext

tags/n3.4
| @@ -1140,8 +1140,6 @@ IDCT_DC_DEQUANT 0 | |||||
| INIT_MMX sse2 | INIT_MMX sse2 | ||||
| IDCT_DC_DEQUANT 7 | IDCT_DC_DEQUANT 7 | ||||
| INIT_XMM avx | |||||
| ; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet | ; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet | ||||
| %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride | %macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride | ||||
| movd %3, [%7] | movd %3, [%7] | ||||
| @@ -1170,6 +1168,10 @@ INIT_XMM avx | |||||
| packuswb m1, m1 | packuswb m1, m1 | ||||
| %endmacro | %endmacro | ||||
| %macro IDCT_XMM 1 | |||||
| INIT_XMM %1 | |||||
| cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ | cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ | ||||
| movsxdifnidn stride_q, stride_d | movsxdifnidn stride_q, stride_d | ||||
| IDCT4_ADD dst_q, block_q, stride_q | IDCT4_ADD dst_q, block_q, stride_q | ||||
| @@ -1182,3 +1184,8 @@ cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_ | |||||
| DC_ADD_INIT r3 | DC_ADD_INIT r3 | ||||
| DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3 | DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3 | ||||
| RET | RET | ||||
| %endmacro | |||||
| IDCT_XMM sse2 | |||||
| IDCT_XMM avx | |||||
| @@ -32,9 +32,11 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \ | |||||
| int stride); | int stride); | ||||
| IDCT_ADD_FUNC(, 8, mmx) | IDCT_ADD_FUNC(, 8, mmx) | ||||
| IDCT_ADD_FUNC(, 8, sse2) | |||||
| IDCT_ADD_FUNC(, 8, avx) | IDCT_ADD_FUNC(, 8, avx) | ||||
| IDCT_ADD_FUNC(, 10, sse2) | IDCT_ADD_FUNC(, 10, sse2) | ||||
| IDCT_ADD_FUNC(_dc, 8, mmxext) | IDCT_ADD_FUNC(_dc, 8, mmxext) | ||||
| IDCT_ADD_FUNC(_dc, 8, sse2) | |||||
| IDCT_ADD_FUNC(_dc, 8, avx) | IDCT_ADD_FUNC(_dc, 8, avx) | ||||
| IDCT_ADD_FUNC(_dc, 10, mmxext) | IDCT_ADD_FUNC(_dc, 10, mmxext) | ||||
| IDCT_ADD_FUNC(8_dc, 8, mmxext) | IDCT_ADD_FUNC(8_dc, 8, mmxext) | ||||
| @@ -316,6 +318,9 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, | |||||
| c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_sse2; | c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_sse2; | ||||
| c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2; | c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_sse2; | ||||
| } | } | ||||
| c->h264_idct_add = ff_h264_idct_add_8_sse2; | |||||
| c->h264_idct_dc_add = ff_h264_idct_dc_add_8_sse2; | |||||
| } | } | ||||
| if (EXTERNAL_SSSE3(cpu_flags)) { | if (EXTERNAL_SSSE3(cpu_flags)) { | ||||
| c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; | c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; | ||||