Haswell:
 - 1.11x faster (522±0.4 vs. 469±1.8 decicycles) compared with mmxext

Skylake-U:
 - 1.21x faster (671±5.5 vs. 555±1.4 decicycles) compared with mmxext

tags/n3.4
@@ -65,7 +65,15 @@ SECTION .text | |||||
IDCT4_1D w, 0, 1, 2, 3, 4, 5 | IDCT4_1D w, 0, 1, 2, 3, 4, 5 | ||||
mova m6, [pw_32] | mova m6, [pw_32] | ||||
TRANSPOSE4x4W 0, 1, 2, 3, 4 | |||||
%if mmsize == 8 | |||||
TRANSPOSE4x4W 0, 1, 2, 3, 4 | |||||
%else | |||||
punpcklwd m0, m1 | |||||
punpcklwd m2, m3 | |||||
SBUTTERFLY dq, 0, 2, 4 | |||||
MOVHL m1, m0 | |||||
MOVHL m3, m2 | |||||
%endif | |||||
paddw m0, m6 | paddw m0, m6 | ||||
IDCT4_1D w, 0, 1, 2, 3, 4, 5 | IDCT4_1D w, 0, 1, 2, 3, 4, 5 | ||||
pxor m7, m7 | pxor m7, m7 | ||||
@@ -1131,3 +1139,26 @@ INIT_MMX mmx | |||||
IDCT_DC_DEQUANT 0 | IDCT_DC_DEQUANT 0 | ||||
INIT_MMX sse2 | INIT_MMX sse2 | ||||
IDCT_DC_DEQUANT 7 | IDCT_DC_DEQUANT 7 | ||||
INIT_XMM avx | |||||
; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet | |||||
; STORE_DIFFx2: add two rows of 16-bit values to two rows of 8-bit values in
; memory and write the result back in place.
;   %1, %2  add1/add2 - word vectors to add; shifted right in place (clobbered)
;   %3, %4  reg1/reg2 - scratch registers; receive the loaded rows (clobbered)
;   %5      zero      - used as the high half when widening and as the second
;                       pack operand; must hold all zeros (per its name)
;   %6      shift     - arithmetic right-shift count applied to %1 and %2
;   %7      source    - address of the first row; the second row is at %7+%8
;   %8      stride    - distance between the two rows
; NOTE(review): per the comment above this macro, it is meant to replace the
; STORE_DIFFx2 from x86util.asm for the code that follows - confirm that the
; redefinition takes effect as intended with the assembler in use.
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride | |||||
; Load 32 bits from each of the two rows.
movd %3, [%7] | |||||
movd %4, [%7+%8] | |||||
; Scale both word inputs down by the shift count (signed shift).
psraw %1, %6 | |||||
psraw %2, %6 | |||||
; Widen the loaded bytes to words by interleaving them with %5.
punpcklbw %3, %5 | |||||
punpcklbw %4, %5 | |||||
; Word-wise add of the shifted inputs onto the widened rows.
paddw %3, %1 | |||||
paddw %4, %2 | |||||
; Narrow back to bytes with unsigned saturation.
packuswb %3, %5 | |||||
packuswb %4, %5 | |||||
; Store 32 bits back to each row.
movd [%7], %3 | |||||
movd [%7+%8], %4 | |||||
%endmacro | |||||
; h264_idct_add_8: entry point declared via cglobal with three named
; arguments (dst_, block_, stride_). It is assembled under the preceding
; INIT_XMM avx, and the C side refers to it as ff_h264_idct_add_8_avx.
; NOTE(review): the numeric cglobal fields (3, 3, 8) are presumably
; args / GPRs / vector registers per the x86inc convention - confirm.
; The whole body is a single IDCT4_ADD expansion, defined outside this view.
cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_ | |||||
; The C prototype passes stride as int; presumably this x86inc helper
; sign-extends it to pointer width where the two sizes differ - confirm.
movsxdifnidn stride_q, stride_d | |||||
IDCT4_ADD dst_q, block_q, stride_q | |||||
RET |
@@ -32,6 +32,7 @@ void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \ | |||||
int stride); | int stride); | ||||
IDCT_ADD_FUNC(, 8, mmx) | IDCT_ADD_FUNC(, 8, mmx) | ||||
IDCT_ADD_FUNC(, 8, avx) | |||||
IDCT_ADD_FUNC(, 10, sse2) | IDCT_ADD_FUNC(, 10, sse2) | ||||
IDCT_ADD_FUNC(_dc, 8, mmxext) | IDCT_ADD_FUNC(_dc, 8, mmxext) | ||||
IDCT_ADD_FUNC(_dc, 10, mmxext) | IDCT_ADD_FUNC(_dc, 10, mmxext) | ||||
@@ -337,6 +338,8 @@ av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, | |||||
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx; | c->h264_h_loop_filter_chroma = ff_deblock_h_chroma422_8_avx; | ||||
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx; | c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma422_intra_8_avx; | ||||
} | } | ||||
c->h264_idct_add = ff_h264_idct_add_8_avx; | |||||
} | } | ||||
} else if (bit_depth == 10) { | } else if (bit_depth == 10) { | ||||
if (EXTERNAL_MMXEXT(cpu_flags)) { | if (EXTERNAL_MMXEXT(cpu_flags)) { | ||||