Only two functions that use xop multiply-accumulate instructions where the first operand is the same as the fourth actually took advantage of the macros. This further reduces differences with x264's x86inc.

Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>

tag: n2.8
@@ -25,6 +25,15 @@
SECTION .text | SECTION .text | ||||
; PMACSDQL dst, src1, src2, unused, unused
; Multiply-accumulate of packed signed dwords into qwords:
;   dst += src1 * src2
; Uses the native XOP pmacsdql when available; otherwise emulated with
; SSE4.1 pmuldq + paddq, which clobbers src1 (%2).
; NOTE(review): 5 parameters are declared but only the first 3 are used —
; presumably kept for call-site compatibility with the old generic
; multiply-accumulate macro; confirm against callers.
%macro PMACSDQL 5
%if cpuflag(xop)
    pmacsdql %1, %2, %3, %1
%else
    pmuldq   %2, %3
    paddq    %1, %2
%endif
%endmacro
%macro LPC_32 1 | %macro LPC_32 1 | ||||
INIT_XMM %1 | INIT_XMM %1 | ||||
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j | cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j | ||||
@@ -1427,6 +1427,22 @@ AVX_INSTR pfmul, 3dnow, 1, 0, 1
%undef i | %undef i | ||||
%undef j | %undef j | ||||
; FMA_INSTR name, mul_insn, add_insn
; Defines a 4-operand multiply-accumulate pseudo-instruction:
;   name dst, src1, src2, acc
; When the XOP flag is set, this emits the native instruction v<name>;
; otherwise it is emulated as mul_insn + add_insn, accumulating into dst.
; The emulation therefore requires dst (%1) and acc (%4) to be the same
; register — per the commit intent, only call sites that satisfy this
; use these macros.
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %else
            %6 %1, %2, %3
            %7 %1, %4
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsww,  pmullw,  paddw
FMA_INSTR  pmacsdd,  pmulld,  paddd ; sse4 emulation
FMA_INSTR  pmacsdql, pmuldq,  paddq ; sse4 emulation
FMA_INSTR  pmadcswd, pmaddwd, paddd
; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. | ; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. | ||||
; This lets us use tzcnt without bumping the yasm version requirement yet. | ; This lets us use tzcnt without bumping the yasm version requirement yet. | ||||
%define tzcnt rep bsf | %define tzcnt rep bsf | ||||
@@ -765,25 +765,6 @@
%endif | %endif | ||||
%endmacro | %endmacro | ||||
; PMA_EMU name, xop_insn, mul_insn, add_insn
; Defines a multiply-accumulate pseudo-instruction:
;   NAME dst, src1, src2, acc, tmp
; When the XOP flag is set, this emits the native xop_insn; otherwise it
; is emulated with mul_insn + add_insn.  If dst (%1) and acc (%4) are the
; same register, the product is computed into tmp (%5) first so the
; accumulator is not clobbered before the add.
%macro PMA_EMU 4
    %macro %1 5-8 %2, %3, %4
        %if cpuflag(xop)
            v%6 %1, %2, %3, %4
        %elifidn %1, %4
            %7 %5, %2, %3
            %8 %1, %4, %5
        %else
            %7 %1, %2, %3
            %8 %1, %4
        %endif
    %endmacro
%endmacro

PMA_EMU  PMACSWW,  pmacsww,  pmullw,  paddw
PMA_EMU  PMACSDD,  pmacsdd,  pmulld,  paddd ; sse4 emulation
PMA_EMU  PMACSDQL, pmacsdql, pmuldq,  paddq ; sse4 emulation
PMA_EMU  PMADCSWD, pmadcswd, pmaddwd, paddd
; Wrapper for non-FMA version of fmaddps | ; Wrapper for non-FMA version of fmaddps | ||||
%macro FMULADD_PS 5 | %macro FMULADD_PS 5 | ||||
%if cpuflag(fma3) || cpuflag(fma4) | %if cpuflag(fma3) || cpuflag(fma4) | ||||
@@ -176,7 +176,12 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
.inner_loop: | .inner_loop: | ||||
movu m1, [srcq+min_filter_count_x4q*1] | movu m1, [srcq+min_filter_count_x4q*1] | ||||
%ifidn %1, int16 | %ifidn %1, int16 | ||||
PMADCSWD m0, m1, [filterq+min_filter_count_x4q*1], m0, m1 | |||||
%if cpuflag(xop) | |||||
vpmadcswd m0, m1, [filterq+min_filter_count_x4q*1], m0 | |||||
%else | |||||
pmaddwd m1, [filterq+min_filter_count_x4q*1] | |||||
paddd m0, m1 | |||||
%endif | |||||
%else ; float/double | %else ; float/double | ||||
%if cpuflag(fma4) || cpuflag(fma3) | %if cpuflag(fma4) || cpuflag(fma3) | ||||
fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0 | fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0 | ||||