Only two functions that use xop multiply-accumulate instructions where the first operand is the same as the fourth actually took advantage of the macros. This further reduces differences with x264's x86inc.
Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
tags/n2.8
| @@ -25,6 +25,15 @@ | |||||
| SECTION .text | SECTION .text | ||||
| %macro PMACSDQL 5 | |||||
| %if cpuflag(xop) | |||||
| pmacsdql %1, %2, %3, %1 | |||||
| %else | |||||
| pmuldq %2, %3 | |||||
| paddq %1, %2 | |||||
| %endif | |||||
| %endmacro | |||||
| %macro LPC_32 1 | %macro LPC_32 1 | ||||
| INIT_XMM %1 | INIT_XMM %1 | ||||
| cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j | cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j | ||||
| @@ -1427,6 +1427,22 @@ AVX_INSTR pfmul, 3dnow, 1, 0, 1 | |||||
| %undef i | %undef i | ||||
| %undef j | %undef j | ||||
| %macro FMA_INSTR 3 | |||||
| %macro %1 4-7 %1, %2, %3 | |||||
| %if cpuflag(xop) | |||||
| v%5 %1, %2, %3, %4 | |||||
| %else | |||||
| %6 %1, %2, %3 | |||||
| %7 %1, %4 | |||||
| %endif | |||||
| %endmacro | |||||
| %endmacro | |||||
| FMA_INSTR pmacsww, pmullw, paddw | |||||
| FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation | |||||
| FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation | |||||
| FMA_INSTR pmadcswd, pmaddwd, paddd | |||||
| ; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. | ; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf. | ||||
| ; This lets us use tzcnt without bumping the yasm version requirement yet. | ; This lets us use tzcnt without bumping the yasm version requirement yet. | ||||
| %define tzcnt rep bsf | %define tzcnt rep bsf | ||||
| @@ -765,25 +765,6 @@ | |||||
| %endif | %endif | ||||
| %endmacro | %endmacro | ||||
| %macro PMA_EMU 4 | |||||
| %macro %1 5-8 %2, %3, %4 | |||||
| %if cpuflag(xop) | |||||
| v%6 %1, %2, %3, %4 | |||||
| %elifidn %1, %4 | |||||
| %7 %5, %2, %3 | |||||
| %8 %1, %4, %5 | |||||
| %else | |||||
| %7 %1, %2, %3 | |||||
| %8 %1, %4 | |||||
| %endif | |||||
| %endmacro | |||||
| %endmacro | |||||
| PMA_EMU PMACSWW, pmacsww, pmullw, paddw | |||||
| PMA_EMU PMACSDD, pmacsdd, pmulld, paddd ; sse4 emulation | |||||
| PMA_EMU PMACSDQL, pmacsdql, pmuldq, paddq ; sse4 emulation | |||||
| PMA_EMU PMADCSWD, pmadcswd, pmaddwd, paddd | |||||
| ; Wrapper for non-FMA version of fmaddps | ; Wrapper for non-FMA version of fmaddps | ||||
| %macro FMULADD_PS 5 | %macro FMULADD_PS 5 | ||||
| %if cpuflag(fma3) || cpuflag(fma4) | %if cpuflag(fma3) || cpuflag(fma4) | ||||
| @@ -176,7 +176,12 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \ | |||||
| .inner_loop: | .inner_loop: | ||||
| movu m1, [srcq+min_filter_count_x4q*1] | movu m1, [srcq+min_filter_count_x4q*1] | ||||
| %ifidn %1, int16 | %ifidn %1, int16 | ||||
| PMADCSWD m0, m1, [filterq+min_filter_count_x4q*1], m0, m1 | |||||
| %if cpuflag(xop) | |||||
| vpmadcswd m0, m1, [filterq+min_filter_count_x4q*1], m0 | |||||
| %else | |||||
| pmaddwd m1, [filterq+min_filter_count_x4q*1] | |||||
| paddd m0, m1 | |||||
| %endif | |||||
| %else ; float/double | %else ; float/double | ||||
| %if cpuflag(fma4) || cpuflag(fma3) | %if cpuflag(fma4) || cpuflag(fma3) | ||||
| fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0 | fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0 | ||||