Browse Source

x86: move XOP emulation code back to x86inc

Only two functions that use xop multiply-accumulate instructions where the
first operand is the same as the fourth actually took advantage of the macros.

This further reduces differences with x264's x86inc.

Reviewed-by: Ronald S. Bultje <rsbultje@gmail.com>
Signed-off-by: James Almer <jamrial@gmail.com>
tags/n2.8
James Almer 9 years ago
parent
commit
5750d6c5e9
4 changed files with 31 additions and 20 deletions
  1. +9
    -0
      libavcodec/x86/flacdsp.asm
  2. +16
    -0
      libavutil/x86/x86inc.asm
  3. +0
    -19
      libavutil/x86/x86util.asm
  4. +6
    -1
      libswresample/x86/resample.asm

+ 9
- 0
libavcodec/x86/flacdsp.asm View File

@@ -25,6 +25,15 @@

SECTION .text

; PMACSDQL acc, a, b, x, y
; Multiply-accumulate into the first operand.
;   XOP:     a single "pmacsdql acc, a, b, acc".
;   no XOP:  "pmuldq a, b" followed by "paddq acc, a".
; Only the first three operands are referenced; the macro accepts five
; arguments but the fourth and fifth are ignored in both paths.
; NOTE: the non-XOP path overwrites the second operand with the product.
%macro PMACSDQL 5
%if cpuflag(xop)
pmacsdql %1, %2, %3, %1
%else
pmuldq %2, %3
paddq %1, %2
%endif
%endmacro

%macro LPC_32 1
INIT_XMM %1
cglobal flac_lpc_32, 5,6,5, decoded, coeffs, pred_order, qlevel, len, j


+ 16
- 0
libavutil/x86/x86inc.asm View File

@@ -1427,6 +1427,22 @@ AVX_INSTR pfmul, 3dnow, 1, 0, 1
%undef i
%undef j

; FMA_INSTR xop_mnemonic, mul_mnemonic, add_mnemonic
; Defines a macro named after the XOP instruction that takes four operands:
; dst, src1, src2, acc. The three mnemonics are carried into the generated
; macro as default values of its 5th-7th parameters.
;   XOP:     emits the real instruction, "v<xop_mnemonic> dst, src1, src2, acc".
;   no XOP:  emulates it as "<mul> dst, src1, src2" then "<add> dst, acc".
; The emulation stores the product in dst before acc is read, so it cannot
; produce the right result when dst and acc are the same operand. That case
; is rejected with an assembly-time error instead of silently emitting code
; that adds the product to itself.
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

; Instantiate the XOP integer multiply-accumulate wrappers.
; Arguments: XOP instruction, multiply used when emulating, add used when emulating.
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation (non-XOP path uses pmulld)
FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation (non-XOP path uses pmuldq)
FMA_INSTR pmadcswd, pmaddwd, paddd

; tzcnt is encoded as "rep bsf" and is backwards-compatible with bsf: CPUs
; without tzcnt support ignore the prefix and execute plain bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
; NOTE(review): the two instructions only agree for a non-zero source (bsf
; leaves the destination undefined for zero) -- confirm callers never rely on
; the zero-input result.
%define tzcnt rep bsf


+ 0
- 19
libavutil/x86/x86util.asm View File

@@ -765,25 +765,6 @@
%endif
%endmacro

; PMA_EMU NAME, xop_mnemonic, mul_mnemonic, add_mnemonic
; Defines a macro NAME taking five operands: dst, src1, src2, acc, tmp.
; The three mnemonics become the default values of NAME's 6th-8th parameters.
;   XOP:                 "v<xop_mnemonic> dst, src1, src2, acc" (tmp unused).
;   no XOP, dst == acc:  the product is written to tmp, then
;                        "<add> dst, acc, tmp", so the accumulator is not
;                        overwritten by the multiply.
;   no XOP, otherwise:   the product is written to dst, then "<add> dst, acc"
;                        (tmp unused).
%macro PMA_EMU 4
%macro %1 5-8 %2, %3, %4
%if cpuflag(xop)
v%6 %1, %2, %3, %4
%elifidn %1, %4
%7 %5, %2, %3
%8 %1, %4, %5
%else
%7 %1, %2, %3
%8 %1, %4
%endif
%endmacro
%endmacro

; Instantiate the upper-case emulation wrappers.
; Arguments: wrapper name, XOP instruction, multiply used when emulating,
; add used when emulating.
PMA_EMU PMACSWW, pmacsww, pmullw, paddw
PMA_EMU PMACSDD, pmacsdd, pmulld, paddd ; sse4 emulation
PMA_EMU PMACSDQL, pmacsdql, pmuldq, paddq ; sse4 emulation
PMA_EMU PMADCSWD, pmadcswd, pmaddwd, paddd

; Wrapper for non-FMA version of fmaddps
%macro FMULADD_PS 5
%if cpuflag(fma3) || cpuflag(fma4)


+ 6
- 1
libswresample/x86/resample.asm View File

@@ -176,7 +176,12 @@ cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
.inner_loop:
movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16
PMADCSWD m0, m1, [filterq+min_filter_count_x4q*1], m0, m1
%if cpuflag(xop)
vpmadcswd m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
pmaddwd m1, [filterq+min_filter_count_x4q*1]
paddd m0, m1
%endif
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0


Loading…
Cancel
Save