opus_pvq_search: Restore the proper use of conditional define and simplify the function name suffix handling.

Using named define properly documents the code paths. It also avoids passing additional numbered arguments through multiple levels of macro templates. The suffix handling is done by concatenation, like in other asm functions and avoid having two separate "cglobal" defines. Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com>
7 years ago · 43dab86bcd
--- a/libavcodec/x86/opus_pvq_search.asm
+++ b/libavcodec/x86/opus_pvq_search.asm
@@ -82,7 +82,7 @@ SECTION .text
 %endif
 %endmacro

 %macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation
 %macro PULSES_SEARCH 1
 ; m6 Syy_norm
 ; m7 Sxy_norm
    addps          m6, mm_const_float_0_5   ; Syy_norm += 1.0/2
@@ -96,17 +96,17 @@ align 16
    movaps         m4, [tmpY + r4]  ; y[i]
    movaps         m5, [tmpX + r4]  ; X[i]

 %if %2
  %if USE_APPROXIMATION == 1
    xorps          m0, m0
    cmpps          m0, m0, m5, 4    ; m0 = (X[i] != 0.0)
 %endif
  %endif

    addps          m4, m6           ; m4 = Syy_new = y[i] + Syy_norm
    addps          m5, m7           ; m5 = Sxy_new = X[i] + Sxy_norm

 %if %2
  %if USE_APPROXIMATION == 1
    andps          m5, m0           ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
 %endif
  %endif

 %else
    movaps         m5, [tmpY + r4]      ; m5 = y[i]
@@ -119,7 +119,7 @@ align 16
    andps          m5, m0               ; (0<y)?m5:0
 %endif

 %if %2
 %if USE_APPROXIMATION == 1
    rsqrtps        m4, m4
    mulps          m5, m4           ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
 %else
@@ -211,13 +211,8 @@ align 16
 ; uint32 K      - Number of pulses to have after quantizations.
 ; uint32 N      - Number of vector elements. Must be 0 < N < 256
 ;
 %macro PVQ_FAST_SEARCH 1 ; %1 - use approximation
 %if %1
 cglobal pvq_search_approx, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
 %else
 cglobal pvq_search_exact,  4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
 %endif

 %macro PVQ_FAST_SEARCH 1
 cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
 %define tmpX rsp
 %define tmpY outYq

@@ -260,7 +255,7 @@ align 16
    jz   %%zero_input       ; if (Sx==0) goto zero_input

    cvtsi2ss  xm0, dword Kd ; m0 = K
 %if %1
 %if USE_APPROXIMATION == 1
    rcpss     xm1, xm1      ; m1 = approx(1/Sx)
    mulss     xm0, xm1      ; m0 = K*(1/Sx)
 %else
@@ -313,7 +308,7 @@ align 16
 align 16                        ; K - pulses > 0
 %%add_pulses_loop:

    PULSES_SEARCH add, %1   ; m6 Syy_norm ; m7 Sxy_norm
    PULSES_SEARCH add   ; m6 Syy_norm ; m7 Sxy_norm

    sub        Kd, 1
    jnz  %%add_pulses_loop
@@ -325,7 +320,7 @@ align 16                        ; K - pulses > 0
 align 16
 %%remove_pulses_loop:

    PULSES_SEARCH sub, %1   ; m6 Syy_norm ; m7 Sxy_norm
    PULSES_SEARCH sub   ; m6 Syy_norm ; m7 Sxy_norm

    add        Kd, 1
    jnz  %%remove_pulses_loop
@@ -376,11 +371,15 @@ align 16
 ; On Skylake & Ryzen the division is much faster (around 11c/3),
 ; that makes the full precision code about 2% slower.
 ; Opus also does use rsqrt approximation in their intrinsics code.
 %define USE_APPROXIMATION   1

 INIT_XMM sse2
 PVQ_FAST_SEARCH 1
 PVQ_FAST_SEARCH _approx

 INIT_XMM sse4
 PVQ_FAST_SEARCH 1
 PVQ_FAST_SEARCH _approx

 %define USE_APPROXIMATION   0

 INIT_XMM avx
 PVQ_FAST_SEARCH 0
 PVQ_FAST_SEARCH _exact