Browse Source

Add a few math functions to simd.hpp

tags/v1.0.0
Andrew Belt 5 years ago
parent
commit
ca6f050206
2 changed files with 143 additions and 12 deletions
  1. +77
    -6
      include/dsp/simd.hpp
  2. +66
    -6
      include/dsp/sse_mathfun.h

+ 77
- 6
include/dsp/simd.hpp View File

@@ -1,5 +1,5 @@
#include <emmintrin.h>
#include "sse_mathfun.h"
#include <emmintrin.h>


namespace rack {
@@ -36,6 +36,8 @@ typedef f32<4> f32_4;

// Operator overloads


/** `a operator b` */
#define DECLARE_F32_4_OPERATOR_INFIX(operator, func) \
inline f32_4 operator(const f32_4 &a, const f32_4 &b) { \
return f32_4(func(a.v, b.v)); \
@@ -49,6 +51,7 @@ typedef f32<4> f32_4;
return operator(a, f32_4(b)); \
}

/** `a operator b` */
#define DECLARE_F32_4_OPERATOR_INCREMENT(operator, func) \
inline f32_4 &operator(f32_4 &a, const f32_4 &b) { \
a.v = func(a.v, b.v); \
@@ -64,12 +67,56 @@ DECLARE_F32_4_OPERATOR_INFIX(operator-, _mm_sub_ps)
DECLARE_F32_4_OPERATOR_INFIX(operator*, _mm_mul_ps)
DECLARE_F32_4_OPERATOR_INFIX(operator/, _mm_div_ps)

/** `+a` (unary plus — identity) */
inline f32_4 operator+(const f32_4 &a) {
	return f32_4(a.v);
}

/** `-a` (unary negation) */
inline f32_4 operator-(const f32_4 &a) {
	// Negate by subtracting every lane from zero (same op as `0.f - a`).
	return f32_4(_mm_sub_ps(_mm_setzero_ps(), a.v));
}

// Compound-assignment operators: modify `a` in place lane-wise.
DECLARE_F32_4_OPERATOR_INCREMENT(operator+=, _mm_add_ps);
DECLARE_F32_4_OPERATOR_INCREMENT(operator-=, _mm_sub_ps);
DECLARE_F32_4_OPERATOR_INCREMENT(operator*=, _mm_mul_ps);
DECLARE_F32_4_OPERATOR_INCREMENT(operator/=, _mm_div_ps);

// TODO Perhaps return a future i32 type for these, or add casting between multiple simd types
/** `++a` — increments every lane by 1 and returns the updated vector. */
inline f32_4 &operator++(f32_4 &a) {
	a = a + 1.f;
	return a;
}

/** `--a` — decrements every lane by 1 and returns the updated vector. */
inline f32_4 &operator--(f32_4 &a) {
	a = a - 1.f;
	return a;
}

/** `a++` — increments every lane by 1, returning the pre-increment value. */
inline f32_4 operator++(f32_4 &a, int) {
	const f32_4 previous = a;
	a += 1.f;
	return previous;
}

/** `a--` — decrements every lane by 1, returning the pre-decrement value. */
inline f32_4 operator--(f32_4 &a, int) {
	const f32_4 previous = a;
	a -= 1.f;
	return previous;
}

// Bitwise operators on the raw float lane bits (useful for combining comparison masks).
DECLARE_F32_4_OPERATOR_INFIX(operator^, _mm_xor_ps)
DECLARE_F32_4_OPERATOR_INFIX(operator&, _mm_and_ps)
// BUG FIX: operator| previously mapped to _mm_mul_ps (multiplication); it must be bitwise OR.
DECLARE_F32_4_OPERATOR_INFIX(operator|, _mm_or_ps)

/** `~a` — flips every bit of every lane. */
inline f32_4 operator~(const f32_4 &a) {
	// BUG FIX: the all-ones mask was previously built with `_mm_cmpeq_ps(a.v, a.v)`,
	// which yields ZERO (not all-ones) in any lane holding a NaN bit pattern — and an
	// all-ones comparison mask is itself a NaN pattern, so `~true_mask` returned
	// `true_mask` unchanged. An integer constant has no NaN issue.
	__m128 all_ones = _mm_castsi128_ps(_mm_set1_epi32(-1));
	return f32_4(_mm_xor_ps(a.v, all_ones));
}

// Comparison operators. Per the SSE compare intrinsics, each lane yields an
// all-ones (0xffffffff) or all-zeros bit mask — not 1.f / 0.f.
DECLARE_F32_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps)
DECLARE_F32_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps)
DECLARE_F32_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps)
@@ -78,6 +125,10 @@ DECLARE_F32_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps)
DECLARE_F32_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps)



// Math functions


/** Lane-wise maximum of `x` and `b`. */
inline f32_4 fmax(f32_4 x, f32_4 b) {
	// NOTE(review): MAXPS returns the second operand when a lane compares unordered
	// (NaN), unlike std::fmax which prefers the non-NaN operand — confirm callers
	// don't depend on NaN handling.
	__m128 m = _mm_max_ps(x.v, b.v);
	return f32_4(m);
}
@@ -105,19 +156,39 @@ inline f32_4 rcp(f32_4 x) {
}

/** Lane-wise natural logarithm (via sse_mathfun). */
inline f32_4 log(f32_4 x) {
	// FIX: removed a stale, unreachable second `return` calling the pre-rename
	// `log_ps`, which no longer exists after the `sse_mathfun_` prefix rename.
	return f32_4(sse_mathfun_log_ps(x.v));
}

/** Lane-wise natural exponential (via sse_mathfun). */
inline f32_4 exp(f32_4 x) {
	// FIX: removed a stale, unreachable `return` calling the pre-rename `exp_ps`.
	return f32_4(sse_mathfun_exp_ps(x.v));
}

/** Lane-wise sine (via sse_mathfun). */
inline f32_4 sin(f32_4 x) {
	// FIX: removed a stale, unreachable `return` calling the pre-rename `sin_ps`.
	return f32_4(sse_mathfun_sin_ps(x.v));
}

/** Lane-wise cosine (via sse_mathfun). */
inline f32_4 cos(f32_4 x) {
	// FIX: removed a stale, unreachable `return` calling the pre-rename `cos_ps`.
	return f32_4(sse_mathfun_cos_ps(x.v));
}

/** Lane-wise floor, delegated to the SSE helper. */
inline f32_4 floor(f32_4 a) {
	__m128 r = sse_mathfun_floor_ps(a.v);
	return f32_4(r);
}

/** Lane-wise ceiling, delegated to the SSE helper. */
inline f32_4 ceil(f32_4 a) {
	__m128 r = sse_mathfun_ceil_ps(a.v);
	return f32_4(r);
}

/** Lane-wise round to nearest, delegated to the SSE helper. */
inline f32_4 round(f32_4 a) {
	__m128 r = sse_mathfun_round_ps(a.v);
	return f32_4(r);
}

/** Lane-wise `a % b` (truncated remainder), delegated to the SSE helper. */
inline f32_4 fmod(f32_4 a, f32_4 b) {
	__m128 r = sse_mathfun_fmod_ps(a.v, b.v);
	return f32_4(r);
}

/** Lane-wise absolute value, delegated to the SSE helper. */
inline f32_4 fabs(f32_4 a) {
	__m128 r = sse_mathfun_fabs_ps(a.v);
	return f32_4(r);
}




+ 66
- 6
include/dsp/sse_mathfun.h View File

@@ -1,11 +1,13 @@
/* Modified version of http://gruntthepeon.free.fr/ssemath/ for VCV Rack.

The following changes were made.
- Remove typedefs for __m128 to avoid type pollution, and because it's not that ugly.
- Remove typedefs for __m128 to avoid type pollution, and because they're not that ugly.
- Make all functions inline since this is a header file.
- Remove non-SSE2 code, since Rack assumes SSE2 CPUs.
- Move `const static` variables to function variables for clarity. See https://stackoverflow.com/a/52139901/272642 for explanation of why the performance is not worse.
- Change header file to <emmintrin.h> since we're using SSE2 intrinsics.
- Prefix functions with `sse_mathfun_`.
- Add floor, ceil, fmod.

This derived source file is released under the zlib license.
*/
@@ -41,9 +43,20 @@ This derived source file is released under the zlib license.
(this is the zlib license)
*/


#include <emmintrin.h>

inline __m128 log_ps(__m128 x) {

/** Generate 1.f in every lane without a memory load. */
inline __m128 sse_mathfun_one_ps() {
	// All-ones mask: any value compares equal to itself under integer compare.
	__m128i mask = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
	// Shifting right 25 then left 23 leaves 0x3f800000 (the bit pattern of 1.f) per lane.
	return _mm_castsi128_ps(_mm_slli_epi32(_mm_srli_epi32(mask, 25), 23));
}


inline __m128 sse_mathfun_log_ps(__m128 x) {
__m128i emm0;
__m128 one = _mm_set_ps1(1.0);

@@ -113,7 +126,8 @@ inline __m128 log_ps(__m128 x) {
return x;
}

inline __m128 exp_ps(__m128 x) {

inline __m128 sse_mathfun_exp_ps(__m128 x) {
__m128 tmp = _mm_setzero_ps(), fx;
__m128i emm0;
__m128 one = _mm_set_ps1(1.0);
@@ -164,6 +178,7 @@ inline __m128 exp_ps(__m128 x) {
return y;
}


/* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
it runs also on old athlons XPs and the pentium III of your grand
mother.
@@ -192,7 +207,7 @@ inline __m128 exp_ps(__m128 x) {
Since it is based on SSE intrinsics, it has to be compiled at -O2 to
deliver full speed.
*/
inline __m128 sin_ps(__m128 x) { // any x
inline __m128 sse_mathfun_sin_ps(__m128 x) { // any x
__m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

__m128i emm0, emm2;
@@ -278,8 +293,9 @@ inline __m128 sin_ps(__m128 x) { // any x
return y;
}


/* almost the same as sin_ps */
inline __m128 cos_ps(__m128 x) { // any x
inline __m128 sse_mathfun_cos_ps(__m128 x) { // any x
__m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
__m128i emm0, emm2;
/* take the absolute value */
@@ -356,9 +372,10 @@ inline __m128 cos_ps(__m128 x) { // any x
return y;
}


/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
it is almost as fast, and gives you a free cosine with your sine */
inline void sincos_ps(__m128 x, __m128 *s, __m128 *c) {
inline void sse_mathfun_sincos_ps(__m128 x, __m128 *s, __m128 *c) {
__m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
__m128i emm0, emm2, emm4;
sign_bit_sin = x;
@@ -452,3 +469,46 @@ inline void sincos_ps(__m128 x, __m128 *s, __m128 *c) {
*s = _mm_xor_ps(xmm1, sign_bit_sin);
*c = _mm_xor_ps(xmm2, sign_bit_cos);
}


/* Lane-wise floor(a).
NOTE(review): relies on _mm_cvttps_epi32, so lanes must fit in i32 after truncation
(roughly |a| < 2^31); out-of-range lanes yield the intrinsic's sentinel — confirm
callers stay in range. */
inline __m128 sse_mathfun_floor_ps(__m128 a) {
	// Convert to i32 and back to f32 (truncates toward zero, i.e. rounds up for negatives)
	__m128 b = _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
	// If b > a, subtract 1 from b
	b = _mm_sub_ps(b, _mm_and_ps(_mm_cmpgt_ps(b, a), sse_mathfun_one_ps()));
	return b;
}


/* Lane-wise ceil(a). Valid where lanes fit in i32 after truncation. */
inline __m128 sse_mathfun_ceil_ps(__m128 a) {
	// Truncate toward zero, then bump up lanes where truncation landed below `a`.
	__m128 t = _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
	__m128 needs_bump = _mm_cmplt_ps(t, a);
	return _mm_add_ps(t, _mm_and_ps(needs_bump, sse_mathfun_one_ps()));
}


/* Lane-wise round to nearest, halfway cases away from zero (like roundf).
FIX: the previous floor(a + 0.5) was wrong for -0.5, -1.5, etc. (its own TODO):
it gave 0 and -1 instead of -1 and -2. Operating on |a| and restoring the sign
makes negative halfway cases symmetric with positive ones.
Valid where lanes fit in i32 after truncation (roughly |a| < 2^31). */
inline __m128 sse_mathfun_round_ps(__m128 a) {
	__m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32((int) 0x80000000));
	// Split into sign and magnitude.
	__m128 sign = _mm_and_ps(a, sign_mask);
	__m128 mag = _mm_andnot_ps(sign_mask, a);
	// mag + 0.5 is non-negative, so truncation toward zero IS floor here.
	__m128 r = _mm_cvtepi32_ps(_mm_cvttps_epi32(_mm_add_ps(mag, _mm_set_ps1(0.5f))));
	// Reattach the original sign (preserves -0.f too).
	return _mm_or_ps(r, sign);
}


/* Lane-wise truncated remainder `a % b` (sign follows `a`, like fmodf).
Works between -8388608 and 8388608. */
inline __m128 sse_mathfun_fmod_ps(__m128 a, __m128 b) {
	// Quotient truncated toward zero via the i32 round-trip.
	__m128 q = _mm_div_ps(a, b);
	q = _mm_cvtepi32_ps(_mm_cvttps_epi32(q));
	// Remainder: a - trunc(a / b) * b.
	return _mm_sub_ps(a, _mm_mul_ps(q, b));
}


/* Lane-wise absolute value: clears the sign bit of each lane. */
inline __m128 sse_mathfun_fabs_ps(__m128 a) {
	__m128 sign_bit = _mm_castsi128_ps(_mm_set1_epi32((int) 0x80000000));
	// a & ~sign_bit == a & 0x7fffffff
	return _mm_andnot_ps(sign_bit, a);
}

Loading…
Cancel
Save