Browse Source

Port some of sse_mathfun to functions.hpp. Clean up SIMD headers.

tags/v1.1.4
Andrew Belt 5 years ago
parent
commit
a52c79df80
4 changed files with 68 additions and 96 deletions
  1. +3
    -2
      include/dsp/approx.hpp
  2. +58
    -49
      include/simd/functions.hpp
  3. +0
    -42
      include/simd/sse_mathfun.h
  4. +7
    -3
      src/engine/Engine.cpp

+ 3
- 2
include/dsp/approx.hpp View File

@@ -36,7 +36,7 @@ inline simd::float_4 approxExp2Floor(simd::float_4 x, simd::float_4* xf) {
}

template <>
inline float approxExp2Floor<float>(float x, float* xf) {
inline float approxExp2Floor(float x, float* xf) {
int xi = x;
if (xf)
*xf = x - xi;
@@ -46,7 +46,8 @@ inline float approxExp2Floor<float>(float x, float* xf) {

/** Returns 2^x, assuming that x >= 0.
Maximum 0.00024% error.
Roughly 7x faster than `std::pow(2, x)`.
For float, roughly 3x faster than `std::pow(2.f, x)`.
For float_4, roughly 2x faster than `simd::pow(2.f, x)`.

If negative powers are needed, you may use a lower bound and rescale.



+ 58
- 49
include/simd/functions.hpp View File

@@ -9,6 +9,45 @@ namespace rack {
namespace simd {


// Nonstandard functions

/** Scalar counterpart of the vector `ifelse`: yields `a` when `cond` is true, otherwise `b`. */
inline float ifelse(bool cond, float a, float b) {
	if (cond)
		return a;
	return b;
}

/** Per-element select driven by a mask.
Where a mask element is 0xffffffff the result takes the element of `a`; where it is 0x00000000 the result takes the element of `b`.
*/
inline float_4 ifelse(float_4 mask, float_4 a, float_4 b) {
	// Contribution of `a`, kept only where the mask is set
	float_4 fromA = a & mask;
	// Contribution of `b`, taken via `andnot` with the same mask
	float_4 fromB = andnot(mask, b);
	return fromA | fromB;
}

/** Computes an approximation of the reciprocal square root of each element.
Much faster than `1/sqrt(x)`.
*/
inline float_4 rsqrt(float_4 x) {
	__m128 approx = _mm_rsqrt_ps(x.v);
	return float_4(approx);
}

/** Computes an approximation of the reciprocal of each element.
Much faster than `1/x`.
*/
inline float_4 rcp(float_4 x) {
	__m128 approx = _mm_rcp_ps(x.v);
	return float_4(approx);
}

/** Expands a 4-bit mask (in the bit layout of `_mm_movemask_ps`) into a vector mask.
Element `i` of the result is all 1 bits (0xffffffff) if bit `i` of `a` is set, otherwise all 0 bits.
Bits of `a` above the lowest 4 are ignored.
*/
template <typename T>
T movemaskInverse(int a);

template <>
inline float_4 movemaskInverse<float_4>(int x) {
	// One distinct bit per element: element 0 holds 1, element 1 holds 2, element 2 holds 4, element 3 holds 8
	__m128i msk8421 = _mm_set_epi32(8, 4, 2, 1);
	__m128i x_bc = _mm_set1_epi32(x);
	// Isolate each element's own bit of x
	__m128i t = _mm_and_si128(x_bc, msk8421);
	// An element is selected when its bit survived the AND.
	// Compare against the per-element bit, not against the broadcast x: the latter only matches when x has no other bits set, and selects every element for x == 0.
	return float_4(_mm_castsi128_ps(_mm_cmpeq_epi32(t, msk8421)));
}


// Standard math functions from std::

/* Import std:: math functions into the simd namespace so you can use `sin(T)` etc in templated functions and get both the scalar and vector versions.
@@ -93,40 +132,48 @@ inline float_4 atan2(float_4 x, float_4 y) {
return float_4(sse_mathfun_atan2_ps(x.v, y.v));
}

using std::trunc;

/** Rounds each element toward zero.
Elements with magnitude >= 2^23 (which are already integers), infinities, and NaNs are returned unchanged, because they cannot take the int32 round trip used for smaller values.
*/
inline float_4 trunc(float_4 a) {
	// Round toward zero via float -> int32 -> float.
	// `_mm_cvttps_epi32` yields 0x80000000 for NaN and for values outside the int32 range, so this result is only valid for small magnitudes.
	__m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(a.v));
	// |a| < 2^23 marks the only floats that can carry a fractional part. The comparison is false for NaN.
	__m128 absA = _mm_and_ps(a.v, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
	__m128 hasFraction = _mm_cmplt_ps(absA, _mm_set1_ps(8388608.f));
	// Take the truncated value where it is valid and the input itself elsewhere
	return float_4(_mm_or_ps(_mm_and_ps(hasFraction, truncated), _mm_andnot_ps(hasFraction, a.v)));
}

using std::floor;

inline float_4 floor(float_4 a) {
return float_4(sse_mathfun_floor_ps(a.v));
float_4 b = trunc(a);
b -= (b > a) & 1.f;
return b;
}

using std::ceil;

inline float_4 ceil(float_4 a) {
return float_4(sse_mathfun_ceil_ps(a.v));
float_4 b = trunc(a);
b += (b < a) & 1.f;
return b;
}

using std::round;

inline float_4 round(float_4 a) {
return float_4(sse_mathfun_round_ps(a.v));
a += ifelse(a < 0, -0.5f, 0.5f);
float_4 b = trunc(a);
return b;
}

using std::fmod;

inline float_4 fmod(float_4 a, float_4 b) {
return float_4(sse_mathfun_fmod_ps(a.v, b.v));
return a - trunc(a / b) * b;
}

using std::fabs;

inline float_4 fabs(float_4 a) {
return float_4(sse_mathfun_fabs_ps(a.v));
}

using std::trunc;

inline float_4 trunc(float_4 a) {
return float_4(sse_mathfun_trunc_ps(a.v));
// Sign bit
int32_4 mask = ~0x80000000;
return a & float_4::cast(mask);
}

using std::pow;
@@ -151,31 +198,6 @@ T pow(T a, int b) {
return p;
}

// Nonstandard functions

inline float ifelse(bool cond, float a, float b) {
return cond ? a : b;
}

/** Given a mask, returns a if mask is 0xffffffff per element, b if mask is 0x00000000 */
inline float_4 ifelse(float_4 mask, float_4 a, float_4 b) {
return (a & mask) | andnot(mask, b);
}

/** Returns the approximate reciprocal square root.
Much faster than `1/sqrt(x)`.
*/
inline float_4 rsqrt(float_4 x) {
return float_4(_mm_rsqrt_ps(x.v));
}

/** Returns the approximate reciprocal.
Much faster than `1/x`.
*/
inline float_4 rcp(float_4 x) {
return float_4(_mm_rcp_ps(x.v));
}

// From math.hpp

using math::clamp;
@@ -204,19 +226,6 @@ inline float_4 sgn(float_4 x) {
return signbit | (nonzero & 1.f);
}

/** Given a mask `a`, returns a vector with each element either 0's or 1's depending on the mask bit.
*/
template <typename T>
T movemaskInverse(int a);

template <>
inline float_4 movemaskInverse<float_4>(int x) {
__m128i msk8421 = _mm_set_epi32(8, 4, 2, 1);
__m128i x_bc = _mm_set1_epi32(x);
__m128i t = _mm_and_si128(x_bc, msk8421);
return float_4(_mm_castsi128_ps(_mm_cmpeq_epi32(x_bc, t)));
}


} // namespace simd
} // namespace rack

+ 0
- 42
include/simd/sse_mathfun.h View File

@@ -468,45 +468,3 @@ inline void sse_mathfun_sincos_ps(__m128 x, __m128* s, __m128* c) {
*s = _mm_xor_ps(xmm1, sign_bit_sin);
*c = _mm_xor_ps(xmm2, sign_bit_cos);
}


inline __m128 sse_mathfun_trunc_ps(__m128 a) {
return _mm_cvtepi32_ps(_mm_cvttps_epi32(a));
}


inline __m128 sse_mathfun_floor_ps(__m128 a) {
__m128 b = sse_mathfun_trunc_ps(a);
// If b > a, subtract 1 fom b
b = _mm_sub_ps(b, _mm_and_ps(_mm_cmpgt_ps(b, a), sse_mathfun_one_ps()));
return b;
}


inline __m128 sse_mathfun_ceil_ps(__m128 a) {
__m128 b = sse_mathfun_trunc_ps(a);
// If b < a, add 1 to b
b = _mm_add_ps(b, _mm_and_ps(_mm_cmplt_ps(b, a), sse_mathfun_one_ps()));
return b;
}


inline __m128 sse_mathfun_round_ps(__m128 a) {
// TODO Incorrect for -0.5, -1.5, etc.
return sse_mathfun_floor_ps(_mm_add_ps(a, _mm_set_ps1(0.5f)));
}


inline __m128 sse_mathfun_fmod_ps(__m128 a, __m128 b) {
__m128 c = _mm_div_ps(a, b);
c = sse_mathfun_trunc_ps(c);
c = _mm_mul_ps(c, b);
return _mm_sub_ps(a, c);
}


inline __m128 sse_mathfun_fabs_ps(__m128 a) {
__m128i minus1 = _mm_set1_epi32(-1);
__m128 abs_mask = _mm_castsi128_ps(_mm_srli_epi32(minus1, 1));
return _mm_and_ps(abs_mask, a);
}

+ 7
- 3
src/engine/Engine.cpp View File

@@ -17,11 +17,15 @@ namespace rack {
namespace engine {


static void disableDenormals() {
/** Puts the calling thread's MXCSR register into the state the engine expects.
Enables flush-to-zero and denormals-are-zero, selects round-to-nearest, masks all floating-point exceptions, and clears the sticky exception flags.
*/
static void initMXCSR() {
	// Set CPU to flush-to-zero (FTZ) and denormals-are-zero (DAZ) mode
	// https://software.intel.com/en-us/node/682949
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
	// Reset other flags
	_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
	// A set mask bit suppresses its exception, so the reset state is all mask bits set.
	// Passing 0 would unmask every exception and trap on the first inexact result.
	_MM_SET_EXCEPTION_MASK(_MM_MASK_MASK);
	_MM_SET_EXCEPTION_STATE(0);
}


@@ -405,7 +409,7 @@ static void Engine_run(Engine* that) {
// Set up thread
system::setThreadName("Engine");
// system::setThreadRealTime();
disableDenormals();
initMXCSR();

internal->frame = 0;
// Every time the engine waits and locks a mutex, it steps this many frames
@@ -835,7 +839,7 @@ void Engine::updateParamHandle(ParamHandle* paramHandle, int moduleId, int param
void EngineWorker::run() {
system::setThreadName("Engine worker");
system::setThreadRealTime(engine->internal->realTime);
disableDenormals();
initMXCSR();

while (1) {
engine->internal->engineBarrier.wait();


Loading…
Cancel
Save