diff --git a/include/dsp/approx.hpp b/include/dsp/approx.hpp index 22b9ef6c..fab2bfe3 100644 --- a/include/dsp/approx.hpp +++ b/include/dsp/approx.hpp @@ -36,7 +36,7 @@ inline simd::float_4 approxExp2Floor(simd::float_4 x, simd::float_4* xf) { } template <> -inline float approxExp2Floor(float x, float* xf) { +inline float approxExp2Floor(float x, float* xf) { int xi = x; if (xf) *xf = x - xi; @@ -46,7 +46,8 @@ inline float approxExp2Floor(float x, float* xf) { /** Returns 2^x, assuming that x >= 0. Maximum 0.00024% error. -Roughly 7x faster than `std::pow(2, x)`. +For float, roughly 3x faster than `std::pow(2.f, x)`. +For float_4, roughly 2x faster than `simd::pow(2.f, x)`. If negative powers are needed, you may use a lower bound and rescale. diff --git a/include/simd/functions.hpp b/include/simd/functions.hpp index f58b3856..75610e40 100644 --- a/include/simd/functions.hpp +++ b/include/simd/functions.hpp @@ -9,6 +9,45 @@ namespace rack { namespace simd { +// Nonstandard functions + +inline float ifelse(bool cond, float a, float b) { + return cond ? a : b; +} + +/** Given a mask, returns a if mask is 0xffffffff per element, b if mask is 0x00000000 */ +inline float_4 ifelse(float_4 mask, float_4 a, float_4 b) { + return (a & mask) | andnot(mask, b); +} + +/** Returns the approximate reciprocal square root. +Much faster than `1/sqrt(x)`. +*/ +inline float_4 rsqrt(float_4 x) { + return float_4(_mm_rsqrt_ps(x.v)); +} + +/** Returns the approximate reciprocal. +Much faster than `1/x`. +*/ +inline float_4 rcp(float_4 x) { + return float_4(_mm_rcp_ps(x.v)); +} + +/** Given a mask `a`, returns a vector with each element either 0's or 1's depending on the mask bit. 
+*/ +template <typename T> +T movemaskInverse(int a); + +template <> +inline float_4 movemaskInverse(int x) { + __m128i msk8421 = _mm_set_epi32(8, 4, 2, 1); + __m128i x_bc = _mm_set1_epi32(x); + __m128i t = _mm_and_si128(x_bc, msk8421); + return float_4(_mm_castsi128_ps(_mm_cmpeq_epi32(x_bc, t))); +} + + // Standard math functions from std:: /* Import std:: math functions into the simd namespace so you can use `sin(T)` etc in templated functions and get both the scalar and vector versions. @@ -93,40 +132,48 @@ inline float_4 atan2(float_4 x, float_4 y) { return float_4(sse_mathfun_atan2_ps(x.v, y.v)); } +using std::trunc; + +inline float_4 trunc(float_4 a) { + return float_4(_mm_cvtepi32_ps(_mm_cvttps_epi32(a.v))); +} + using std::floor; inline float_4 floor(float_4 a) { - return float_4(sse_mathfun_floor_ps(a.v)); + float_4 b = trunc(a); + b -= (b > a) & 1.f; + return b; } using std::ceil; inline float_4 ceil(float_4 a) { - return float_4(sse_mathfun_ceil_ps(a.v)); + float_4 b = trunc(a); + b += (b < a) & 1.f; + return b; } using std::round; inline float_4 round(float_4 a) { - return float_4(sse_mathfun_round_ps(a.v)); + a += ifelse(a < 0, -0.5f, 0.5f); + float_4 b = trunc(a); + return b; } using std::fmod; inline float_4 fmod(float_4 a, float_4 b) { - return float_4(sse_mathfun_fmod_ps(a.v, b.v)); + return a - trunc(a / b) * b; } using std::fabs; inline float_4 fabs(float_4 a) { - return float_4(sse_mathfun_fabs_ps(a.v)); -} - -using std::trunc; - -inline float_4 trunc(float_4 a) { - return float_4(sse_mathfun_trunc_ps(a.v)); + // Sign bit + int32_4 mask = ~0x80000000; + return a & float_4::cast(mask); } using std::pow; @@ -151,31 +198,6 @@ T pow(T a, int b) { return p; } -// Nonstandard functions - -inline float ifelse(bool cond, float a, float b) { - return cond ? 
a : b; -} - -/** Given a mask, returns a if mask is 0xffffffff per element, b if mask is 0x00000000 */ -inline float_4 ifelse(float_4 mask, float_4 a, float_4 b) { - return (a & mask) | andnot(mask, b); -} - -/** Returns the approximate reciprocal square root. -Much faster than `1/sqrt(x)`. -*/ -inline float_4 rsqrt(float_4 x) { - return float_4(_mm_rsqrt_ps(x.v)); -} - -/** Returns the approximate reciprocal. -Much faster than `1/x`. -*/ -inline float_4 rcp(float_4 x) { - return float_4(_mm_rcp_ps(x.v)); -} - -// From math.hpp using math::clamp; @@ -204,19 +226,6 @@ inline float_4 sgn(float_4 x) { return signbit | (nonzero & 1.f); } -/** Given a mask `a`, returns a vector with each element either 0's or 1's depending on the mask bit. -*/ -template <typename T> -T movemaskInverse(int a); - -template <> -inline float_4 movemaskInverse(int x) { - __m128i msk8421 = _mm_set_epi32(8, 4, 2, 1); - __m128i x_bc = _mm_set1_epi32(x); - __m128i t = _mm_and_si128(x_bc, msk8421); - return float_4(_mm_castsi128_ps(_mm_cmpeq_epi32(x_bc, t))); -} - } // namespace simd } // namespace rack diff --git a/include/simd/sse_mathfun.h b/include/simd/sse_mathfun.h index 5d12bd1f..8f703f91 100644 --- a/include/simd/sse_mathfun.h +++ b/include/simd/sse_mathfun.h @@ -468,45 +468,3 @@ inline void sse_mathfun_sincos_ps(__m128 x, __m128* s, __m128* c) { *s = _mm_xor_ps(xmm1, sign_bit_sin); *c = _mm_xor_ps(xmm2, sign_bit_cos); } - - -inline __m128 sse_mathfun_trunc_ps(__m128 a) { - return _mm_cvtepi32_ps(_mm_cvttps_epi32(a)); -} - - -inline __m128 sse_mathfun_floor_ps(__m128 a) { - __m128 b = sse_mathfun_trunc_ps(a); - // If b > a, subtract 1 fom b - b = _mm_sub_ps(b, _mm_and_ps(_mm_cmpgt_ps(b, a), sse_mathfun_one_ps())); - return b; } - - -inline __m128 sse_mathfun_ceil_ps(__m128 a) { - __m128 b = sse_mathfun_trunc_ps(a); - // If b < a, add 1 to b - b = _mm_add_ps(b, _mm_and_ps(_mm_cmplt_ps(b, a), sse_mathfun_one_ps())); - return b; -} - - -inline __m128 sse_mathfun_round_ps(__m128 a) { - // TODO Incorrect 
for -0.5, -1.5, etc. - return sse_mathfun_floor_ps(_mm_add_ps(a, _mm_set_ps1(0.5f))); -} - - -inline __m128 sse_mathfun_fmod_ps(__m128 a, __m128 b) { - __m128 c = _mm_div_ps(a, b); - c = sse_mathfun_trunc_ps(c); - c = _mm_mul_ps(c, b); - return _mm_sub_ps(a, c); -} - - -inline __m128 sse_mathfun_fabs_ps(__m128 a) { - __m128i minus1 = _mm_set1_epi32(-1); - __m128 abs_mask = _mm_castsi128_ps(_mm_srli_epi32(minus1, 1)); - return _mm_and_ps(abs_mask, a); -} diff --git a/src/engine/Engine.cpp b/src/engine/Engine.cpp index 71e68b2e..ef7ac4de 100644 --- a/src/engine/Engine.cpp +++ b/src/engine/Engine.cpp @@ -17,11 +17,15 @@ namespace rack { namespace engine { -static void disableDenormals() { +static void initMXCSR() { // Set CPU to flush-to-zero (FTZ) and denormals-are-zero (DAZ) mode // https://software.intel.com/en-us/node/682949 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + // Reset other flags + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + _MM_SET_EXCEPTION_MASK(0); + _MM_SET_EXCEPTION_STATE(0); } @@ -405,7 +409,7 @@ static void Engine_run(Engine* that) { // Set up thread system::setThreadName("Engine"); // system::setThreadRealTime(); - disableDenormals(); + initMXCSR(); internal->frame = 0; // Every time the that waits and locks a mutex, it steps this many frames @@ -835,7 +839,7 @@ void Engine::updateParamHandle(ParamHandle* paramHandle, int moduleId, int param void EngineWorker::run() { system::setThreadName("Engine worker"); system::setThreadRealTime(engine->internal->realTime); - disableDenormals(); + initMXCSR(); while (1) { engine->internal->engineBarrier.wait();