Port some of sse_mathfun to functions.hpp. Clean up SIMD headers.

5 years ago · a52c79df80
--- a/include/dsp/approx.hpp
+++ b/include/dsp/approx.hpp
@@ -36,7 +36,7 @@ inline simd::float_4 approxExp2Floor(simd::float_4 x, simd::float_4* xf) {
 }

 template <>
 inline float approxExp2Floor<float>(float x, float* xf) {
 inline float approxExp2Floor(float x, float* xf) {
 	int xi = x;
 	if (xf)
 		*xf = x - xi;
@@ -46,7 +46,8 @@ inline float approxExp2Floor<float>(float x, float* xf) {

 /** Returns 2^x, assuming that x >= 0.
 Maximum 0.00024% error.
 Roughly 7x faster than `std::pow(2, x)`.
 For float, roughly 3x faster than `std::pow(2.f, x)`.
 For float_4, roughly 2x faster than `simd::pow(2.f, x)`.

 If negative powers are needed, you may use a lower bound and rescale.

--- a/include/simd/functions.hpp
+++ b/include/simd/functions.hpp
@@ -9,6 +9,45 @@ namespace rack {
 namespace simd {


 // Nonstandard functions

 /** Scalar select: yields `a` when `cond` is true, otherwise `b`. */
 inline float ifelse(bool cond, float a, float b) {
 	if (cond)
 		return a;
 	return b;
 }

 /** Per-element select: yields `a` where `mask` is 0xffffffff and `b` where `mask` is 0x00000000. */
 inline float_4 ifelse(float_4 mask, float_4 a, float_4 b) {
 	// Keep the bits of `a` under the mask and the bits of `b` outside it, then merge the two.
 	float_4 fromA = a & mask;
 	float_4 fromB = andnot(mask, b);
 	return fromA | fromB;
 }

 /** Computes an approximate reciprocal square root of each element.
 Much faster than `1/sqrt(x)`.
 */
 inline float_4 rsqrt(float_4 x) {
 	__m128 approx = _mm_rsqrt_ps(x.v);
 	return float_4(approx);
 }

 /** Computes an approximate reciprocal of each element.
 Much faster than `1/x`.
 */
 inline float_4 rcp(float_4 x) {
 	__m128 approx = _mm_rcp_ps(x.v);
 	return float_4(approx);
 }

 /** Expands the integer bit mask `a` into a vector mask: each element is either all 0 bits or all 1 bits, depending on the corresponding bit of `a`.
 Only the primary template is declared here; the definition comes from a specialization for each supported vector type `T`.
 */
 template <typename T>
 T movemaskInverse(int a);

 /** float_4 specialization: bit i of `x` (i = 0..3) decides whether element i is all 1 bits (bit set) or all 0 bits (bit clear). */
 template <>
 inline float_4 movemaskInverse<float_4>(int x) {
 	// One distinct bit per lane. _mm_set_epi32 takes its arguments from the highest lane to the lowest, so lane 0 holds 1.
 	__m128i msk8421 = _mm_set_epi32(8, 4, 2, 1);
 	__m128i x_bc = _mm_set1_epi32(x);
 	// Isolate each lane's own bit of x.
 	__m128i t = _mm_and_si128(x_bc, msk8421);
 	// A lane is selected when its bit survived the AND, so compare against the per-lane bit.
 	// Comparing against x itself would only match when x has exactly that one bit set, and would match every lane when x is 0.
 	return float_4(_mm_castsi128_ps(_mm_cmpeq_epi32(msk8421, t)));
 }


 // Standard math functions from std::

 /* Import std:: math functions into the simd namespace so you can use `sin(T)` etc in templated functions and get both the scalar and vector versions.
@@ -93,40 +132,48 @@ inline float_4 atan2(float_4 x, float_4 y) {
 	return float_4(sse_mathfun_atan2_ps(x.v, y.v));
 }

 using std::trunc;

 inline float_4 trunc(float_4 a) {
 	// Convert each element to a 32-bit integer with truncation, then convert back to float.
 	__m128i asInt = _mm_cvttps_epi32(a.v);
 	return float_4(_mm_cvtepi32_ps(asInt));
 }

 using std::floor;

 inline float_4 floor(float_4 a) {
 	// NOTE(review): this body interleaves two implementations (diff residue). As written, the first return makes the lines after it unreachable — confirm which version the real file contains.
 	return float_4(sse_mathfun_floor_ps(a.v));
 	// Truncate, then subtract 1 from the elements where truncation produced a value greater than `a`.
 	float_4 b = trunc(a);
 	b -= (b > a) & 1.f;
 	return b;
 }

 using std::ceil;

 inline float_4 ceil(float_4 a) {
 	// NOTE(review): this body interleaves two implementations (diff residue). As written, the first return makes the lines after it unreachable — confirm which version the real file contains.
 	return float_4(sse_mathfun_ceil_ps(a.v));
 	// Truncate, then add 1 to the elements where truncation produced a value less than `a`.
 	float_4 b = trunc(a);
 	b += (b < a) & 1.f;
 	return b;
 }

 using std::round;

 inline float_4 round(float_4 a) {
 	// NOTE(review): this body interleaves two implementations (diff residue). As written, the first return makes the lines after it unreachable — confirm which version the real file contains.
 	return float_4(sse_mathfun_round_ps(a.v));
 	// Offset by 0.5 in the direction of the sign of `a` (negative elements get -0.5), then truncate.
 	a += ifelse(a < 0, -0.5f, 0.5f);
 	float_4 b = trunc(a);
 	return b;
 }

 using std::fmod;

 inline float_4 fmod(float_4 a, float_4 b) {
 	// NOTE(review): this body interleaves two implementations (diff residue). As written, the first return makes the line after it unreachable — confirm which version the real file contains.
 	return float_4(sse_mathfun_fmod_ps(a.v, b.v));
 	// Remainder: subtract from `a` the quotient a / b, passed through trunc(), times `b`.
 	return a - trunc(a / b) * b;
 }

 using std::fabs;

 inline float_4 fabs(float_4 a) {
 	// Delegate to the raw-register helper and wrap the result back into a float_4.
 	__m128 magnitude = sse_mathfun_fabs_ps(a.v);
 	return float_4(magnitude);
 }

 using std::trunc;

 inline float_4 trunc(float_4 a) {
 	// NOTE(review): this body interleaves two implementations (diff residue). As written, the first return makes the lines after it unreachable.
 	// The trailing lines clear the sign bit, which is an absolute-value computation rather than a truncation; presumably they belong to fabs() — confirm against the real file.
 	return float_4(sse_mathfun_trunc_ps(a.v));
 	// Mask with every bit set except the sign bit (0x80000000)
 	int32_4 mask = ~0x80000000;
 	return a & float_4::cast(mask);
 }

 using std::pow;
@@ -151,31 +198,6 @@ T pow(T a, int b) {
 	return p;
 }

 // Nonstandard functions

 /** Scalar select: yields `a` when `cond` is true, otherwise `b`. */
 inline float ifelse(bool cond, float a, float b) {
 	if (cond)
 		return a;
 	return b;
 }

 /** Per-element select: yields `a` where `mask` is 0xffffffff and `b` where `mask` is 0x00000000. */
 inline float_4 ifelse(float_4 mask, float_4 a, float_4 b) {
 	// Keep the bits of `a` under the mask and the bits of `b` outside it, then merge the two.
 	float_4 fromA = a & mask;
 	float_4 fromB = andnot(mask, b);
 	return fromA | fromB;
 }

 /** Computes an approximate reciprocal square root of each element.
 Much faster than `1/sqrt(x)`.
 */
 inline float_4 rsqrt(float_4 x) {
 	__m128 approx = _mm_rsqrt_ps(x.v);
 	return float_4(approx);
 }

 /** Computes an approximate reciprocal of each element.
 Much faster than `1/x`.
 */
 inline float_4 rcp(float_4 x) {
 	__m128 approx = _mm_rcp_ps(x.v);
 	return float_4(approx);
 }

 // From math.hpp

 using math::clamp;
@@ -204,19 +226,6 @@ inline float_4 sgn(float_4 x) {
 	return signbit | (nonzero & 1.f);
 }

 /** Expands the integer bit mask `a` into a vector mask: each element is either all 0 bits or all 1 bits, depending on the corresponding bit of `a`.
 Only the primary template is declared here; the definition comes from a specialization for each supported vector type `T`.
 */
 template <typename T>
 T movemaskInverse(int a);

 /** float_4 specialization: bit i of `x` (i = 0..3) decides whether element i is all 1 bits (bit set) or all 0 bits (bit clear). */
 template <>
 inline float_4 movemaskInverse<float_4>(int x) {
 	// One distinct bit per lane. _mm_set_epi32 takes its arguments from the highest lane to the lowest, so lane 0 holds 1.
 	__m128i msk8421 = _mm_set_epi32(8, 4, 2, 1);
 	__m128i x_bc = _mm_set1_epi32(x);
 	// Isolate each lane's own bit of x.
 	__m128i t = _mm_and_si128(x_bc, msk8421);
 	// A lane is selected when its bit survived the AND, so compare against the per-lane bit.
 	// Comparing against x itself would only match when x has exactly that one bit set, and would match every lane when x is 0.
 	return float_4(_mm_castsi128_ps(_mm_cmpeq_epi32(msk8421, t)));
 }


 } // namespace simd
 } // namespace rack
--- a/include/simd/sse_mathfun.h
+++ b/include/simd/sse_mathfun.h
@@ -468,45 +468,3 @@ inline void sse_mathfun_sincos_ps(__m128 x, __m128* s, __m128* c) {
 	*s = _mm_xor_ps(xmm1, sign_bit_sin);
 	*c = _mm_xor_ps(xmm2, sign_bit_cos);
 }


 inline __m128 sse_mathfun_trunc_ps(__m128 a) {
 	// Convert each lane to a 32-bit integer with truncation, then convert back to float.
 	__m128i asInt = _mm_cvttps_epi32(a);
 	return _mm_cvtepi32_ps(asInt);
 }


 inline __m128 sse_mathfun_floor_ps(__m128 a) {
 	__m128 truncated = sse_mathfun_trunc_ps(a);
 	// Lanes where the truncated value is greater than `a` need 1 subtracted.
 	__m128 tooHigh = _mm_cmpgt_ps(truncated, a);
 	__m128 correction = _mm_and_ps(tooHigh, sse_mathfun_one_ps());
 	return _mm_sub_ps(truncated, correction);
 }


 inline __m128 sse_mathfun_ceil_ps(__m128 a) {
 	__m128 truncated = sse_mathfun_trunc_ps(a);
 	// Lanes where the truncated value is less than `a` need 1 added.
 	__m128 tooLow = _mm_cmplt_ps(truncated, a);
 	__m128 correction = _mm_and_ps(tooLow, sse_mathfun_one_ps());
 	return _mm_add_ps(truncated, correction);
 }


 inline __m128 sse_mathfun_round_ps(__m128 a) {
 	// Round half away from zero, as std::round does.
 	// floor(a + 0.5) alone maps -0.5 to 0 and -1.5 to -1, so round the magnitude instead and put the sign back afterwards.
 	__m128 signMask = _mm_set_ps1(-0.f);
 	__m128 sign = _mm_and_ps(a, signMask);
 	__m128 magnitude = _mm_andnot_ps(signMask, a);
 	__m128 rounded = sse_mathfun_floor_ps(_mm_add_ps(magnitude, _mm_set_ps1(0.5f)));
 	return _mm_or_ps(rounded, sign);
 }


 inline __m128 sse_mathfun_fmod_ps(__m128 a, __m128 b) {
 	// Remainder: a minus the truncated quotient a / b, times b.
 	__m128 quotient = sse_mathfun_trunc_ps(_mm_div_ps(a, b));
 	__m128 wholePart = _mm_mul_ps(quotient, b);
 	return _mm_sub_ps(a, wholePart);
 }


 inline __m128 sse_mathfun_fabs_ps(__m128 a) {
 	// All-ones shifted right by one bit leaves every bit set except the sign bit.
 	__m128i allOnes = _mm_set1_epi32(-1);
 	__m128i magnitudeBits = _mm_srli_epi32(allOnes, 1);
 	// Clearing the sign bit of each lane yields its absolute value.
 	return _mm_and_ps(_mm_castsi128_ps(magnitudeBits), a);
 }
--- a/src/engine/Engine.cpp
+++ b/src/engine/Engine.cpp
@@ -17,11 +17,15 @@ namespace rack {
 namespace engine {


 // Sets the floating-point control modes through the _MM_SET_* macros: enables FTZ and DAZ, selects round-to-nearest, and resets the exception mask and exception state.
 // NOTE(review): two signatures appear back to back below with a single body; this looks like diff residue of a rename from disableDenormals() to initMXCSR() — confirm against the real file.
 static void disableDenormals() {
 static void initMXCSR() {
 	// Set CPU to flush-to-zero (FTZ) and denormals-are-zero (DAZ) mode
 	// https://software.intel.com/en-us/node/682949
 	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
 	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
 	// Reset other flags
 	_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
 	// NOTE(review): an exception mask of 0 appears to leave every SSE floating-point exception unmasked (i.e. trapping) rather than masked — confirm this is intended.
 	_MM_SET_EXCEPTION_MASK(0);
 	_MM_SET_EXCEPTION_STATE(0);
 }


@@ -405,7 +409,7 @@ static void Engine_run(Engine* that) {
 	// Set up thread
 	system::setThreadName("Engine");
 	// system::setThreadRealTime();
 	disableDenormals();
 	initMXCSR();

 	internal->frame = 0;
 	// Every time the engine waits and locks a mutex, it steps this many frames
@@ -835,7 +839,7 @@ void Engine::updateParamHandle(ParamHandle* paramHandle, int moduleId, int param
 void EngineWorker::run() {
 	system::setThreadName("Engine worker");
 	system::setThreadRealTime(engine->internal->realTime);
 	disableDenormals();
 	initMXCSR();

 	while (1) {
 		engine->internal->engineBarrier.wait();