diff --git a/include/dsp/approx.hpp b/include/dsp/approx.hpp index 22b9ef6c..fab2bfe3 100644 --- a/include/dsp/approx.hpp +++ b/include/dsp/approx.hpp @@ -36,7 +36,7 @@ inline simd::float_4 approxExp2Floor(simd::float_4 x, simd::float_4* xf) { } template <> -inline float approxExp2Floor(float x, float* xf) { +inline float approxExp2Floor(float x, float* xf) { int xi = x; if (xf) *xf = x - xi; @@ -46,7 +46,8 @@ inline float approxExp2Floor(float x, float* xf) { /** Returns 2^x, assuming that x >= 0. Maximum 0.00024% error. -Roughly 7x faster than `std::pow(2, x)`. +For float, roughly 3x faster than `std::pow(2.f, x)`. +For float_4, roughly 2x faster than `simd::pow(2.f, x)`. If negative powers are needed, you may use a lower bound and rescale. diff --git a/include/simd/functions.hpp b/include/simd/functions.hpp index f58b3856..75610e40 100644 --- a/include/simd/functions.hpp +++ b/include/simd/functions.hpp @@ -9,6 +9,45 @@ namespace rack { namespace simd { +// Nonstandard functions + +inline float ifelse(bool cond, float a, float b) { + return cond ? a : b; +} + +/** Given a mask, returns a if mask is 0xffffffff per element, b if mask is 0x00000000 */ +inline float_4 ifelse(float_4 mask, float_4 a, float_4 b) { + return (a & mask) | andnot(mask, b); +} + +/** Returns the approximate reciprocal square root. +Much faster than `1/sqrt(x)`. +*/ +inline float_4 rsqrt(float_4 x) { + return float_4(_mm_rsqrt_ps(x.v)); +} + +/** Returns the approximate reciprocal. +Much faster than `1/x`. +*/ +inline float_4 rcp(float_4 x) { + return float_4(_mm_rcp_ps(x.v)); +} + +/** Given a mask `a`, returns a vector with each element either 0's or 1's depending on the mask bit. 
+*/ +template <typename T> +T movemaskInverse(int a); + +template <> +inline float_4 movemaskInverse(int x) { + __m128i msk8421 = _mm_set_epi32(8, 4, 2, 1); + __m128i x_bc = _mm_set1_epi32(x); + __m128i t = _mm_and_si128(x_bc, msk8421); + return float_4(_mm_castsi128_ps(_mm_cmpeq_epi32(x_bc, t))); +} + + // Standard math functions from std:: /* Import std:: math functions into the simd namespace so you can use `sin(T)` etc in templated functions and get both the scalar and vector versions. @@ -93,40 +132,48 @@ inline float_4 atan2(float_4 x, float_4 y) { return float_4(sse_mathfun_atan2_ps(x.v, y.v)); } +using std::trunc; + +inline float_4 trunc(float_4 a) { + return float_4(_mm_cvtepi32_ps(_mm_cvttps_epi32(a.v))); +} + using std::floor; inline float_4 floor(float_4 a) { - return float_4(sse_mathfun_floor_ps(a.v)); + float_4 b = trunc(a); + b -= (b > a) & 1.f; + return b; } using std::ceil; inline float_4 ceil(float_4 a) { - return float_4(sse_mathfun_ceil_ps(a.v)); + float_4 b = trunc(a); + b += (b < a) & 1.f; + return b; } using std::round; inline float_4 round(float_4 a) { - return float_4(sse_mathfun_round_ps(a.v)); + a += ifelse(a < 0, -0.5f, 0.5f); + float_4 b = trunc(a); + return b; } using std::fmod; inline float_4 fmod(float_4 a, float_4 b) { - return float_4(sse_mathfun_fmod_ps(a.v, b.v)); + return a - trunc(a / b) * b; } using std::fabs; inline float_4 fabs(float_4 a) { - return float_4(sse_mathfun_fabs_ps(a.v)); -} - -using std::trunc; - -inline float_4 trunc(float_4 a) { - return float_4(sse_mathfun_trunc_ps(a.v)); + // Sign bit + int32_4 mask = ~0x80000000; + return a & float_4::cast(mask); } using std::pow; @@ -151,31 +198,6 @@ T pow(T a, int b) { return p; } -// Nonstandard functions - -inline float ifelse(bool cond, float a, float b) { - return cond ? 
a : b; -} - -/** Given a mask, returns a if mask is 0xffffffff per element, b if mask is 0x00000000 */ -inline float_4 ifelse(float_4 mask, float_4 a, float_4 b) { - return (a & mask) | andnot(mask, b); -} - -/** Returns the approximate reciprocal square root. -Much faster than `1/sqrt(x)`. -*/ -inline float_4 rsqrt(float_4 x) { - return float_4(_mm_rsqrt_ps(x.v)); -} - -/** Returns the approximate reciprocal. -Much faster than `1/x`. -*/ -inline float_4 rcp(float_4 x) { - return float_4(_mm_rcp_ps(x.v)); -} - -// From math.hpp using math::clamp; @@ -204,19 +226,6 @@ inline float_4 sgn(float_4 x) { return signbit | (nonzero & 1.f); } -/** Given a mask `a`, returns a vector with each element either 0's or 1's depending on the mask bit. -*/ -template <typename T> -T movemaskInverse(int a); - -template <> -inline float_4 movemaskInverse(int x) { - __m128i msk8421 = _mm_set_epi32(8, 4, 2, 1); - __m128i x_bc = _mm_set1_epi32(x); - __m128i t = _mm_and_si128(x_bc, msk8421); - return float_4(_mm_castsi128_ps(_mm_cmpeq_epi32(x_bc, t))); -} - } // namespace simd } // namespace rack diff --git a/include/simd/sse_mathfun.h b/include/simd/sse_mathfun.h index 5d12bd1f..8f703f91 100644 --- a/include/simd/sse_mathfun.h +++ b/include/simd/sse_mathfun.h @@ -468,45 +468,3 @@ inline void sse_mathfun_sincos_ps(__m128 x, __m128* s, __m128* c) { *s = _mm_xor_ps(xmm1, sign_bit_sin); *c = _mm_xor_ps(xmm2, sign_bit_cos); } - - -inline __m128 sse_mathfun_trunc_ps(__m128 a) { - return _mm_cvtepi32_ps(_mm_cvttps_epi32(a)); -} - - -inline __m128 sse_mathfun_floor_ps(__m128 a) { - __m128 b = sse_mathfun_trunc_ps(a); - // If b > a, subtract 1 fom b - b = _mm_sub_ps(b, _mm_and_ps(_mm_cmpgt_ps(b, a), sse_mathfun_one_ps())); - return b; } - - -inline __m128 sse_mathfun_ceil_ps(__m128 a) { - __m128 b = sse_mathfun_trunc_ps(a); - // If b < a, add 1 to b - b = _mm_add_ps(b, _mm_and_ps(_mm_cmplt_ps(b, a), sse_mathfun_one_ps())); - return b; -} - - -inline __m128 sse_mathfun_round_ps(__m128 a) { - // TODO Incorrect 
for -0.5, -1.5, etc. - return sse_mathfun_floor_ps(_mm_add_ps(a, _mm_set_ps1(0.5f))); -} - - -inline __m128 sse_mathfun_fmod_ps(__m128 a, __m128 b) { - __m128 c = _mm_div_ps(a, b); - c = sse_mathfun_trunc_ps(c); - c = _mm_mul_ps(c, b); - return _mm_sub_ps(a, c); -} - - -inline __m128 sse_mathfun_fabs_ps(__m128 a) { - __m128i minus1 = _mm_set1_epi32(-1); - __m128 abs_mask = _mm_castsi128_ps(_mm_srli_epi32(minus1, 1)); - return _mm_and_ps(abs_mask, a); -} diff --git a/src/engine/Engine.cpp b/src/engine/Engine.cpp index 71e68b2e..ef7ac4de 100644 --- a/src/engine/Engine.cpp +++ b/src/engine/Engine.cpp @@ -17,11 +17,15 @@ namespace rack { namespace engine { -static void disableDenormals() { +static void initMXCSR() { // Set CPU to flush-to-zero (FTZ) and denormals-are-zero (DAZ) mode // https://software.intel.com/en-us/node/682949 _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + // Reset other flags + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + _MM_SET_EXCEPTION_MASK(0); + _MM_SET_EXCEPTION_STATE(0); } @@ -405,7 +409,7 @@ static void Engine_run(Engine* that) { // Set up thread system::setThreadName("Engine"); // system::setThreadRealTime(); - disableDenormals(); + initMXCSR(); internal->frame = 0; // Every time the that waits and locks a mutex, it steps this many frames @@ -835,7 +839,7 @@ void Engine::updateParamHandle(ParamHandle* paramHandle, int moduleId, int param void EngineWorker::run() { system::setThreadName("Engine worker"); system::setThreadRealTime(engine->internal->realTime); - disableDenormals(); + initMXCSR(); while (1) { engine->internal->engineBarrier.wait();