From ea8eca4cd35896021e25a250e2d5ed9ec50ec8c3 Mon Sep 17 00:00:00 2001
From: Andrew Belt <andrewpbelt@gmail.com>
Date: Mon, 20 May 2019 00:31:23 -0400
Subject: [PATCH] Add simd to various dsp functions.

---
 include/dsp/common.hpp     |   4 +-
 include/dsp/filter.hpp     | 194 ++++++++++++++++++++++---------------
 include/dsp/ode.hpp        |  30 +++---
 include/dsp/window.hpp     |  30 +++---
 include/simd/functions.hpp |  16 +++
 include/simd/vector.hpp    |   5 -
 6 files changed, 167 insertions(+), 112 deletions(-)
diff --git a/include/dsp/common.hpp b/include/dsp/common.hpp
index a577a459..4143f187 100644
--- a/include/dsp/common.hpp
+++ b/include/dsp/common.hpp
@@ -87,9 +87,9 @@ T exponentialBipolar(T b, T x) {
 
 
 /** Useful for storing arrays of samples in ring buffers and casting them to `float*` to be used by interleaved processors, like SampleRateConverter */
-template <size_t CHANNELS>
+template <size_t CHANNELS, typename T = float>
 struct Frame {
-	float samples[CHANNELS];
+	T samples[CHANNELS];
 };
 
 
diff --git a/include/dsp/filter.hpp b/include/dsp/filter.hpp
index 909c6137..d0fdee69 100644
--- a/include/dsp/filter.hpp
+++ b/include/dsp/filter.hpp
@@ -6,128 +6,168 @@ namespace rack {
 namespace dsp {
 
 
-struct RCFilter {
-	float c = 0.f;
-	float xstate[1] = {};
-	float ystate[1] = {};
+/** The simplest possible analog filter using an Euler solver.
+https://en.wikipedia.org/wiki/RC_circuit
+Use two RC filters in series for a bandpass filter.
+*/
+template <typename T = float>
+struct TRCFilter {
+	T c = 0.f;
+	T xstate[1];
+	T ystate[1];
+
+	TRCFilter() {
+		reset();
+	}
+
+	void reset() {
+		xstate[0] = 0.f;
+		ystate[0] = 0.f;
+	}
 
-	// `r` is the ratio between the cutoff frequency and sample rate, i.e. r = f_c / f_s
-	void setCutoff(float r) {
+	/** Sets the cutoff frequency.
+	`r` is the ratio between the cutoff frequency and sample rate, i.e. r = f_c / f_s
+	*/
+	void setCutoff(T r) {
 		c = 2.f / r;
 	}
-	void process(float x) {
-		float y = (x + xstate[0] - ystate[0] * (1 - c)) / (1 + c);
+	void process(T x) {
+		T y = (x + xstate[0] - ystate[0] * (1 - c)) / (1 + c);
 		xstate[0] = x;
 		ystate[0] = y;
 	}
-	float lowpass() {
+	T lowpass() {
 		return ystate[0];
 	}
-	float highpass() {
+	T highpass() {
 		return xstate[0] - ystate[0];
 	}
 };
 
+typedef TRCFilter<> RCFilter;
+
 
-struct PeakFilter {
-	float state = 0.f;
-	float c = 0.f;
+/** Applies exponential smoothing to a signal with the ODE
+\f$ \frac{dy}{dt} = x \lambda \f$.
+*/
+template <typename T = float>
+struct TExponentialFilter {
+	T out = 0.f;
+	T lambda = 0.f;
+
+	void reset() {
+		out = 0.f;
+	}
 
-	/** Rate is lambda / sampleRate */
-	void setRate(float r) {
-		c = 1.f - r;
+	void setLambda(T lambda) {
+		this->lambda = lambda;
 	}
-	void process(float x) {
-		if (x > state)
-			state = x;
-		state *= c;
+
+	T process(T deltaTime, T in) {
+		T y = out + (in - out) * lambda * deltaTime;
+		// If no change was made between the old and new output, assume T granularity is too small and snap output to input
+		out = simd::ifelse(out == y, in, y);
+		return out;
 	}
-	float peak() {
-		return state;
+
+	DEPRECATED T process(T in) {
+		return process(1.f, in);
 	}
 };
 
+typedef TExponentialFilter<> ExponentialFilter;
+
+
+/** Like ExponentialFilter but jumps immediately to higher values.
+*/
+template <typename T = float>
+struct TPeakFilter {
+	T out = 0.f;
+	T lambda = 0.f;
+
+	void reset() {
+		out = 0.f;
+	}
 
-struct SlewLimiter {
-	float rise = 0.f;
-	float fall = 0.f;
-	float out = NAN;
-
-	float process(float deltaTime, float in) {
-		if (std::isnan(out)) {
-			out = in;
-		}
-		else if (out < in) {
-			float y = out + rise * deltaTime;
-			out = std::fmin(y, in);
-		}
-		else if (out > in) {
-			float y = out - fall * deltaTime;
-			out = std::fmax(y, in);
-		}
+	void setLambda(T lambda) {
+		this->lambda = lambda;
+	}
+
+	T process(T deltaTime, T in) {
+		T y = out + (in - out) * lambda * deltaTime;
+		out = simd::fmax(y, in);
 		return out;
 	}
-	DEPRECATED float process(float in) {
-		return process(1.f, in);
+	/** Use the return value of process() instead. */
+	DEPRECATED T peak() {
+		return out;
 	}
-	DEPRECATED void setRiseFall(float rise, float fall) {
-		this->rise = rise;
-		this->fall = fall;
+	/** Use setLambda() instead. */
+	DEPRECATED void setRate(T r) {
+		lambda = 1.f - r;
+	}
+	DEPRECATED T process(T x) {
+		return process(1.f, x);
 	}
 };
 
+typedef TPeakFilter<> PeakFilter;
+
+
+template <typename T = float>
+struct TSlewLimiter {
+	T out = 0.f;
+	T rise = 0.f;
+	T fall = 0.f;
 
-struct ExponentialSlewLimiter {
-	float riseLambda = 0.f;
-	float fallLambda = 0.f;
-	float out = NAN;
-
-	float process(float deltaTime, float in) {
-		if (std::isnan(out)) {
-			out = in;
-		}
-		else if (out < in) {
-			float y = out + (in - out) * riseLambda * deltaTime;
-			out = (out == y) ? in : y;
-		}
-		else if (out > in) {
-			float y = out + (in - out) * fallLambda * deltaTime;
-			out = (out == y) ? in : y;
-		}
+	void reset() {
+		out = 0.f;
+	}
+
+	void setRiseFall(T rise, T fall) {
+		this->rise = rise;
+		this->fall = fall;
+	}
+	T process(T deltaTime, T in) {
+		out = simd::clamp(in, out - fall * deltaTime, out + rise * deltaTime);
 		return out;
 	}
-	DEPRECATED float process(float in) {
+	DEPRECATED T process(T in) {
 		return process(1.f, in);
 	}
 };
 
+typedef TSlewLimiter<> SlewLimiter;
 
-/** Applies exponential smoothing to a signal with the ODE
-\f$ \frac{dy}{dt} = x \lambda \f$.
-*/
-struct ExponentialFilter {
-	float out = 0.f;
-	float lambda = 0.f;
+
+template <typename T = float>
+struct TExponentialSlewLimiter {
+	T out = 0.f;
+	T riseLambda = 0.f;
+	T fallLambda = 0.f;
 
 	void reset() {
 		out = 0.f;
 	}
 
-	float process(float deltaTime, float in) {
-		float y = out + (in - out) * lambda * deltaTime;
-		// If no change was made between the old and new output, assume float granularity is too small and snap output to input
-		if (out == y)
-			out = in;
-		else
-			out = y;
+	void setRiseFall(T riseLambda, T fallLambda) {
+		this->riseLambda = riseLambda;
+		this->fallLambda = fallLambda;
+	}
+	T process(T deltaTime, T in) {
+		T lambda = simd::ifelse(in > out, riseLambda, fallLambda);
+		T y = out + (in - out) * lambda * deltaTime;
+		// If the change from the old out to the new out is too small for floats, set `in` directly.
+		out = simd::ifelse(out == y, in, y);
 		return out;
 	}
-
-	DEPRECATED float process(float in) {
+	DEPRECATED T process(T in) {
 		return process(1.f, in);
 	}
 };
 
+typedef TExponentialSlewLimiter<> ExponentialSlewLimiter;
+
 
 } // namespace dsp
 } // namespace rack
diff --git a/include/dsp/ode.hpp b/include/dsp/ode.hpp
index f50fc5b1..9b0345be 100644
--- a/include/dsp/ode.hpp
+++ b/include/dsp/ode.hpp
@@ -26,9 +26,9 @@ For example, the following solves the system x''(t) = -x(t) using a fixed timest
 */
 
 /** Solves an ODE system using the 1st order Euler method */
-template <typename F>
-void stepEuler(float t, float dt, float x[], int len, F f) {
-	float k[len];
+template <typename T, typename F>
+void stepEuler(T t, T dt, T x[], int len, F f) {
+	T k[len];
 
 	f(t, x, k);
 	for (int i = 0; i < len; i++) {
@@ -37,11 +37,11 @@ void stepEuler(float t, float dt, float x[], int len, F f) {
 }
 
 /** Solves an ODE system using the 2nd order Runge-Kutta method */
-template <typename F>
-void stepRK2(float t, float dt, float x[], int len, F f) {
-	float k1[len];
-	float k2[len];
-	float yi[len];
+template <typename T, typename F>
+void stepRK2(T t, T dt, T x[], int len, F f) {
+	T k1[len];
+	T k2[len];
+	T yi[len];
 
 	f(t, x, k1);
 
@@ -56,13 +56,13 @@ void stepRK2(float t, float dt, float x[], int len, F f) {
 }
 
 /** Solves an ODE system using the 4th order Runge-Kutta method */
-template <typename F>
-void stepRK4(float t, float dt, float x[], int len, F f) {
-	float k1[len];
-	float k2[len];
-	float k3[len];
-	float k4[len];
-	float yi[len];
+template <typename T, typename F>
+void stepRK4(T t, T dt, T x[], int len, F f) {
+	T k1[len];
+	T k2[len];
+	T k3[len];
+	T k4[len];
+	T yi[len];
 
 	f(t, x, k1);
 
diff --git a/include/dsp/window.hpp b/include/dsp/window.hpp
index 4cc74bd1..010c7734 100644
--- a/include/dsp/window.hpp
+++ b/include/dsp/window.hpp
@@ -10,8 +10,9 @@ namespace dsp {
 p: proportion from [0, 1], usually `i / (len - 1)`
 https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
 */
-inline float hann(float p) {
-	return 0.5f * (1.f - std::cos(2*M_PI * p));
+template <typename T>
+inline T hann(T p) {
+	return 0.5f * (1.f - simd::cos(2*M_PI * p));
 }
 
 /** Multiplies the Hann window by a signal `x` of length `len` in-place. */
@@ -25,11 +26,12 @@ inline void hannWindow(float *x, int len) {
 https://en.wikipedia.org/wiki/Window_function#Blackman_window
 A typical alpha value is 0.16.
 */
-inline float blackman(float alpha, float p) {
+template <typename T>
+inline T blackman(T alpha, T p) {
 	return
 		+ (1 - alpha) / 2.f
-		- 1 / 2.f * std::cos(2*M_PI * p)
-		+ alpha / 2.f * std::cos(4*M_PI * p);
+		- 1 / 2.f * simd::cos(2*M_PI * p)
+		+ alpha / 2.f * simd::cos(4*M_PI * p);
 }
 
 inline void blackmanWindow(float alpha, float *x, int len) {
@@ -42,12 +44,13 @@ inline void blackmanWindow(float alpha, float *x, int len) {
 /** Blackman-Nuttall window function.
 https://en.wikipedia.org/wiki/Window_function#Blackman%E2%80%93Nuttall_window
 */
-inline float blackmanNuttall(float p) {
+template <typename T>
+inline T blackmanNuttall(T p) {
 	return
 		+ 0.3635819f
-		- 0.4891775f * std::cos(2*M_PI * p)
-		+ 0.1365995f * std::cos(4*M_PI * p)
-		- 0.0106411f * std::cos(6*M_PI * p);
+		- 0.4891775f * simd::cos(2*M_PI * p)
+		+ 0.1365995f * simd::cos(4*M_PI * p)
+		- 0.0106411f * simd::cos(6*M_PI * p);
 }
 
 inline void blackmanNuttallWindow(float *x, int len) {
@@ -59,12 +62,13 @@ inline void blackmanNuttallWindow(float *x, int len) {
 /** Blackman-Harris window function.
 https://en.wikipedia.org/wiki/Window_function#Blackman%E2%80%93Harris_window
 */
-inline float blackmanHarris(float p) {
+template <typename T>
+inline T blackmanHarris(T p) {
 	return
 		+ 0.35875f
-		- 0.48829f * std::cos(2*M_PI * p)
-		+ 0.14128f * std::cos(4*M_PI * p)
-		- 0.01168f * std::cos(6*M_PI * p);
+		- 0.48829f * simd::cos(2*M_PI * p)
+		+ 0.14128f * simd::cos(4*M_PI * p)
+		- 0.01168f * simd::cos(6*M_PI * p);
 }
 
 inline void blackmanHarrisWindow(float *x, int len) {
diff --git a/include/simd/functions.hpp b/include/simd/functions.hpp
index cfd075e0..0ac446ef 100644
--- a/include/simd/functions.hpp
+++ b/include/simd/functions.hpp
@@ -52,6 +52,12 @@ inline f32_4 log10(f32_4 x) {
 	return f32_4(sse_mathfun_log_ps(x.v)) / std::log(10.f);
 }
 
+using std::log2;
+
+inline f32_4 log2(f32_4 x) {
+	return f32_4(sse_mathfun_log_ps(x.v)) / std::log(2.f);
+}
+
 using std::exp;
 
 inline f32_4 exp(f32_4 x) {
@@ -118,6 +124,16 @@ inline f32_4 pow(float a, f32_4 b) {
 
 // Nonstandard functions
 
+inline float ifelse(bool cond, float a, float b) {
+	return cond ? a : b;
+}
+
+/** Given a mask, returns a if mask is 0xffffffff per element, b if mask is 0x00000000 */
+inline f32_4 ifelse(f32_4 mask, f32_4 a, f32_4 b) {
+	return (a & mask) | andnot(mask, b);
+}
+
+
 /** Returns the approximate reciprocal square root.
 Much faster than `1/sqrt(x)`.
 */
diff --git a/include/simd/vector.hpp b/include/simd/vector.hpp
index b78dda27..fea42387 100644
--- a/include/simd/vector.hpp
+++ b/include/simd/vector.hpp
@@ -187,11 +187,6 @@ inline f32_4 andnot(const f32_4 &a, const f32_4 &b) {
 	return f32_4(_mm_andnot_ps(a.v, b.v));
 }
 
-/** Given a mask, returns a if mask is 0xffffffff per element, b if mask is 0x00000000 */
-inline f32_4 ifelse(const f32_4 &mask, const f32_4 &a, const f32_4 &b) {
-	return (a & mask) | andnot(mask, b);
-}
-
 
 } // namespace simd
 } // namespace rack