diff --git a/src/VCO.cpp b/src/VCO.cpp
index b1eea10..5cb955e 100644
--- a/src/VCO.cpp
+++ b/src/VCO.cpp
@@ -3,30 +3,27 @@
 
 // TODO: Remove these DSP classes after released in Rack SDK
 
-inline simd::float_4 ifelse(simd::float_4 mask, simd::float_4 a, simd::float_4 b) {
-	return simd::float_4(_mm_blendv_ps(b.v, a.v, mask.v));
-}
-
-/** Approximates sin(2pi x) in the domain [0, 1].
-Evaluates a 7th order odd polynomial on the range [0, 0.25] after folding the argument.
-Optimized for THD and max error.
-THD -84 dB, max error 1e-04
+/** Evaluates sin(pi x) for x in [-1, 1].
+9th order polynomial with roots at 0, -1, and 1.
+Optimized coefficients for best THD (-103.7 dB) and max absolute error (6.68e-06).
 */
 template <typename T>
-T sin2pi_fast(T x) {
-	// Get sign of result
-	T sign = ::ifelse(x >= T(0.5), -1, 1);
-	// Shift if negative
-	x = ::ifelse(x >= T(0.5), x - T(0.5), x);
-	// Flip if >0.25
-	x = ::ifelse(x >= T(0.25), T(0.5) - x, x);
+inline T sin_pi_9(T x) {
 	T x2 = x * x;
-	const T c1 = 6.281422578399;
-	const T c3 = -41.122058514502;
-	const T c5 = 74.010711837743;
-	return sign * x * (c1 + x2 * (c3 + x2 * c5));
+	return x * (T(1) - x2) * (T(3.141521108990) + x2 * (T(-2.024773594411) + x2 * (T(0.517493161073) + x2 * T(-0.063691343423))));
 }
 
+/** Catmull-Rom cubic interpolation
+t is the fractional position between y1 and y2.
+*/
+template <typename T>
+T cubicInterp(T y0, T y1, T y2, T y3, T t) {
+	T t2 = t * t;
+	T t3 = t2 * t;
+	return y1 + T(0.5) * t * (y2 - y0)
+		+ t2 * (y0 - T(2.5)*y1 + T(2)*y2 - T(0.5)*y3)
+		+ t3 * (T(-0.5)*y0 + T(1.5)*y1 - T(1.5)*y2 + T(0.5)*y3);
+}
 
 /** One-pole low-pass filter, exponential moving average.
 -6 dB/octave slope.
@@ -50,7 +47,7 @@ struct OnePoleLowpass {
 
 	/** Advances the state with input x. Returns the output. */
 	template <typename T>
-	T process(State<T>& s, T x) {
+	T process(State<T>& s, T x) const {
 		s.y += alpha * (x - s.y);
 		return s.y;
 	}
@@ -77,7 +74,7 @@ Has a pole at (1 - alpha) and a zero at 1.
 */
 struct OnePoleHighpass : OnePoleLowpass {
 	template <typename T>
-	T process(State<T>& s, T x) {
+	T process(State<T>& s, T x) const {
 		return x - OnePoleLowpass::process(s, x);
 	}
 
@@ -93,82 +90,103 @@ struct OnePoleHighpass : OnePoleLowpass {
 };
 
 
-/** Computes the minimum-phase bandlimited step (MinBLEP)
-Z: number of zero-crossings on each side of the original symmetric sync signal
-O: oversample factor
-output: must be length `(2 * Z) * O`.
-First sample is >0. Last sample is 1.
-
-Algorithm from "Hard Sync Without Aliasing" by Eli Brandt (2001).
-https://www.cs.cmu.edu/~eli/papers/icmc01-hardsync.pdf
-*/
-void minBlepImpulse(int Z, int O, float* output);
-
-
-template <int Z, int O, typename T = float>
-struct MinBlepGenerator {
-	T buffer[2 * Z] = {};
-	int bufferIndex = 0;
-
-	/** Reordered impulse response for adjacent access, minus 1.0.
-	Padded at end with zeros so impulseReordered[o][2 * Z] can be loaded into a simd::float_4.
+template <int Z, int O>
+struct MinBlep {
+	/** Reordered impulse response for cubic interpolation, minus 1.0.
+	Z dimension has +1 padding at start (for z-1 wrap) and +4 at end (for SIMD + z+1 wrap).
+	Access via getImpulse() which handles o wrapping and z offset.
 	*/
-	alignas(16) float impulseReordered[O][2 * Z + 4] = {};
+	float impulseReordered[O][1 + 2 * Z + 4] = {};
 
-	MinBlepGenerator() {
+	MinBlep() {
 		float impulse[2 * Z][O];
 		dsp::minBlepImpulse(Z, O, &impulse[0][0]);
 
-		// Transpose array and pre-subtract 1.
+		// Fill impulseReordered with transposed data, minus 1.
+		// Storage index = z + 1 (to allow z = -1 access at index 0)
 		for (int o = 0; o < O; o++) {
+			// z = -1: before impulse starts
+			impulseReordered[o][0] = -1.f;
+			// z = 0 to 2*Z-1: actual impulse data
 			for (int z = 0; z < 2 * Z; z++) {
-				impulseReordered[o][z] = impulse[z][o] - 1.f;
+				impulseReordered[o][1 + z] = impulse[z][o] - 1.f;
+			}
+			// z = 2*Z to 2*Z+3: after impulse ends (for SIMD padding)
+			for (int z = 2 * Z; z < 2 * Z + 4; z++) {
+				impulseReordered[o][1 + z] = 0.f;
 			}
 		}
 	}
 
-	/** Places a discontinuity with magnitude `x` at -1 < p <= 0 relative to the current frame.
-	For example if a square wave jumps from 1 to -1 at the subsample time 0.1 frames ago, use insertDiscontinuity(-0.1, -2.0)
+	/** Get pointer to impulse data, handling o wrapping and z offset. */
+	const float* getImpulse(int o, int z) const {
+		int index = z * O + o;
+		z = index / O;
+		o = index % O;
+		if (o < 0) {
+			z -= 1;
+			o += O;
+		}
+		return &impulseReordered[o][z + 1];
+	}
+
+	template <typename T = float>
+	struct State {
+		// Double buffer of length 2 * Z
+		T buffer[4 * Z] = {};
+		int32_t bufferIndex = 0;
+	};
+
+	/** Places a discontinuity with magnitude `x` at 0 < p <= 1 relative to the current frame.
+	For example if a square wave will jump from 1 to -1 in 0.1 frames, use insertDiscontinuity(0.1, -2.0).
+
+	Note: In the deprecated MinBlepGenerator, `p` was in the range (-1, 0], so add 1 to p if updating to MinBlep.
 	*/
-	void insertDiscontinuity(float p, T x) {
-		if (!(-1 < p && p <= 0))
+	template <typename T>
+	void insertDiscontinuity(State<T>& s, float p, T x) const {
+		if (!(0 < p && p <= 1))
 			return;
 
-		// Calculate oversampling index and fractional part
-		float subsample = -p * O;
+		// Calculate impulse array index and fractional part
+		float subsample = (1 - p) * O;
 		int o = (int) subsample;
-		float frac = subsample - o;
+		float t = subsample - o;
 
-		// For each zero crossing, interpolate impulse response between oversample indices
-		// Compute 4 at a time with SIMD
+		// For each zero crossing, cubic interpolate impulse response
 		for (int z = 0; z < 2 * Z; z += 4) {
-			simd::float_4 ir0 = simd::float_4::load(&impulseReordered[o][z]);
-			// If oversample index reaches next zero crossing, wrap to next z
-			simd::float_4 ir1 = simd::float_4::load((o + 1 < O) ? &impulseReordered[o + 1][z] : &impulseReordered[0][z + 1]);
-			simd::float_4 ir = (ir0 + (ir1 - ir0) * frac);
+			simd::float_4 y0 = simd::float_4::load(getImpulse(o - 1, z));
+			simd::float_4 y1 = simd::float_4::load(getImpulse(o, z));
+			simd::float_4 y2 = simd::float_4::load(getImpulse(o + 1, z));
+			simd::float_4 y3 = simd::float_4::load(getImpulse(o + 2, z));
+			simd::float_4 ir = cubicInterp(y0, y1, y2, y3, T(t));
 
+			// Write to double buffer
 			for (int zz = 0; zz < 4; zz++) {
-				buffer[(bufferIndex + z + zz) % (2 * Z)] += ir[zz] * x;
+				s.buffer[s.bufferIndex + z + zz] += ir[zz] * x;
 			}
 		}
-		// for (int z = 0; z < 2 * Z; z++) {
-		// 	float ir0 = impulseReordered[o][z];
-		// 	float ir1 = (o + 1 < O) ? impulseReordered[o + 1][z] : impulseReordered[0][z + 1];
-		// 	float ir = (ir0 + (ir1 - ir0) * frac);
-		// 	buffer[(bufferIndex + z) % (2 * Z)] += ir * x;
-		// }
 	}
 
 	/** Should be called every frame after inserting any discontinuities. */
-	T process() {
-		T v = buffer[bufferIndex];
-		buffer[bufferIndex] = T(0);
-		bufferIndex = (bufferIndex + 1) % (2 * Z);
+	template <typename T>
+	T process(State<T>& s) const {
+		T v = s.buffer[s.bufferIndex];
+		s.buffer[s.bufferIndex] = T(0);
+		s.bufferIndex++;
+		if (s.bufferIndex >= 2 * Z) {
+			// Move second half of buffer to beginning
+			std::memcpy(s.buffer, s.buffer + 2 * Z, 2 * Z * sizeof(T));
+			std::memset(s.buffer + 2 * Z, 0, 2 * Z * sizeof(T));
+			s.bufferIndex = 0;
+		}
 		return v;
 	}
 };
 
 
+static MinBlep<16, 16> minBlep;
+
+
 template <typename T>
 struct VCOProcessor {
 	T phase = 0.f;
@@ -183,12 +201,12 @@ struct VCOProcessor {
 	OnePoleHighpass::State<T> dcFilterStateTri;
 	OnePoleHighpass::State<T> dcFilterStateSin;
 
-	MinBlepGenerator<16, 16, T> sqrMinBlep;
-	MinBlepGenerator<16, 16, T> sawMinBlep;
-	MinBlepGenerator<16, 16, T> sinMinBlep;
+	MinBlep<16, 16>::State<T> sqrMinBlep;
+	MinBlep<16, 16>::State<T> sawMinBlep;
+	MinBlep<16, 16>::State<T> sinMinBlep;
 
 	void setSampleTime(float sampleTime) {
-		dcFilter.setCutoff(std::min(0.4f, 40.f * sampleTime));
+		dcFilter.setCutoff(std::min(0.4f, 20.f * sampleTime));
 	}
 
 	struct Frame {
@@ -241,9 +259,9 @@ struct VCOProcessor {
 					if (wrapM & (1 << i)) {
 						T mask = simd::movemaskInverse<T>(1 << i);
 						// TODO: MinBlepGenerator::insertDiscontinuity() should handle subframes outside the range -1 < p <= 0 instead of failing silently.
-						float p = clamp(wrapCrossing[i] - 1.f, -1.f, 0.f);
+						float p = clamp(wrapCrossing[i], 0.f, 1.f);
 						T x = mask & (2.f * syncDirection);
-						sqrMinBlep.insertDiscontinuity(p, x);
+						minBlep.insertDiscontinuity(sqrMinBlep, p, x);
 					}
 				}
 			}
@@ -261,9 +279,9 @@ struct VCOProcessor {
 				for (int i = 0; i < frame.channels; i++) {
 					if (pw & (1 << i)) {
 						T mask = simd::movemaskInverse<T>(1 << i);
-						float p = clamp(pulseCrossing[i] - 1.f, -1.f, 0.f);
+						float p = clamp(pulseCrossing[i], 0.f, 1.f);
 						T x = mask & (-2.f * syncDirection);
-						sqrMinBlep.insertDiscontinuity(p, x);
+						minBlep.insertDiscontinuity(sqrMinBlep, p, x);
 					}
 				}
 			}
@@ -278,18 +296,19 @@ struct VCOProcessor {
 				for (int i = 0; i < frame.channels; i++) {
 					if (halfMask & (1 << i)) {
 						T mask = simd::movemaskInverse<T>(1 << i);
-						float p = halfCrossing[i] - 1.f;
+						float p = halfCrossing[i];
 						T x = mask & (-2.f * syncDirection);
-						sawMinBlep.insertDiscontinuity(p, x);
+						minBlep.insertDiscontinuity(sawMinBlep, p, x);
 					}
 				}
 			}
 		}
 
 		// Detect sync
-		// Might be NAN or outside of [0, 1) range
 		if (frame.syncEnabled) {
 			T deltaSync = frame.sync - lastSyncValue;
+			// Sample position where sync crosses 0.
+			// Might be NAN or outside of (0, 1] range
 			T syncCrossing = -lastSyncValue / deltaSync;
 			lastSyncValue = frame.sync;
 			T sync = (0.f < syncCrossing) & (syncCrossing <= 1.f) & (frame.sync >= 0.f);
@@ -304,21 +323,21 @@ struct VCOProcessor {
 					for (int i = 0; i < frame.channels; i++) {
 						if (syncMask & (1 << i)) {
 							T mask = simd::movemaskInverse<T>(1 << i);
-							float p = syncCrossing[i] - 1.f;
-							T x;
-							// Assume that hard-syncing a square always resets it to HIGH
+							float p = syncCrossing[i];
 							if (frame.sqrEnabled || frame.triEnabled) {
-								x = mask & (1.f - sqrState);
+								// Assume that hard-syncing a square always resets it to HIGH
+								T x = mask & (1.f - sqrState);
 								sqrState = simd::ifelse(mask, 1.f, sqrState);
-								sqrMinBlep.insertDiscontinuity(p, x);
+								minBlep.insertDiscontinuity(sqrMinBlep, p, x);
+								DEBUG("%f %f %f", p, x[i], sqrState[i]);
 							}
 							if (frame.sawEnabled) {
-								x = mask & (saw(newPhase) - saw(phase));
-								sawMinBlep.insertDiscontinuity(p, x);
+								T x = mask & (saw(newPhase) - saw(phase));
+								minBlep.insertDiscontinuity(sawMinBlep, p, x);
 							}
 							if (frame.sinEnabled) {
-								x = mask & (sin(newPhase) - sin(phase));
-								sinMinBlep.insertDiscontinuity(p, x);
+								T x = mask & (sin(newPhase) - sin(phase));
+								minBlep.insertDiscontinuity(sinMinBlep, p, x);
 							}
 						}
 					}
@@ -330,14 +349,14 @@ struct VCOProcessor {
 		// Saw
 		if (frame.sawEnabled) {
 			frame.saw = saw(phase);
-			frame.saw += sawMinBlep.process();
+			frame.saw += minBlep.process(sawMinBlep);
 			frame.saw = dcFilter.process(dcFilterStateSaw, frame.saw);
 		}
 
 		// Square
 		if (frame.sqrEnabled || frame.triEnabled) {
 			frame.sqr = sqrState;
-			frame.sqr += sqrMinBlep.process();
+			frame.sqr += minBlep.process(sqrMinBlep);
 			T triSqr = frame.sqr;
 			frame.sqr = dcFilter.process(dcFilterStateSqr, frame.sqr);
 
@@ -359,7 +378,7 @@ struct VCOProcessor {
 		// Sin
 		if (frame.sinEnabled) {
 			frame.sin = sin(phase);
-			frame.sin += sinMinBlep.process();
+			frame.sin += minBlep.process(sinMinBlep);
 			frame.sin = dcFilter.process(dcFilterStateSin, frame.sin);
 		}
 	}
@@ -377,8 +396,7 @@ struct VCOProcessor {
 		return 1 - 4 * simd::fmin(simd::fabs(phase - 0.25f), simd::fabs(phase - 1.25f));
 	}
 	static T sin(T phase) {
-		return sin2pi_fast(phase);
-		// return simd::sin(2.f * M_PI * phase);
+		return sin_pi_9(2 * phase - 1);
 	}
 };