From 617f6ddafd8aea581d41b68c70ed428d9be79001 Mon Sep 17 00:00:00 2001
From: Andrew Belt <andrewpbelt@gmail.com>
Date: Thu, 13 Mar 2025 18:29:56 -0400
Subject: [PATCH] Refactor all Barrier implementations. Explicitly use
 appropriate memory ordering everywhere. Use __yield() on ARM64.

---
 src/engine/Engine.cpp | 121 ++++++++++++++++++++++++------------------
 1 file changed, 69 insertions(+), 52 deletions(-)
diff --git a/src/engine/Engine.cpp b/src/engine/Engine.cpp
index 9ed9a245..305fd5cb 100644
--- a/src/engine/Engine.cpp
+++ b/src/engine/Engine.cpp
@@ -21,70 +21,79 @@ namespace rack {
 namespace engine {
 
 
-/** Barrier based on mutexes.
-Not finished or tested, do not use.
+inline void cpuPause() {
+#if defined ARCH_X64
+	_mm_pause();
+#elif defined ARCH_ARM64
+	__yield();
+#endif
+}
+
+
+/** Multiple-phase barrier based on C++ mutexes, as a reference.
 */
 struct Barrier {
-	int count = 0;
-	uint8_t step = 0;
-	int threads = 0;
+	size_t threads = 0;
+	size_t count = 0;
+	size_t phase = 0;
 
 	std::mutex mutex;
 	std::condition_variable cv;
 
-	void setThreads(int threads) {
+	void setThreads(size_t threads) {
 		this->threads = threads;
 	}
 
 	void wait() {
 		std::unique_lock<std::mutex> lock(mutex);
-		uint8_t s = step;
+		size_t currentPhase = phase;
+
+		// Check if we're the last thread.
 		if (++count >= threads) {
-			// We're the last thread. Reset next phase.
+			// Advance phase and reset count
 			count = 0;
-			// Allow other threads to exit wait()
-			step++;
+			phase++;
+			// Notify all other threads
 			cv.notify_all();
 			return;
 		}
 
+		// Unlock and wait on phase to change
 		cv.wait(lock, [&] {
-			return step != s;
+			return phase != currentPhase;
 		});
 	}
 };
 
 
-/** 2-phase barrier based on spin-locking.
+/** Multiple-phase barrier based on spin-locking.
 */
 struct SpinBarrier {
-	std::atomic<int> count{0};
-	std::atomic<uint8_t> step{0};
-	int threads = 0;
+	size_t threads = 0;
+	std::atomic<size_t> count{0};
+	std::atomic<size_t> phase{0};
 
-	/** Must be called when no threads are calling wait().
-	*/
-	void setThreads(int threads) {
+	void setThreads(size_t threads) {
 		this->threads = threads;
 	}
 
 	void wait() {
-		uint8_t s = step;
-		if (count.fetch_add(1, std::memory_order_acquire) + 1 >= threads) {
-			// We're the last thread. Reset next phase.
-			count = 0;
-			// Allow other threads to exit wait()
-			step++;
+		size_t currentPhase = phase.load(std::memory_order_acquire);
+
+		if (count.fetch_add(1, std::memory_order_acq_rel) + 1 >= threads) {
+			// Reset count
+			count.store(0, std::memory_order_release);
+			// Advance phase, which notifies all other threads, which are all spinning
+			phase.fetch_add(1, std::memory_order_release);
 			return;
 		}
 
-		// Spin until the last thread begins waiting
+		// Spin until the phase is changed by the last thread
 		while (true) {
-			if (step.load(std::memory_order_relaxed) != s)
+			if (phase.load(std::memory_order_acquire) != currentPhase)
 				return;
-#if defined ARCH_X64
-			__builtin_ia32_pause();
-#endif
+			// std::this_thread::yield();
+			cpuPause();
 		}
 	}
 };
@@ -95,51 +104,59 @@ yield() should be called if it is likely that all threads will block for a while
 Saves CPU power after yield is called.
 */
 struct HybridBarrier {
-	std::atomic<int> count{0};
-	std::atomic<uint8_t> step{0};
-	int threads = 0;
-
+	size_t threads = 0;
+	std::atomic<size_t> count{0};
+	std::atomic<size_t> phase{0};
 	std::atomic<bool> yielded{false};
+
 	std::mutex mutex;
 	std::condition_variable cv;
 
-	void setThreads(int threads) {
+	void setThreads(size_t threads) {
 		this->threads = threads;
 	}
 
 	void yield() {
-		yielded = true;
+		yielded.store(true, std::memory_order_release);
 	}
 
 	void wait() {
-		uint8_t s = step;
-		if (count.fetch_add(1, std::memory_order_acquire) + 1 >= threads) {
-			// We're the last thread. Reset next phase.
-			count = 0;
-			bool wasYielded = yielded;
-			yielded = false;
-			// Allow other threads to exit wait()
-			step++;
-			if (wasYielded) {
+		size_t currentPhase = phase.load(std::memory_order_acquire);
+
+		// Check if we're the last thread
+		if (count.fetch_add(1, std::memory_order_acq_rel) + 1 >= threads) {
+			// Reset count
+			count.store(0, std::memory_order_release);
+
+			// If yielded, advance phase and notify all other threads
+			if (yielded.load(std::memory_order_acquire)) {
 				std::unique_lock<std::mutex> lock(mutex);
+				yielded.store(false, std::memory_order_release);
+				phase.fetch_add(1, std::memory_order_release);
 				cv.notify_all();
+				return;
 			}
+
+			// Advance phase, which notifies all other threads, which are all spinning
+			phase.fetch_add(1, std::memory_order_release);
 			return;
 		}
 
-		// Spin until the last thread begins waiting
-		while (!yielded.load(std::memory_order_relaxed)) {
-			if (step.load(std::memory_order_relaxed) != s)
+		// Spin until the phase is changed by the last thread, or yield() is called
+		while (true) {
+			if (phase.load(std::memory_order_acquire) != currentPhase)
 				return;
-#if defined ARCH_X64
-			__builtin_ia32_pause();
-#endif
+			if (yielded.load(std::memory_order_acquire))
+				break;
+			// std::this_thread::yield();
+			cpuPause();
 		}
 
-		// Wait on mutex CV
+
+		// yield() was called, so use cv to wait on phase to be changed by the last thread
 		std::unique_lock<std::mutex> lock(mutex);
 		cv.wait(lock, [&] {
-			return step != s;
+			return phase.load(std::memory_order_acquire) != currentPhase;
 		});
 	}
 };