|
|
@@ -84,88 +84,122 @@ struct WriteLock { |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
/** Barrier based on mutexes.

Not finished or tested, do not use.
*/
struct Barrier {
	// Number of threads that have entered the current phase. Guarded by `mutex`.
	int count = 0;
	// Phase counter, incremented when a phase completes. uint8_t wraparound is harmless
	// because waiters only compare it for inequality.
	uint8_t step = 0;
	// Number of threads that must call wait() before the barrier releases.
	int threads = 0;

	std::mutex mutex;
	std::condition_variable cv;

	/** Sets the number of participating threads.
	Must be called when no threads are calling wait().
	*/
	void setThreads(int threads) {
		this->threads = threads;
	}

	/** Blocks until `threads` threads have called wait() in this phase. */
	void wait() {
		std::unique_lock<std::mutex> lock(mutex);
		uint8_t s = step;
		if (++count >= threads) {
			// We're the last thread. Reset next phase.
			count = 0;
			// Allow other threads to exit wait()
			step++;
			cv.notify_all();
			return;
		}

		// Wait until the last thread advances the phase. The predicate form handles
		// spurious wakeups.
		cv.wait(lock, [&] {
			return step != s;
		});
	}
};
|
|
|
|
|
|
|
|
|
|
|
/** 2-phase barrier based on spin-locking.
*/
struct SpinBarrier {
	// Number of threads that have entered the current phase.
	std::atomic<int> count{0};
	// Phase counter; waiters spin until it changes. uint8_t wraparound is harmless
	// because waiters only compare it for inequality.
	std::atomic<uint8_t> step{0};
	// Number of threads that must call wait() before the barrier releases.
	int threads = 0;

	/** Must be called when no threads are calling wait().
	*/
	void setThreads(int threads) {
		this->threads = threads;
	}

	/** Blocks (busy-waiting) until `threads` threads have called wait() in this phase. */
	void wait() {
		uint8_t s = step;
		if (count.fetch_add(1, std::memory_order_acquire) + 1 >= threads) {
			// We're the last thread. Reset next phase.
			count = 0;
			// Allow other threads to exit wait()
			step++;
			return;
		}

		// Spin until the last thread begins waiting
		while (true) {
			if (step.load(std::memory_order_relaxed) != s)
				return;
			__builtin_ia32_pause();
		}
	}
};
|
|
|
|
|
|
|
|
|
|
|
/** Barrier that spin-locks until yield() is called, and then all threads switch to a mutex.

yield() should be called if it is likely that all threads will block for a while and continuing to spin-lock is unnecessary.
Saves CPU power after yield is called.
*/
struct HybridBarrier {
	// Number of threads that have entered the current phase.
	std::atomic<int> count{0};
	// Phase counter; waiters poll/wait until it changes. uint8_t wraparound is harmless
	// because waiters only compare it for inequality.
	std::atomic<uint8_t> step{0};
	// Number of threads that must call wait() before the barrier releases.
	int threads = 0;

	// Set by yield(); tells spinning waiters to fall back to the mutex/CV.
	std::atomic<bool> yielded{false};
	std::mutex mutex;
	std::condition_variable cv;

	/** Must be called when no threads are calling wait().
	*/
	void setThreads(int threads) {
		this->threads = threads;
	}

	/** Requests that waiting threads stop spinning and block on the mutex instead.
	May be called from any thread, including one not participating in wait().
	*/
	void yield() {
		yielded = true;
	}

	/** Blocks until `threads` threads have called wait() in this phase.
	Spins until yield() is called, then blocks on the condition variable.
	*/
	void wait() {
		uint8_t s = step;
		if (count.fetch_add(1, std::memory_order_acquire) + 1 >= threads) {
			// We're the last thread. Reset next phase.
			count = 0;
			bool wasYielded = yielded;
			yielded = false;
			// Allow other threads to exit wait()
			step++;
			if (wasYielded) {
				// Waiters moved to the CV; wake them all. Taking the lock ensures we
				// don't notify between a waiter's predicate check and its sleep.
				std::unique_lock<std::mutex> lock(mutex);
				cv.notify_all();
			}
			return;
		}

		// Spin until the last thread begins waiting
		while (!yielded.load(std::memory_order_relaxed)) {
			if (step.load(std::memory_order_relaxed) != s)
				return;
			__builtin_ia32_pause();
		}

		// Wait on mutex CV
		std::unique_lock<std::mutex> lock(mutex);
		cv.wait(lock, [&] {
			return step != s;
		});
	}
};
|
|
|
|
|
|
@@ -296,8 +330,8 @@ static void Engine_relaunchWorkers(Engine* that, int threadCount) { |
|
|
|
internal->threadCount = threadCount; |
|
|
|
|
|
|
|
// Set barrier counts |
|
|
|
internal->engineBarrier.total = threadCount; |
|
|
|
internal->workerBarrier.total = threadCount; |
|
|
|
internal->engineBarrier.setThreads(threadCount); |
|
|
|
internal->workerBarrier.setThreads(threadCount); |
|
|
|
|
|
|
|
if (threadCount > 0) { |
|
|
|
// Create and start engine workers |
|
|
@@ -641,7 +675,7 @@ float Engine::getSampleTime() { |
|
|
|
|
|
|
|
|
|
|
|
void Engine::yieldWorkers() {
	// Ask spinning worker threads to fall back to blocking on the barrier's mutex,
	// saving CPU while the engine is expected to be idle.
	internal->workerBarrier.yield();
}
|
|
|
|
|
|
|
|
|
|
|