diff --git a/.gitmodules b/.gitmodules
index e86a9221..dc7d1db1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -34,3 +34,6 @@
 [submodule "dep/rtmidi"]
 	path = dep/rtmidi
 	url = https://github.com/VCVRack/rtmidi.git
+[submodule "dep/sse2neon"]
+	path = dep/sse2neon
+	url = https://github.com/DLTcollab/sse2neon.git
diff --git a/arch.mk b/arch.mk
index 1d62c9f3..a8e41d98 100644
--- a/arch.mk
+++ b/arch.mk
@@ -3,6 +3,10 @@ MACHINE = $(shell $(CC) -dumpmachine)
 ifneq (,$(findstring x86_64-,$(MACHINE)))
 	ARCH_X64 := 1
 	ARCH_NAME := x64
+# 64-bit ARM: Apple toolchains report arm64-*, GNU/Linux toolchains report aarch64-*
+else ifneq (,$(findstring arm64-,$(MACHINE))$(findstring aarch64-,$(MACHINE)))
+	ARCH_ARM64 := 1
+	ARCH_NAME := arm64
 else
 	$(error Could not determine CPU architecture of $(MACHINE). Try hacking around in arch.mk)
 endif
diff --git a/compile.mk b/compile.mk
index 58f0d89f..b72fd683 100644
--- a/compile.mk
+++ b/compile.mk
@@ -14,7 +14,7 @@ FLAGS += -MMD -MP
 # Debugger symbols. These are removed with `strip`.
 FLAGS += -g
 # Optimization
-FLAGS += -O3 -march=nehalem -funsafe-math-optimizations -fno-omit-frame-pointer
+FLAGS += -O3 -funsafe-math-optimizations -fno-omit-frame-pointer
 # Warnings
 FLAGS += -Wall -Wextra -Wno-unused-parameter
 # C++ standard
@@ -23,6 +23,11 @@ CXXFLAGS += -std=c++11
 # Architecture-independent flags
 ifdef ARCH_X64
 	FLAGS += -DARCH_X64
+	FLAGS += -march=nehalem
+endif
+ifdef ARCH_ARM64
+	FLAGS += -DARCH_ARM64
+	FLAGS += -march=armv8-a+fp+simd
 endif
 
 ifdef ARCH_LIN
diff --git a/dep/Makefile b/dep/Makefile
index 2ca5a4a8..d0bcdbc3 100755
--- a/dep/Makefile
+++ b/dep/Makefile
@@ -55,6 +55,7 @@ osdialog = include/osdialog.h
 pffft = include/pffft.h
 fuzzysearchdatabase = include/FuzzySearchDatabase.hpp
 ghcfilesystem = include/ghc/filesystem.hpp
+sse2neon = include/sse2neon.h
 
 DEPS += $(glew)
 DEPS += $(glfw)
@@ -72,6 +73,7 @@ DEPS += $(osdialog)
 DEPS += $(pffft)
 DEPS += $(fuzzysearchdatabase)
 DEPS += $(ghcfilesystem)
+DEPS += $(sse2neon)
 
 DEP_LOCAL := .
 
@@ -255,6 +257,10 @@ $(ghcfilesystem): filesystem/include/ghc
 	mkdir -p include
 	cp -r $^ include/
 
+$(sse2neon): sse2neon/sse2neon.h
+	mkdir -p include
+	cp $^ include/
+
 # Helpers
 
 src: glew-2.1.0 glfw jansson-2.12 speexdsp-SpeexDSP-1.2rc3 libsamplerate-0.1.9 openssl-1.1.1k curl-7.79.1 zstd-1.4.5 libarchive-3.4.3 rtaudio nanovg nanosvg oui-blendish osdialog
diff --git a/dep/sse2neon b/dep/sse2neon
new file mode 160000
index 00000000..988782cb
--- /dev/null
+++ b/dep/sse2neon
@@ -0,0 +1 @@
+Subproject commit 988782cbadf95c2072b4b1b2b8fa0afa81b01c36
diff --git a/include/simd/Vector.hpp b/include/simd/Vector.hpp
index 9d255669..c8a9c6f7 100644
--- a/include/simd/Vector.hpp
+++ b/include/simd/Vector.hpp
@@ -1,6 +1,6 @@
 #pragma once
 #include 
-#include 
+#include "common.hpp"
 
 
 namespace rack {
diff --git a/include/simd/common.hpp b/include/simd/common.hpp
new file mode 100644
index 00000000..718414a9
--- /dev/null
+++ b/include/simd/common.hpp
@@ -0,0 +1,9 @@
+#pragma once
+
+#if defined ARCH_X64
+	// Intel intrinsics header (ARCH_X64 is passed by compile.mk as -DARCH_X64)
+	#include 
+#elif defined ARCH_ARM64
+	// Translation header for using SSE3 intrinsics on ARM64 NEON (ARCH_ARM64 is passed by compile.mk as -DARCH_ARM64)
+	#include 
+#endif
diff --git a/include/simd/sse_mathfun.h b/include/simd/sse_mathfun.h
index 8f703f91..b2dbb760 100644
--- a/include/simd/sse_mathfun.h
+++ b/include/simd/sse_mathfun.h
@@ -43,7 +43,7 @@ This derived source file is released under the zlib license.
(this is the zlib license) */ #pragma once -#include +#include "common.hpp" /** Generate 1.f without accessing memory */ diff --git a/src/common.cpp b/src/common.cpp index ef020738..d6f0ba2d 100644 --- a/src/common.cpp +++ b/src/common.cpp @@ -30,6 +30,9 @@ const std::string APP_VERSION = TOSTRING(_APP_VERSION); #if defined ARCH_X64 const std::string APP_ARCH = "x64"; #endif +#if defined ARCH_ARM64 + const std::string APP_ARCH = "arm64"; +#endif const std::string API_URL = "https://api.vcvrack.com"; diff --git a/src/engine/Engine.cpp b/src/engine/Engine.cpp index 022813a6..453a4a09 100644 --- a/src/engine/Engine.cpp +++ b/src/engine/Engine.cpp @@ -5,7 +5,9 @@ #include #include #include -#include +#if defined ARCH_X64 + #include +#endif #include #include @@ -21,6 +23,7 @@ namespace rack { namespace engine { +#if defined ARCH_X64 static void initMXCSR() { // Set CPU to flush-to-zero (FTZ) and denormals-are-zero (DAZ) mode // https://software.intel.com/en-us/node/682949 @@ -29,6 +32,7 @@ static void initMXCSR() { // Reset other flags _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); } +#endif /** Barrier based on mutexes. 
@@ -92,7 +96,12 @@ struct SpinBarrier {
 		while (true) {
 			if (step.load(std::memory_order_relaxed) != s)
 				return;
+#if defined ARCH_X64
 			__builtin_ia32_pause();
+#elif defined ARCH_ARM64
+			// AArch64 spin-wait hint, standing in for the x86 PAUSE above
+			__asm__ __volatile__("yield");
+#endif
 		}
 	}
 };
@@ -139,7 +148,12 @@ struct HybridBarrier {
 		while (!yielded.load(std::memory_order_relaxed)) {
 			if (step.load(std::memory_order_relaxed) != s)
 				return;
+#if defined ARCH_X64
 			__builtin_ia32_pause();
+#elif defined ARCH_ARM64
+			// AArch64 spin-wait hint, standing in for the x86 PAUSE above
+			__asm__ __volatile__("yield");
+#endif
 		}
 
 		// Wait on mutex CV
@@ -529,8 +543,10 @@ void Engine::stepBlock(int frames) {
 	std::lock_guard stepLock(internal->blockMutex);
 	SharedLock lock(internal->mutex);
 	// Configure thread
+#if defined ARCH_X64
 	uint32_t csr = _mm_getcsr();
 	initMXCSR();
+#endif
 	random::init();
 
 	internal->blockFrame = internal->frame;
@@ -573,8 +589,10 @@ void Engine::stepBlock(int frames) {
 		internal->meterMax = 0.0;
 	}
 
+#if defined ARCH_X64
 	// Reset MXCSR back to original value
 	_mm_setcsr(csr);
+#endif
 }
 
 
@@ -1299,7 +1317,9 @@ void EngineWorker::run() {
 	// Configure thread
 	contextSet(engine->internal->context);
 	system::setThreadName(string::f("Worker %d", id));
+#if defined ARCH_X64
 	initMXCSR();
+#endif
 	random::init();
 
 	while (true) {