From 970483b1cd7173f9030f6405e8498c41b5388057 Mon Sep 17 00:00:00 2001 From: reuk Date: Tue, 22 Mar 2022 16:03:28 +0000 Subject: [PATCH] SSE SIMDNativeOps: Reimplement sum for SSE3 to work around an AppleClang bug With clang 13.0.0, and Apple clang version 13.1.6 (clang-1316.0.21.2), the following code fails to compile with `-std=c++20 -O3 -msse3`: #include <pmmintrin.h> auto test (__m128 a) { return _mm_hadd_ps (_mm_hadd_ps (a, a), a); } --- modules/juce_dsp/native/juce_sse_SIMDNativeOps.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h b/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h index 7f64d433d7..846bd8eba5 100644 --- a/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h +++ b/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h @@ -106,11 +106,13 @@ struct SIMDNativeOps static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m128 a) noexcept { #if defined(__SSE4__) - __m128 retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff); + const auto retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff); #elif defined(__SSE3__) - __m128 retval = _mm_hadd_ps (_mm_hadd_ps (a, a), a); + const auto shuffled = _mm_movehdup_ps (a); + const auto sums = _mm_add_ps (a, shuffled); + const auto retval = _mm_add_ss (sums, _mm_movehl_ps (shuffled, sums)); #else - __m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a); + auto retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a); retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1)); #endif return _mm_cvtss_f32 (retval);