diff --git a/modules/juce_dsp/containers/juce_SIMDRegister.h b/modules/juce_dsp/containers/juce_SIMDRegister.h index 65165fe233..8fcba5f9a1 100644 --- a/modules/juce_dsp/containers/juce_SIMDRegister.h +++ b/modules/juce_dsp/containers/juce_SIMDRegister.h @@ -115,6 +115,20 @@ struct SIMDRegister __mm128 for single-precision floating point on SSE architectures). */ inline static SIMDRegister JUCE_VECTOR_CALLTYPE fromNative (vSIMDType a) noexcept { return {a}; } + /** Creates a new SIMDRegister from the first SIMDNumElements of a scalar array. */ + inline static SIMDRegister JUCE_VECTOR_CALLTYPE fromRawArray (const ElementType* a) noexcept + { + jassert (isSIMDAligned (a)); + return {CmplxOps::load (a)}; + } + + /** Copies the elements of the SIMDRegister to a scalar array in memory. */ + inline void JUCE_VECTOR_CALLTYPE copyToRawArray (ElementType* a) const noexcept + { + jassert (isSIMDAligned (a)); + CmplxOps::store (value, a); + } + //============================================================================== /** Returns the idx-th element of the receiver. Note that this does not check if idx is larger than the native register size. */ @@ -269,7 +283,7 @@ struct SIMDRegister //============================================================================== /** Checks if the given pointer is suffeciently aligned for using SIMD operations. */ - static inline bool isSIMDAligned (ElementType* ptr) noexcept + static inline bool isSIMDAligned (const ElementType* ptr) noexcept { uintptr_t bitmask = SIMDRegisterSize - 1; return (reinterpret_cast (ptr) & bitmask) == 0; @@ -285,6 +299,13 @@ struct SIMDRegister return snapPointerToAlignment (ptr, SIMDRegisterSize); } + #ifndef DOXYGEN + static inline const ElementType* getNextSIMDAlignedPtr (const ElementType* ptr) noexcept + { + return snapPointerToAlignment (ptr, SIMDRegisterSize); + } + #endif + private: static inline vMaskType JUCE_VECTOR_CALLTYPE toMaskType (vSIMDType a) noexcept { @@ -333,6 +354,16 @@ struct CmplxSIMDOps { typedef typename SIMDNativeOps::vSIMDType vSIMDType; + static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const Scalar* a) noexcept + { + return SIMDNativeOps::load (a); + } + + static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, Scalar* dest) noexcept + { + SIMDNativeOps::store (value, dest); + } + static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (Scalar s) noexcept { return SIMDNativeOps::expand (s); @@ -360,6 +391,16 @@ struct CmplxSIMDOps> { typedef typename SIMDNativeOps::vSIMDType vSIMDType; + static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const std::complex* a) noexcept + { + return SIMDNativeOps::load (reinterpret_cast (a)); + } + + static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, std::complex* dest) noexcept + { + SIMDNativeOps::store (value, reinterpret_cast (dest)); + } + static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (std::complex s) noexcept { const int n = sizeof (vSIMDType) / sizeof (Scalar); diff --git a/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp b/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp index 1e91de6a5d..c23ca22c82 100644 --- a/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp +++ b/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp @@ -95,7 +95,7 @@ namespace SIMDRegister_test_internal class SIMDRegisterUnitTests : public UnitTest { public: - SIMDRegisterUnitTests() : UnitTest ("SIMDRegister UnitTests") {} + SIMDRegisterUnitTests() : UnitTest ("SIMDRegister UnitTests", "DSP") {} //============================================================================== // Some helper classes @@ -113,7 +113,10 @@ public: template static bool vecEqualToArray (const SIMDRegister& vec, const type* array) { - const type* ptr = reinterpret_cast (&vec); + HeapBlock vecElementsStorage (SIMDRegister::SIMDNumElements * 2); + auto* ptr = SIMDRegister::getNextSIMDAlignedPtr (vecElementsStorage.getData()); + vec.copyToRawArray (ptr); + for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) { double delta = SIMDRegister_test_internal::difference (ptr[i], array[i]); @@ -130,8 +133,15 @@ public: template static void copy (SIMDRegister& vec, const type* ptr) { - for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) - vec[i] = ptr[i]; + if (SIMDRegister::isSIMDAligned (ptr)) + { + vec = SIMDRegister::fromRawArray (ptr); + } + else + { + for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) + vec[i] = ptr[i]; + } } //============================================================================== diff --git a/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h b/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h index c5894209f7..935c83866b 100644 --- a/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h +++ b/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h @@ -66,6 +66,8 @@ struct SIMDNativeOps static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); } + static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); } @@ -120,28 +122,30 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (double, kOne); //============================================================================== - static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256d expand (double s) noexcept { return _mm256_broadcast_sd (&s); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); } - static forcedinline __m256d bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast (a); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast (a); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); } //============================================================================== static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept @@ -170,24 +174,38 @@ struct SIMDNativeOps //============================================================================== DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet); - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast (a); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } + + //============================================================================== + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept + { + const auto* b = reinterpret_cast (a); + return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24], + b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16], + b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8], + b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } - //============================================================================== static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256()); @@ -247,6 +265,20 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept + { + const auto* b = reinterpret_cast (a); + return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24], + b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16], + b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8], + b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256()); @@ -306,6 +338,17 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept + { + return _mm256_set_epi16 (a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8], + a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i tmp = _mm256_hadd_epi16 (a, a); @@ -349,6 +392,18 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept + { + const auto* b = reinterpret_cast (a); + return _mm256_set_epi16 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8], + b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i tmp = _mm256_hadd_epi16 (a, a); @@ -390,6 +445,16 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept + { + return _mm256_set_epi32 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i tmp = _mm256_hadd_epi32 (a, a); @@ -432,6 +497,17 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept + { + const auto* b = reinterpret_cast (a); + return _mm256_set_epi32 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i tmp = _mm256_hadd_epi32 (a, a); @@ -469,6 +545,16 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept + { + return _mm256_set_epi64x (a[3], a[2], a[1], a[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { #ifdef _MSC_VER @@ -530,6 +616,17 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept + { + const auto* b = reinterpret_cast (a); + return _mm256_set_epi64x (b[3], b[2], b[1], b[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { #ifdef _MSC_VER diff --git a/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h b/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h index 6456f4fc2c..d88ee12fcb 100644 --- a/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h +++ b/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h @@ -200,6 +200,25 @@ struct SIMDFallbackOps return retval; } + static forcedinline vSIMDType load (const ScalarType* a) noexcept + { + vSIMDType retval; + auto* dst = reinterpret_cast (&retval); + + for (size_t i = 0; i < n; ++i) + dst [i] = a[i]; + + return retval; + } + + static forcedinline void store (vSIMDType value, ScalarType* dest) noexcept + { + const auto* src = reinterpret_cast (&value); + + for (size_t i = 0; i < n; ++i) + dest[i] = src[i]; + } + template static forcedinline vSIMDType shuffle (vSIMDType a) noexcept { diff --git a/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h b/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h index 55a23e107b..0f67564476 100644 --- a/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h +++ b/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h @@ -66,15 +66,17 @@ struct SIMDNativeOps DECLARE_NEON_SIMD_CONST (float, kOne); //============================================================================== - static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); } - static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); } - static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); } - static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); } - static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); } - static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); } - static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); } - static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); } - static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); } + static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); } + static forcedinline vSIMDType load (const float* a) noexcept { return vld1q_f32 (a); } + static forcedinline void store (vSIMDType value, float* a) noexcept { vst1q_f32 (a, value); } + static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); } + static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); } + static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); } + static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); } + static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); } + static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); } + static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); } + static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); } static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_f32 (a, b); } static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_f32 (a, b); } static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_f32 (a, b); } @@ -109,6 +111,8 @@ struct SIMDNativeOps typedef SIMDFallbackOps fb; static forcedinline vSIMDType expand (double s) noexcept { return fb::expand (s); } + static forcedinline vSIMDType load (const double* a) noexcept { return fb::load (a); } + static forcedinline void store (vSIMDType value, double* a) noexcept { fb::store (value, a); } static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return fb::add (a, b); } static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return fb::sub (a, b); } static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); } @@ -143,6 +147,8 @@ struct SIMDNativeOps //============================================================================== static forcedinline vSIMDType expand (int8_t s) noexcept { return vdupq_n_s8 (s); } + static forcedinline vSIMDType load (const int8_t* a) noexcept { return vld1q_s8 (a); } + static forcedinline void store (vSIMDType value, int8_t* a) noexcept { vst1q_s8 (a, value); } static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s8 (a, b); } static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s8 (a, b); } static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s8 (a, b); } @@ -174,7 +180,9 @@ struct SIMDNativeOps DECLARE_NEON_SIMD_CONST (uint8_t, kAllBitsSet); //============================================================================== - static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); } + static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); } + static forcedinline vSIMDType load (const uint8_t* a) noexcept { return vld1q_u8 (a); } + static forcedinline void store (vSIMDType value, uint8_t* a) noexcept { vst1q_u8 (a, value); } static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u8 (a, b); } static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u8 (a, b); } static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u8 (a, b); } @@ -207,6 +215,8 @@ struct SIMDNativeOps //============================================================================== static forcedinline vSIMDType expand (int16_t s) noexcept { return vdupq_n_s16 (s); } + static forcedinline vSIMDType load (const int16_t* a) noexcept { return vld1q_s16 (a); } + static forcedinline void store (vSIMDType value, int16_t* a) noexcept { vst1q_s16 (a, value); } static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s16 (a, b); } static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s16 (a, b); } static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s16 (a, b); } @@ -239,7 +249,9 @@ struct SIMDNativeOps DECLARE_NEON_SIMD_CONST (uint16_t, kAllBitsSet); //============================================================================== - static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); } + static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); } + static forcedinline vSIMDType load (const uint16_t* a) noexcept { return vld1q_u16 (a); } + static forcedinline void store (vSIMDType value, uint16_t* a) noexcept { vst1q_u16 (a, value); } static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u16 (a, b); } static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u16 (a, b); } static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u16 (a, b); } @@ -272,6 +284,8 @@ struct SIMDNativeOps //============================================================================== static forcedinline vSIMDType expand (int32_t s) noexcept { return vdupq_n_s32 (s); } + static forcedinline vSIMDType load (const int32_t* a) noexcept { return vld1q_s32 (a); } + static forcedinline void store (vSIMDType value, int32_t* a) noexcept { vst1q_s32 (a, value); } static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s32 (a, b); } static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s32 (a, b); } static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s32 (a, b); } @@ -304,7 +318,9 @@ struct SIMDNativeOps DECLARE_NEON_SIMD_CONST (uint32_t, kAllBitsSet); //============================================================================== - static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); } + static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); } + static forcedinline vSIMDType load (const uint32_t* a) noexcept { return vld1q_u32 (a); } + static forcedinline void store (vSIMDType value, uint32_t* a) noexcept { vst1q_u32 (a, value); } static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u32 (a, b); } static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u32 (a, b); } static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u32 (a, b); } @@ -337,6 +353,8 @@ struct SIMDNativeOps //============================================================================== static forcedinline vSIMDType expand (int64_t s) noexcept { return vdupq_n_s64 (s); } + static forcedinline vSIMDType load (const int64_t* a) noexcept { return vld1q_s64 (a); } + static forcedinline void store (vSIMDType value, int64_t* a) noexcept { vst1q_s64 (a, value); } static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s64 (a, b); } static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s64 (a, b); } static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); } @@ -370,6 +388,8 @@ struct SIMDNativeOps //============================================================================== static forcedinline vSIMDType expand (uint64_t s) noexcept { return vdupq_n_u64 (s); } + static forcedinline vSIMDType load (const uint64_t* a) noexcept { return vld1q_u64 (a); } + static forcedinline void store (vSIMDType value, uint64_t* a) noexcept { vst1q_u64 (a, value); } static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u64 (a, b); } static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u64 (a, b); } static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); } diff --git a/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h b/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h index 3b2c741fbd..ce3bd819f6 100644 --- a/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h +++ b/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h @@ -65,6 +65,8 @@ struct SIMDNativeOps //============================================================================== static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); } + static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); } static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); } static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); } static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); } @@ -124,6 +126,8 @@ struct SIMDNativeOps static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); } + static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); } static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); } static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); } static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); } @@ -199,6 +203,18 @@ struct SIMDNativeOps static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept + { + const auto* b = reinterpret_cast (a); + return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8], + b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int8_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { #ifdef __SSSE3__ @@ -268,6 +284,18 @@ struct SIMDNativeOps static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept + { + const auto* b = reinterpret_cast (a); + return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8], + b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint8_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { #ifdef __SSSE3__ @@ -337,6 +365,16 @@ struct SIMDNativeOps static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept + { + return _mm_set_epi16 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int16_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { #ifdef __SSSE3__ @@ -395,6 +433,17 @@ struct SIMDNativeOps static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept + { + const auto* b = reinterpret_cast (a); + return _mm_set_epi16 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint16_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { #ifdef __SSSE3__ @@ -428,6 +477,7 @@ struct SIMDNativeOps //============================================================================== static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); } + static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_set_epi32 (a[3], a[2], a[1], a[0]); } static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); } static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); } static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); } @@ -442,6 +492,11 @@ struct SIMDNativeOps static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { #ifdef __SSSE3__ @@ -522,6 +577,17 @@ struct SIMDNativeOps static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept + { + const auto* b = reinterpret_cast (a); + return _mm_set_epi32 (b[3], b[2], b[1], b[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint32_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { #ifdef __SSSE3__ @@ -591,6 +657,7 @@ struct SIMDNativeOps return retval; } + static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_set_epi64x (a[1], a[0]); } static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); } static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); } @@ -606,6 +673,11 @@ struct SIMDNativeOps static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { const int64_t* ptr = reinterpret_cast (&a); @@ -692,6 +764,17 @@ struct SIMDNativeOps static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); } //============================================================================== + static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept + { + const auto* b = reinterpret_cast (a); + return _mm_set_epi64x (b[1], b[0]); + } + + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint64_t* dest) noexcept + { + SIMDFallbackOps::store (value, dest); + } + static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { const uint64_t* ptr = reinterpret_cast (&a);