@@ -115,6 +115,20 @@ struct SIMDRegister
         __m128 for single-precision floating point on SSE architectures). */
     inline static SIMDRegister JUCE_VECTOR_CALLTYPE fromNative (vSIMDType a) noexcept { return {a}; }

+    /** Creates a new SIMDRegister from the first SIMDNumElements of a scalar array. */
+    inline static SIMDRegister JUCE_VECTOR_CALLTYPE fromRawArray (const ElementType* a) noexcept
+    {
+        jassert (isSIMDAligned (a));
+        return {CmplxOps::load (a)};
+    }
+
+    /** Copies the elements of the SIMDRegister to a scalar array in memory. */
+    inline void JUCE_VECTOR_CALLTYPE copyToRawArray (ElementType* a) const noexcept
+    {
+        jassert (isSIMDAligned (a));
+        CmplxOps::store (value, a);
+    }
+
     //==============================================================================
     /** Returns the idx-th element of the receiver. Note that this does not check if idx
         is larger than the native register size. */
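The new pair gives callers a bulk load/store path that asserts alignment instead of silently falling back to element-wise access. A minimal usage sketch (assumes JUCE's juce_dsp module; the function name and the 0.5f offset are hypothetical):

    #include <juce_dsp/juce_dsp.h>

    using juce::dsp::SIMDRegister;

    // 'buffer' must point at SIMDRegister<float>::SIMDNumElements floats and
    // satisfy isSIMDAligned(), otherwise the jasserts above will fire.
    void addOffset (float* buffer)
    {
        auto v = SIMDRegister<float>::fromRawArray (buffer);
        v = v + SIMDRegister<float>::expand (0.5f);
        v.copyToRawArray (buffer);
    }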
@@ -269,7 +283,7 @@ struct SIMDRegister
     //==============================================================================
     /** Checks if the given pointer is sufficiently aligned for using SIMD operations. */
-    static inline bool isSIMDAligned (ElementType* ptr) noexcept
+    static inline bool isSIMDAligned (const ElementType* ptr) noexcept
     {
         uintptr_t bitmask = SIMDRegisterSize - 1;
         return (reinterpret_cast<uintptr_t> (ptr) & bitmask) == 0;
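The test works because SIMDRegisterSize is a power of two, so an address is aligned exactly when its low bits are zero. A standalone illustration of the same bitmask trick (not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // An address is 16-byte aligned iff its low four bits are all zero.
    static bool isAligned16 (const void* p)
    {
        return (reinterpret_cast<std::uintptr_t> (p) & (16 - 1)) == 0;
    }

    int main()
    {
        alignas (16) float buf[8];
        std::printf ("%d %d\n", isAligned16 (buf), isAligned16 (buf + 1)); // prints "1 0"
    }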
@@ -285,6 +299,13 @@ struct SIMDRegister
         return snapPointerToAlignment (ptr, SIMDRegisterSize);
     }

+   #ifndef DOXYGEN
+    static inline const ElementType* getNextSIMDAlignedPtr (const ElementType* ptr) noexcept
+    {
+        return snapPointerToAlignment (ptr, SIMDRegisterSize);
+    }
+   #endif
+
 private:
     static inline vMaskType JUCE_VECTOR_CALLTYPE toMaskType (vSIMDType a) noexcept
     {
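getNextSIMDAlignedPtr supports the usual scalar-head / vector-body / scalar-tail decomposition for buffers with no alignment guarantee. A sketch of that pattern under the same assumptions as the earlier example (the function name is hypothetical):

    #include <cstddef>
    #include <juce_dsp/juce_dsp.h>

    void halveInPlace (float* data, size_t numSamples)
    {
        using Reg = juce::dsp::SIMDRegister<float>;
        auto* alignedStart = Reg::getNextSIMDAlignedPtr (data);

        // scalar head: samples before the first SIMD boundary
        for (; data < alignedStart && numSamples > 0; ++data, --numSamples)
            *data *= 0.5f;

        // vector body: one full register per iteration
        for (; numSamples >= Reg::SIMDNumElements;
               data += Reg::SIMDNumElements, numSamples -= Reg::SIMDNumElements)
            (Reg::fromRawArray (data) * Reg::expand (0.5f)).copyToRawArray (data);

        // scalar tail: whatever is left over
        for (; numSamples > 0; ++data, --numSamples)
            *data *= 0.5f;
    }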
@@ -333,6 +354,16 @@ struct CmplxSIMDOps
 {
     typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;

+    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const Scalar* a) noexcept
+    {
+        return SIMDNativeOps<Scalar>::load (a);
+    }
+
+    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, Scalar* dest) noexcept
+    {
+        SIMDNativeOps<Scalar>::store (value, dest);
+    }
+
     static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (Scalar s) noexcept
     {
         return SIMDNativeOps<Scalar>::expand (s);
@@ -360,6 +391,16 @@ struct CmplxSIMDOps<std::complex<Scalar>>
 {
     typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;

+    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const std::complex<Scalar>* a) noexcept
+    {
+        return SIMDNativeOps<Scalar>::load (reinterpret_cast<const Scalar*> (a));
+    }
+
+    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, std::complex<Scalar>* dest) noexcept
+    {
+        SIMDNativeOps<Scalar>::store (value, reinterpret_cast<Scalar*> (dest));
+    }
+
     static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (std::complex<Scalar> s) noexcept
     {
         const int n = sizeof (vSIMDType) / sizeof (Scalar);
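The complex specialisation simply reinterprets std::complex<Scalar> as interleaved scalars; that is legal because the standard guarantees std::complex is layout-compatible with an array of two scalars (array-oriented access, C++11 [complex.numbers]). A standalone check of the layout assumption the casts above rely on:

    #include <cassert>
    #include <complex>

    int main()
    {
        std::complex<float> c[2] = { { 1.0f, 2.0f }, { 3.0f, 4.0f } };

        // re0, im0, re1, im1 — the order load()/store() rely on
        auto* f = reinterpret_cast<const float*> (c);
        assert (f[0] == 1.0f && f[1] == 2.0f && f[2] == 3.0f && f[3] == 4.0f);
    }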
@@ -95,7 +95,7 @@ namespace SIMDRegister_test_internal
 class SIMDRegisterUnitTests : public UnitTest
 {
 public:
-    SIMDRegisterUnitTests() : UnitTest ("SIMDRegister UnitTests") {}
+    SIMDRegisterUnitTests() : UnitTest ("SIMDRegister UnitTests", "DSP") {}

     //==============================================================================
     // Some helper classes
@@ -113,7 +113,10 @@ public:
     template <typename type>
     static bool vecEqualToArray (const SIMDRegister<type>& vec, const type* array)
     {
-        const type* ptr = reinterpret_cast<const type*> (&vec);
+        HeapBlock<type> vecElementsStorage (SIMDRegister<type>::SIMDNumElements * 2);
+        auto* ptr = SIMDRegister<type>::getNextSIMDAlignedPtr (vecElementsStorage.getData());
+        vec.copyToRawArray (ptr);
+
         for (size_t i = 0; i < SIMDRegister<type>::SIMDNumElements; ++i)
         {
             double delta = SIMDRegister_test_internal::difference (ptr[i], array[i]);
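Allocating twice SIMDNumElements is what makes the snapped pointer safe: HeapBlock promises nothing about SIMD alignment, but snapping forward can skip at most SIMDRegisterSize - 1 bytes, so an aligned run of SIMDNumElements elements always fits inside a block of twice that size. The same arithmetic in isolation (hypothetical helper; alignment must be a power of two):

    #include <cstdint>

    // Returns the first 'alignment'-byte boundary at or after 'base'. With a
    // buffer of 2 * alignment bytes, this boundary plus one full register
    // never runs past the end of the buffer.
    const float* snapForward (const float* base, std::uintptr_t alignment)
    {
        auto addr    = reinterpret_cast<std::uintptr_t> (base);
        auto snapped = (addr + alignment - 1) & ~(alignment - 1);
        return reinterpret_cast<const float*> (snapped);
    }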
@@ -130,8 +133,15 @@ public:
     template <typename type>
     static void copy (SIMDRegister<type>& vec, const type* ptr)
     {
-        for (size_t i = 0; i < SIMDRegister<type>::SIMDNumElements; ++i)
-            vec[i] = ptr[i];
+        if (SIMDRegister<type>::isSIMDAligned (ptr))
+        {
+            vec = SIMDRegister<type>::fromRawArray (ptr);
+        }
+        else
+        {
+            for (size_t i = 0; i < SIMDRegister<type>::SIMDNumElements; ++i)
+                vec[i] = ptr[i];
+        }
     }

     //==============================================================================
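With this change the test helper exercises the new aligned fast path whenever it can, keeping the element-wise loop as the unaligned fallback. A usage sketch (treating copy() as a free function for illustration):

    alignas (32) float data[9] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
    juce::dsp::SIMDRegister<float> a, b;

    copy (a, data);       // SIMD-aligned source -> single fromRawArray() load
    copy (b, data + 1);   // unaligned source    -> element-by-element loop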
@@ -66,6 +66,8 @@ struct SIMDNativeOps<float>
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); }
+    static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); }
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); }
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); }
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); }
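_mm256_load_ps and _mm256_store_ps are the aligned AVX forms; handing them a pointer that is not 32-byte aligned faults at runtime, which is exactly what the jasserts in fromRawArray/copyToRawArray guard against. A minimal standalone illustration (requires an AVX-enabled build):

    #include <immintrin.h>

    // 'p' must be 32-byte aligned, e.g. declared alignas (32).
    void scaleEight (float* p)
    {
        __m256 v = _mm256_load_ps (p);
        v = _mm256_mul_ps (v, _mm256_set1_ps (0.5f));
        _mm256_store_ps (p, v);
    }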
@@ -120,28 +122,30 @@ struct SIMDNativeOps<double>
     DECLARE_AVX_SIMD_CONST (double, kOne);

     //==============================================================================
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
-    static forcedinline __m256d expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
-    static forcedinline __m256d bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }

     //==============================================================================
     static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
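The cmplxmul body falls outside this hunk; what follows is a hedged sketch of the standard shuffle-based complex multiply that the helpers above (dupeven, dupodd, swapevenodd) are built for, using (a + bi)(c + di) = (ac - bd) + (ad + bc)i over interleaved (re, im) pairs:

    // Sketch only — not necessarily the exact body of cmplxmul.
    static __m256d cmplxmulSketch (__m256d a, __m256d b) noexcept
    {
        __m256d t1 = _mm256_mul_pd (a, dupeven (b));               // (a*c, b*c)
        __m256d t2 = _mm256_mul_pd (swapevenodd (a), dupodd (b));  // (b*d, a*d)
        return _mm256_addsub_pd (t1, t2);                          // (ac - bd, bc + ad)
    }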
@@ -170,24 +174,38 @@ struct SIMDNativeOps<int8_t>
     //==============================================================================
     DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);

-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const char*> (a);
+        return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
+                                b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
+                                b[15], b[14], b[13], b[12], b[11], b[10], b[9],  b[8],
+                                b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1],  b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept
+    {
+        SIMDFallbackOps<int8_t, __m256i>::store (value, dest);
+    }
+
+    //==============================================================================
     static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
@@ -247,6 +265,20 @@ struct SIMDNativeOps<uint8_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const char*> (a);
+        return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
+                                b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
+                                b[15], b[14], b[13], b[12], b[11], b[10], b[9],  b[8],
+                                b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1],  b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint8_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
@@ -306,6 +338,17 @@ struct SIMDNativeOps<int16_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
+    {
+        return _mm256_set_epi16 (a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8],
+                                 a[7],  a[6],  a[5],  a[4],  a[3],  a[2],  a[1], a[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept
+    {
+        SIMDFallbackOps<int16_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i tmp = _mm256_hadd_epi16 (a, a);
@@ -349,6 +392,18 @@ struct SIMDNativeOps<uint16_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int16_t*> (a);
+        return _mm256_set_epi16 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
+                                 b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint16_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i tmp = _mm256_hadd_epi16 (a, a);
@@ -390,6 +445,16 @@ struct SIMDNativeOps<int32_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept
+    {
+        return _mm256_set_epi32 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept
+    {
+        SIMDFallbackOps<int32_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i tmp = _mm256_hadd_epi32 (a, a);
@@ -432,6 +497,17 @@ struct SIMDNativeOps<uint32_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int32_t*> (a);
+        return _mm256_set_epi32 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint32_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i tmp = _mm256_hadd_epi32 (a, a);
@@ -469,6 +545,16 @@ struct SIMDNativeOps<int64_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept
+    {
+        return _mm256_set_epi64x (a[3], a[2], a[1], a[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept
+    {
+        SIMDFallbackOps<int64_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
     {
        #ifdef _MSC_VER
@@ -530,6 +616,17 @@ struct SIMDNativeOps<uint64_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int64_t*> (a);
+        return _mm256_set_epi64x (b[3], b[2], b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint64_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
     {
        #ifdef _MSC_VER
@@ -200,6 +200,25 @@ struct SIMDFallbackOps
         return retval;
     }

+    static forcedinline vSIMDType load (const ScalarType* a) noexcept
+    {
+        vSIMDType retval;
+        auto* dst = reinterpret_cast<ScalarType*> (&retval);
+
+        for (size_t i = 0; i < n; ++i)
+            dst[i] = a[i];
+
+        return retval;
+    }
+
+    static forcedinline void store (vSIMDType value, ScalarType* dest) noexcept
+    {
+        const auto* src = reinterpret_cast<const ScalarType*> (&value);
+
+        for (size_t i = 0; i < n; ++i)
+            dest[i] = src[i];
+    }
+
     template <unsigned int shuffle_idx>
     static forcedinline vSIMDType shuffle (vSIMDType a) noexcept
     {
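Because the fallback load/store copy element by element through the vector's in-memory representation, they impose no alignment requirement, which is why the x86 integer store() overloads in this patch can safely delegate to SIMDFallbackOps for arbitrary destinations. A standalone round-trip illustration with a hypothetical 4-float vector type:

    #include <cstddef>

    struct Vec4 { float v[4]; };   // stand-in for an opaque vSIMDType

    void roundTrip (const float* src, float* dst)
    {
        Vec4 r;
        auto* scalars = reinterpret_cast<float*> (&r);

        for (size_t i = 0; i < 4; ++i) scalars[i] = src[i];   // what load() does
        for (size_t i = 0; i < 4; ++i) dst[i] = scalars[i];   // what store() does
    }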
@@ -66,15 +66,17 @@ struct SIMDNativeOps<float>
     DECLARE_NEON_SIMD_CONST (float, kOne);

     //==============================================================================
-    static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
+    static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); }
+    static forcedinline vSIMDType load (const float* a) noexcept { return vld1q_f32 (a); }
+    static forcedinline void store (vSIMDType value, float* a) noexcept { vst1q_f32 (a, value); }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
     static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_f32 (a, b); }
     static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_f32 (a, b); }
     static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_f32 (a, b); }
@@ -109,6 +111,8 @@ struct SIMDNativeOps<double>
     typedef SIMDFallbackOps<double, vSIMDType> fb;

     static forcedinline vSIMDType expand (double s) noexcept { return fb::expand (s); }
+    static forcedinline vSIMDType load (const double* a) noexcept { return fb::load (a); }
+    static forcedinline void store (vSIMDType value, double* a) noexcept { fb::store (value, a); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return fb::add (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return fb::sub (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
@@ -143,6 +147,8 @@ struct SIMDNativeOps<int8_t>
     //==============================================================================
     static forcedinline vSIMDType expand (int8_t s) noexcept { return vdupq_n_s8 (s); }
+    static forcedinline vSIMDType load (const int8_t* a) noexcept { return vld1q_s8 (a); }
+    static forcedinline void store (vSIMDType value, int8_t* a) noexcept { vst1q_s8 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s8 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s8 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s8 (a, b); }
@@ -174,7 +180,9 @@ struct SIMDNativeOps<uint8_t>
     DECLARE_NEON_SIMD_CONST (uint8_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); }
+    static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); }
+    static forcedinline vSIMDType load (const uint8_t* a) noexcept { return vld1q_u8 (a); }
+    static forcedinline void store (vSIMDType value, uint8_t* a) noexcept { vst1q_u8 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u8 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u8 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u8 (a, b); }
@@ -207,6 +215,8 @@ struct SIMDNativeOps<int16_t>
     //==============================================================================
     static forcedinline vSIMDType expand (int16_t s) noexcept { return vdupq_n_s16 (s); }
+    static forcedinline vSIMDType load (const int16_t* a) noexcept { return vld1q_s16 (a); }
+    static forcedinline void store (vSIMDType value, int16_t* a) noexcept { vst1q_s16 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s16 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s16 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s16 (a, b); }
@@ -239,7 +249,9 @@ struct SIMDNativeOps<uint16_t>
     DECLARE_NEON_SIMD_CONST (uint16_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); }
+    static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); }
+    static forcedinline vSIMDType load (const uint16_t* a) noexcept { return vld1q_u16 (a); }
+    static forcedinline void store (vSIMDType value, uint16_t* a) noexcept { vst1q_u16 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u16 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u16 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u16 (a, b); }
@@ -272,6 +284,8 @@ struct SIMDNativeOps<int32_t>
     //==============================================================================
     static forcedinline vSIMDType expand (int32_t s) noexcept { return vdupq_n_s32 (s); }
+    static forcedinline vSIMDType load (const int32_t* a) noexcept { return vld1q_s32 (a); }
+    static forcedinline void store (vSIMDType value, int32_t* a) noexcept { vst1q_s32 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s32 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s32 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s32 (a, b); }
@@ -304,7 +318,9 @@ struct SIMDNativeOps<uint32_t>
     DECLARE_NEON_SIMD_CONST (uint32_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); }
+    static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); }
+    static forcedinline vSIMDType load (const uint32_t* a) noexcept { return vld1q_u32 (a); }
+    static forcedinline void store (vSIMDType value, uint32_t* a) noexcept { vst1q_u32 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u32 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u32 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u32 (a, b); }
@@ -337,6 +353,8 @@ struct SIMDNativeOps<int64_t>
     //==============================================================================
     static forcedinline vSIMDType expand (int64_t s) noexcept { return vdupq_n_s64 (s); }
+    static forcedinline vSIMDType load (const int64_t* a) noexcept { return vld1q_s64 (a); }
+    static forcedinline void store (vSIMDType value, int64_t* a) noexcept { vst1q_s64 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s64 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s64 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
@@ -370,6 +388,8 @@ struct SIMDNativeOps<uint64_t>
     //==============================================================================
     static forcedinline vSIMDType expand (uint64_t s) noexcept { return vdupq_n_u64 (s); }
+    static forcedinline vSIMDType load (const uint64_t* a) noexcept { return vld1q_u64 (a); }
+    static forcedinline void store (vSIMDType value, uint64_t* a) noexcept { vst1q_u64 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u64 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u64 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
@@ -65,6 +65,8 @@ struct SIMDNativeOps<float>
     //==============================================================================
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
+    static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); }
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
@@ -124,6 +126,8 @@ struct SIMDNativeOps<double>
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
+    static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
@@ -199,6 +203,18 @@ struct SIMDNativeOps<int8_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const char*> (a);
+        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
+                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int8_t* dest) noexcept
+    {
+        SIMDFallbackOps<int8_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -268,6 +284,18 @@ struct SIMDNativeOps<uint8_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const char*> (a);
+        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
+                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint8_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint8_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -337,6 +365,16 @@ struct SIMDNativeOps<int16_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
+    {
+        return _mm_set_epi16 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int16_t* dest) noexcept
+    {
+        SIMDFallbackOps<int16_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -395,6 +433,17 @@ struct SIMDNativeOps<uint16_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int16_t*> (a);
+        return _mm_set_epi16 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint16_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint16_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -428,6 +477,7 @@ struct SIMDNativeOps<int32_t>
     //==============================================================================
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_set_epi32 (a[3], a[2], a[1], a[0]); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
@@ -442,6 +492,11 @@ struct SIMDNativeOps<int32_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept
+    {
+        SIMDFallbackOps<int32_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -522,6 +577,17 @@ struct SIMDNativeOps<uint32_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int32_t*> (a);
+        return _mm_set_epi32 (b[3], b[2], b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint32_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint32_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -591,6 +657,7 @@ struct SIMDNativeOps<int64_t>
         return retval;
     }

+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_set_epi64x (a[1], a[0]); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
@@ -606,6 +673,11 @@ struct SIMDNativeOps<int64_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept
+    {
+        SIMDFallbackOps<int64_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
         const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
@@ -692,6 +764,17 @@ struct SIMDNativeOps<uint64_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int64_t*> (a);
+        return _mm_set_epi64x (b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint64_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint64_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
         const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);