diff --git a/modules/juce_dsp/containers/juce_SIMDRegister.h b/modules/juce_dsp/containers/juce_SIMDRegister.h
index ad9730894c..2e2b13a77d 100644
--- a/modules/juce_dsp/containers/juce_SIMDRegister.h
+++ b/modules/juce_dsp/containers/juce_SIMDRegister.h
@@ -95,6 +95,10 @@ struct SIMDRegister
         and scalar types (used internally). */
     typedef CmplxSIMDOps<ElementType> CmplxOps;
 
+    /** Type which is returned when using the subscript operator. The returned type
+        should be used just like the type ElementType. */
+    struct ElementAccess;
+
     //==============================================================================
     /** The size in bytes of this register. */
     static constexpr size_t SIMDRegisterSize = sizeof (vSIMDType);
@@ -146,18 +150,34 @@ struct SIMDRegister
     //==============================================================================
     /** Returns the idx-th element of the receiver. Note that this does not check if idx
        is larger than the native register size. */
-    inline ElementType JUCE_VECTOR_CALLTYPE operator[] (size_t idx) const noexcept
+    inline ElementType JUCE_VECTOR_CALLTYPE get (size_t idx) const noexcept
+    {
+        jassert (idx < SIMDNumElements);
+        return CmplxOps::get (value, idx);
+    }
+
+    /** Sets the idx-th element of the receiver. Note that this does not check if idx
+       is larger than the native register size. */
+    inline void JUCE_VECTOR_CALLTYPE set (size_t idx, ElementType v) noexcept
     {
         jassert (idx < SIMDNumElements);
-        return reinterpret_cast<const ElementType*> (&value) [idx];
+        value = CmplxOps::set (value, idx, v);
     }
 
+    //==============================================================================
     /** Returns the idx-th element of the receiver. Note that this does not check if idx
        is larger than the native register size. */
-    inline ElementType& JUCE_VECTOR_CALLTYPE operator[] (size_t idx) noexcept
+    inline ElementType JUCE_VECTOR_CALLTYPE operator[] (size_t idx) const noexcept
+    {
+        return get (idx);
+    }
+
+    /** Returns the idx-th element of the receiver. Note that this does not check if idx
+       is larger than the native register size. */
+    inline ElementAccess JUCE_VECTOR_CALLTYPE operator[] (size_t idx) noexcept
     {
         jassert (idx < SIMDNumElements);
-        return reinterpret_cast<ElementType*> (&value) [idx];
+        return ElementAccess (*this, idx);
     }
 
     //==============================================================================
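For orientation, a minimal usage sketch of the accessors this hunk introduces (illustrative only, not part of the patch): reads resolve through get(), and writes go through the ElementAccess proxy returned by the non-const operator[], so element access no longer type-puns through reinterpret_cast.

    // Sketch, assuming a float register (expand() and jassert are used elsewhere in this diff):
    auto r = juce::dsp::SIMDRegister<float>::expand (1.0f);
    r[0] = 42.0f;        // non-const operator[] -> ElementAccess::operator= -> r.set (0, 42.0f)
    float x = r[0];      // ElementAccess::operator Type() -> r.get (0)
    jassert (x == 42.0f);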
@@ -371,114 +391,9 @@ private:
     }
 };
 
-#ifndef DOXYGEN
-//==============================================================================
-/* This class is used internally by SIMDRegister to abstract away differences
-   in operations which are different for complex and pure floating point types. */
-
-// the pure floating-point version
-template <typename Scalar>
-struct CmplxSIMDOps
-{
-    typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const Scalar* a) noexcept
-    {
-        return SIMDNativeOps<Scalar>::load (a);
-    }
-
-    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, Scalar* dest) noexcept
-    {
-        SIMDNativeOps<Scalar>::store (value, dest);
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (Scalar s) noexcept
-    {
-        return SIMDNativeOps<Scalar>::expand (s);
-    }
-
-    static inline Scalar JUCE_VECTOR_CALLTYPE sum (vSIMDType a) noexcept
-    {
-        return SIMDNativeOps<Scalar>::sum (a);
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE mul (vSIMDType a, vSIMDType b) noexcept
-    {
-        return SIMDNativeOps<Scalar>::mul (a, b);
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE muladd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept
-    {
-        return SIMDNativeOps<Scalar>::multiplyAdd (a, b, c);
-    }
-};
-
-// The pure complex version
-template <typename Scalar>
-struct CmplxSIMDOps<std::complex<Scalar>>
-{
-    typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const std::complex<Scalar>* a) noexcept
-    {
-        return SIMDNativeOps<Scalar>::load (reinterpret_cast<const Scalar*> (a));
-    }
-
-    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, std::complex<Scalar>* dest) noexcept
-    {
-        SIMDNativeOps<Scalar>::store (value, reinterpret_cast<Scalar*> (dest));
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (std::complex<Scalar> s) noexcept
-    {
-        const int n = sizeof (vSIMDType) / sizeof (Scalar);
-
-        union
-        {
-            vSIMDType v;
-            Scalar floats[n];
-        } u;
-
-        for (int i = 0; i < n; ++i)
-            u.floats[i] = (i & 1) == 0 ? s.real() : s.imag();
-
-        return u.v;
-    }
-
-    static inline std::complex<Scalar> JUCE_VECTOR_CALLTYPE sum (vSIMDType a) noexcept
-    {
-        vSIMDType result = SIMDNativeOps<Scalar>::oddevensum (a);
-        auto* ptr = reinterpret_cast<const Scalar*> (&result);
-        return std::complex<Scalar> (ptr[0], ptr[1]);
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE mul (vSIMDType a, vSIMDType b) noexcept
-    {
-        return SIMDNativeOps<Scalar>::cmplxmul (a, b);
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE muladd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept
-    {
-        return SIMDNativeOps<Scalar>::add (a, SIMDNativeOps<Scalar>::cmplxmul (b, c));
-    }
-};
-#endif
+} // namespace dsp
+} // namespace juce
 
-//==============================================================================
 #ifndef DOXYGEN
-    namespace util
-    {
-        template <typename Type>
-        inline void snapToZero (SIMDRegister<Type>&) noexcept {}
-    }
+ #include "juce_SIMDRegister_Impl.h"
 #endif
-
-} // namespace dsp
-
-// Extend some common used global functions to SIMDRegister types
-template <typename Type>
-inline dsp::SIMDRegister<Type> JUCE_VECTOR_CALLTYPE jmin (dsp::SIMDRegister<Type> a, dsp::SIMDRegister<Type> b) { return dsp::SIMDRegister<Type>::min (a, b); }
-template <typename Type>
-inline dsp::SIMDRegister<Type> JUCE_VECTOR_CALLTYPE jmax (dsp::SIMDRegister<Type> a, dsp::SIMDRegister<Type> b) { return dsp::SIMDRegister<Type>::max (a, b); }
-
-} // namespace juce
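The include added above pulls in the implementation header introduced by the next file. The operator[] machinery it implements is the classic element-proxy idiom; here is a self-contained sketch of that idiom, with illustrative names that are not JUCE API:

    // Generic element-proxy pattern (sketch only): reads convert, writes forward to set().
    template <typename Container, typename T>
    class ElementProxy
    {
    public:
        ElementProxy (Container& c, size_t i) noexcept : owner (c), index (i) {}

        operator T() const noexcept             { return owner.get (index); }            // read path
        ElementProxy& operator= (T v) noexcept  { owner.set (index, v); return *this; }  // write path

    private:
        Container& owner;
        size_t index;
    };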
diff --git a/modules/juce_dsp/containers/juce_SIMDRegister_Impl.h b/modules/juce_dsp/containers/juce_SIMDRegister_Impl.h
new file mode 100644
index 0000000000..6d11240e3d
--- /dev/null
+++ b/modules/juce_dsp/containers/juce_SIMDRegister_Impl.h
@@ -0,0 +1,176 @@
+/*
+  ==============================================================================
+
+   This file is part of the JUCE library.
+   Copyright (c) 2017 - ROLI Ltd.
+
+   JUCE is an open source library subject to commercial or open-source
+   licensing.
+
+   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
+   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
+   27th April 2017).
+
+   End User License Agreement: www.juce.com/juce-5-licence
+   Privacy Policy: www.juce.com/juce-5-privacy-policy
+
+   Or: You may also use this code under the terms of the GPL v3 (see
+   www.gnu.org/licenses).
+
+   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
+   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
+   DISCLAIMED.
+
+  ==============================================================================
+*/
+
+namespace juce
+{
+namespace dsp
+{
+
+
+//==============================================================================
+template <typename Type>
+struct SIMDRegister<Type>::ElementAccess
+{
+    operator Type() const                                { return simd.get (idx); }
+    ElementAccess& operator= (Type scalar) noexcept      { simd.set (idx, scalar); return *this; }
+    ElementAccess& operator= (ElementAccess& o) noexcept { return operator= ((Type) o); }
+
+private:
+    friend struct SIMDRegister;
+    ElementAccess (SIMDRegister& owner, size_t index) noexcept : simd (owner), idx (index) {}
+    SIMDRegister& simd;
+    size_t idx;
+};
+
+//==============================================================================
+/* This class is used internally by SIMDRegister to abstract away differences
+   in operations which are different for complex and pure floating point types. */
+
+// the pure floating-point version
+template <typename Scalar>
+struct CmplxSIMDOps
+{
+    typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const Scalar* a) noexcept
+    {
+        return SIMDNativeOps<Scalar>::load (a);
+    }
+
+    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, Scalar* dest) noexcept
+    {
+        SIMDNativeOps<Scalar>::store (value, dest);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (Scalar s) noexcept
+    {
+        return SIMDNativeOps<Scalar>::expand (s);
+    }
+
+    static inline Scalar JUCE_VECTOR_CALLTYPE get (vSIMDType v, std::size_t i) noexcept
+    {
+        return SIMDNativeOps<Scalar>::get (v, i);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE set (vSIMDType v, std::size_t i, Scalar s) noexcept
+    {
+        return SIMDNativeOps<Scalar>::set (v, i, s);
+    }
+
+    static inline Scalar JUCE_VECTOR_CALLTYPE sum (vSIMDType a) noexcept
+    {
+        return SIMDNativeOps<Scalar>::sum (a);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE mul (vSIMDType a, vSIMDType b) noexcept
+    {
+        return SIMDNativeOps<Scalar>::mul (a, b);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE muladd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept
+    {
+        return SIMDNativeOps<Scalar>::multiplyAdd (a, b, c);
+    }
+};
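The complex variant that follows is selected purely by partial specialisation: SIMDRegister<float> instantiates the primary template above, while SIMDRegister<std::complex<float>> picks the specialisation below. One consequence, sketched as a compile-time check (assumes <type_traits> is visible; not part of the patch):

    // Both element types resolve to the same native vector type: a complex
    // register packs re/im pairs into the scalar lanes of the same register.
    static_assert (std::is_same<CmplxSIMDOps<float>::vSIMDType,
                                CmplxSIMDOps<std::complex<float>>::vSIMDType>::value,
                   "complex registers reuse the scalar vector type");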
+
+// The pure complex version
+template <typename Scalar>
+struct CmplxSIMDOps<std::complex<Scalar>>
+{
+    typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const std::complex<Scalar>* a) noexcept
+    {
+        return SIMDNativeOps<Scalar>::load (reinterpret_cast<const Scalar*> (a));
+    }
+
+    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, std::complex<Scalar>* dest) noexcept
+    {
+        SIMDNativeOps<Scalar>::store (value, reinterpret_cast<Scalar*> (dest));
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (std::complex<Scalar> s) noexcept
+    {
+        const int n = sizeof (vSIMDType) / sizeof (Scalar);
+
+        union
+        {
+            vSIMDType v;
+            Scalar floats[n];
+        } u;
+
+        for (int i = 0; i < n; ++i)
+            u.floats[i] = (i & 1) == 0 ? s.real() : s.imag();
+
+        return u.v;
+    }
+
+    static inline std::complex<Scalar> JUCE_VECTOR_CALLTYPE get (vSIMDType v, std::size_t i) noexcept
+    {
+        auto j = i << 1;
+        return std::complex<Scalar> (SIMDNativeOps<Scalar>::get (v, j), SIMDNativeOps<Scalar>::get (v, j + 1));
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE set (vSIMDType v, std::size_t i, std::complex<Scalar> s) noexcept
+    {
+        auto j = i << 1;
+        return SIMDNativeOps<Scalar>::set (SIMDNativeOps<Scalar>::set (v, j, s.real()), j + 1, s.imag());
+    }
+
+    static inline std::complex<Scalar> JUCE_VECTOR_CALLTYPE sum (vSIMDType a) noexcept
+    {
+        vSIMDType result = SIMDNativeOps<Scalar>::oddevensum (a);
+        auto* ptr = reinterpret_cast<const Scalar*> (&result);
+        return std::complex<Scalar> (ptr[0], ptr[1]);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE mul (vSIMDType a, vSIMDType b) noexcept
+    {
+        return SIMDNativeOps<Scalar>::cmplxmul (a, b);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE muladd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept
+    {
+        return SIMDNativeOps<Scalar>::add (a, SIMDNativeOps<Scalar>::cmplxmul (b, c));
+    }
+};
+
+//==============================================================================
+ namespace util
+ {
+     template <typename Type>
+     inline void snapToZero (SIMDRegister<Type>&) noexcept {}
+ }
+
+} // namespace dsp
+
+// Extend some commonly used global functions to SIMDRegister types
+template <typename Type>
+inline dsp::SIMDRegister<Type> JUCE_VECTOR_CALLTYPE jmin (dsp::SIMDRegister<Type> a, dsp::SIMDRegister<Type> b) { return dsp::SIMDRegister<Type>::min (a, b); }
+template <typename Type>
+inline dsp::SIMDRegister<Type> JUCE_VECTOR_CALLTYPE jmax (dsp::SIMDRegister<Type> a, dsp::SIMDRegister<Type> b) { return dsp::SIMDRegister<Type>::max (a, b); }
+
+} // namespace juce
diff --git a/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp b/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp
index 5b13b63a21..8213aa3439 100644
--- a/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp
+++ b/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp
@@ -31,37 +31,78 @@ namespace dsp
 namespace SIMDRegister_test_internal
 {
+    template <typename type, typename = void> struct RandomPrimitive {};
+
     template <typename type>
-    static void fillRandom (type* dst, const int size, Random& random)
+    struct RandomPrimitive<type, typename std::enable_if<std::is_floating_point<type>::value>::type>
     {
-        bool is_signed = std::is_signed<type>::value;
+        static type next (Random& random)
+        {
+            return static_cast<type> (std::is_signed<type>::value ? (random.nextFloat() * 16.0) - 8.0
+                                                                  : (random.nextFloat() * 8.0));
+
+        }
+    };
 
-        for (int i = 0; i < size; ++i)
+    template <typename type>
+    struct RandomPrimitive<type, typename std::enable_if<std::is_integral<type>::value>::type>
+    {
+        static type next (Random& random)
         {
-            if (is_signed)
-            {
-                *dst++ = static_cast<type> ((random.nextFloat() * 16.0) - 8.0);
-            }
-            else
-            {
-                *dst++ = static_cast<type> (random.nextFloat() * 8.0);
-            }
+            return static_cast<type> (random.nextInt64());
+
         }
-    }
+    };
 
+    template <typename type> struct RandomValue { static type next (Random& random) { return RandomPrimitive<type>::next (random); } };
+
     template <typename type>
-    static void fillRandom (std::complex<type>* dst, const int size, Random& random)
+    struct RandomValue<std::complex<type>>
     {
-        for (int i = 0; i < size; ++i)
+        static std::complex<type> next (Random& random)
         {
-            type real, imag;
+            return {RandomPrimitive<type>::next (random), RandomPrimitive<type>::next (random)};
+        }
+    };
 
-            real = static_cast<type> ((random.nextFloat() * 16.0) - 8.0);
-            imag = static_cast<type> ((random.nextFloat() * 16.0) - 8.0);
-            *dst++ = std::complex<type> (real, imag);
+    template <typename type>
+    struct VecFiller
+    {
+        static void fill (type* dst, const int size, Random& random)
+        {
+            for (int i = 0; i < size; ++i)
+                dst[i] = RandomValue<type>::next (random);
         }
-    }
+    };
+
+    // We need to specialise for complex types: otherwise GCC 6 gives
+    // us an ICE internal compiler error after which the compiler seg faults.
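+    // (Illustrative note: complex elements are stored interleaved, so element i
+    // of a complex register occupies scalar lanes 2*i and 2*i + 1 — which is why
+    // CmplxSIMDOps::get/set in the implementation header compute j = i << 1.)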
+ template + struct VecFiller> + { + static void fill (std::complex* dst, const int size, Random& random) + { + for (int i = 0; i < size; ++i) + dst[i] = std::complex (RandomValue::next (random), RandomValue::next (random)); + } + }; + + template + struct VecFiller> + { + static SIMDRegister fill(Random& random) + { + constexpr int size = (int) SIMDRegister::SIMDNumElements; + #ifdef _MSC_VER + __declspec(align(sizeof (SIMDRegister))) type elements[size]; + #else + type elements[size] __attribute__((aligned(sizeof (SIMDRegister)))); + #endif + + VecFiller::fill (elements, size, random); + return SIMDRegister::fromRawArray (elements); + } + }; // Avoid visual studio warning template @@ -102,10 +143,17 @@ public: template static bool allValuesEqualTo (const SIMDRegister& vec, const type scalar) { + #ifdef _MSC_VER + __declspec(align(sizeof (SIMDRegister))) type elements[SIMDRegister::SIMDNumElements]; + #else + type elements[SIMDRegister::SIMDNumElements] __attribute__((aligned(sizeof (SIMDRegister)))); + #endif + + vec.copyToRawArray (elements); + // as we do not want to rely on the access operator we cast this to a primitive pointer - const type* ptr = reinterpret_cast (&vec); for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) - if (ptr[i] != scalar) return false; + if (elements[i] != scalar) return false; return true; } @@ -246,12 +294,20 @@ public: u.expect (allValuesEqualTo (SIMDRegister::expand (static_cast (23)), 23)); { - SIMDRegister a; + #ifdef _MSC_VER + __declspec(align(sizeof (SIMDRegister))) type elements[SIMDRegister::SIMDNumElements]; + #else + type elements[SIMDRegister::SIMDNumElements] __attribute__((aligned(sizeof (SIMDRegister)))); + #endif + SIMDRegister_test_internal::VecFiller::fill (elements, SIMDRegister::SIMDNumElements, random); + SIMDRegister a (SIMDRegister::fromRawArray (elements)); + + u.expect (vecEqualToArray (a, elements)); - type* ptr = reinterpret_cast(&a); - SIMDRegister_test_internal::fillRandom (ptr, SIMDRegister::SIMDNumElements, random); + SIMDRegister b (a); + a *= static_cast (2); - u.expect (vecEqualToArray (SIMDRegister (a), ptr)); + u.expect (vecEqualToArray (b, elements)); } } }; @@ -265,7 +321,7 @@ public: SIMDRegister a; type array [SIMDRegister::SIMDNumElements]; - SIMDRegister_test_internal::fillRandom (array, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array, SIMDRegister::SIMDNumElements, random); // Test non-const access operator for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) @@ -290,14 +346,17 @@ public: for (int n = 0; n < 100; ++n) { // set-up - SIMDRegister a, b, c; + SIMDRegister a (static_cast (0)); + SIMDRegister b (static_cast (0)); + SIMDRegister c (static_cast (0)); + type array_a [SIMDRegister::SIMDNumElements]; type array_b [SIMDRegister::SIMDNumElements]; type array_c [SIMDRegister::SIMDNumElements]; - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_c, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_c, SIMDRegister::SIMDNumElements, random); copy (a, array_a); copy (b, array_b); copy (c, array_c); @@ -310,9 +369,9 @@ public: u.expect (vecEqualToArray (a, 
array_a)); u.expect (vecEqualToArray (b, array_b)); - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_c, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_c, SIMDRegister::SIMDNumElements, random); copy (a, array_a); copy (b, array_b); copy (c, array_c); @@ -326,9 +385,9 @@ public: u.expect (vecEqualToArray (b, array_b)); // set-up again - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_c, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_c, SIMDRegister::SIMDNumElements, random); copy (a, array_a); copy (b, array_b); copy (c, array_c); // test out-of-place with both params being vectors @@ -363,7 +422,6 @@ public: typedef typename SIMDRegister::vMaskType vMaskType; typedef typename SIMDRegister::MaskType MaskType; - for (int n = 0; n < 100; ++n) { // Check flip sign bit and using as a union @@ -372,21 +430,28 @@ public: union ConversionUnion { - inline ConversionUnion() {} + inline ConversionUnion() : floatVersion (static_cast (0)) {} inline ~ConversionUnion() {} SIMDRegister floatVersion; vMaskType intVersion; } a, b; vMaskType bitmask = vMaskType::expand (static_cast (1) << (sizeof (MaskType) - 1)); - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); copy (a.floatVersion, array_a); copy (b.floatVersion, array_a); Operation::template inplace, vMaskType> (a.floatVersion, bitmask); Operation::template inplace (b.intVersion, bitmask); - u.expect (vecEqualToArray (a.floatVersion, reinterpret_cast (&b.floatVersion))); + #ifdef _MSC_VER + __declspec(align(sizeof (SIMDRegister))) type elements[SIMDRegister::SIMDNumElements]; + #else + type elements[SIMDRegister::SIMDNumElements] __attribute__((aligned(sizeof (SIMDRegister)))); + #endif + b.floatVersion.copyToRawArray (elements); + + u.expect (vecEqualToArray (a.floatVersion, elements)); } // set-up @@ -397,42 +462,51 @@ public: MaskType array_b [SIMDRegister::SIMDNumElements]; MaskType array_c [SIMDRegister::SIMDNumElements]; - type* conv_a = reinterpret_cast (array_a); - type* conv_c = reinterpret_cast (array_c); + type float_a [SIMDRegister::SIMDNumElements]; + type float_c [SIMDRegister::SIMDNumElements]; - SIMDRegister_test_internal::fillRandom (conv_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (conv_c, SIMDRegister::SIMDNumElements, random); - copy (a, conv_a); copy (b, array_b); copy (c, conv_c); + SIMDRegister_test_internal::VecFiller::fill (float_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, 
SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (float_c, SIMDRegister::SIMDNumElements, random); + + memcpy (array_a, float_a, sizeof (type) * SIMDRegister::SIMDNumElements); + memcpy (array_c, float_c, sizeof (type) * SIMDRegister::SIMDNumElements); + copy (a, float_a); copy (b, array_b); copy (c, float_c); // test in-place with both params being vectors for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) Operation::template inplace (array_a[i], array_b[i]); + memcpy (float_a, array_a, sizeof (type) * SIMDRegister::SIMDNumElements); Operation::template inplace, vMaskType> (a, b); - u.expect (vecEqualToArray (a, conv_a)); + u.expect (vecEqualToArray (a, float_a)); u.expect (vecEqualToArray (b, array_b)); - SIMDRegister_test_internal::fillRandom (conv_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (conv_c, SIMDRegister::SIMDNumElements, random); - copy (a, conv_a); copy (b, array_b); copy (c, conv_c); + SIMDRegister_test_internal::VecFiller::fill (float_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (float_c, SIMDRegister::SIMDNumElements, random); + memcpy (array_a, float_a, sizeof (type) * SIMDRegister::SIMDNumElements); + memcpy (array_c, float_c, sizeof (type) * SIMDRegister::SIMDNumElements); + copy (a, float_a); copy (b, array_b); copy (c, float_c); // test in-place with one param being scalar for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) Operation::template inplace (array_a[i], static_cast (9)); + memcpy (float_a, array_a, sizeof (type) * SIMDRegister::SIMDNumElements); Operation::template inplace, MaskType> (a, static_cast (9)); - u.expect (vecEqualToArray (a, conv_a)); + u.expect (vecEqualToArray (a, float_a)); u.expect (vecEqualToArray (b, array_b)); // set-up again - SIMDRegister_test_internal::fillRandom (conv_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (conv_c, SIMDRegister::SIMDNumElements, random); - copy (a, conv_a); copy (b, array_b); copy (c, conv_c); + SIMDRegister_test_internal::VecFiller::fill (float_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (float_c, SIMDRegister::SIMDNumElements, random); + memcpy (array_a, float_a, sizeof (type) * SIMDRegister::SIMDNumElements); + memcpy (array_c, float_c, sizeof (type) * SIMDRegister::SIMDNumElements); + copy (a, float_a); copy (b, array_b); copy (c, float_c); // test out-of-place with both params being vectors for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) @@ -440,22 +514,26 @@ public: array_c[i] = Operation::template outofplace (array_a[i], array_b[i]); } + memcpy (float_a, array_a, sizeof (type) * SIMDRegister::SIMDNumElements); + memcpy (float_c, array_c, sizeof (type) * SIMDRegister::SIMDNumElements); c = Operation::template outofplace, vMaskType> (a, b); - u.expect (vecEqualToArray (a, conv_a)); + u.expect (vecEqualToArray (a, float_a)); u.expect (vecEqualToArray (b, array_b)); - u.expect (vecEqualToArray (c, conv_c)); + u.expect (vecEqualToArray (c, float_c)); // test out-of-place with one param being scalar for (size_t i = 0; 
i < SIMDRegister::SIMDNumElements; ++i) array_c[i] = Operation::template outofplace (array_a[i], static_cast (9)); + memcpy (float_a, array_a, sizeof (type) * SIMDRegister::SIMDNumElements); + memcpy (float_c, array_c, sizeof (type) * SIMDRegister::SIMDNumElements); c = Operation::template outofplace, MaskType> (a, static_cast (9)); - u.expect (vecEqualToArray (a, conv_a)); + u.expect (vecEqualToArray (a, float_a)); u.expect (vecEqualToArray (b, array_b)); - u.expect (vecEqualToArray (c, conv_c)); + u.expect (vecEqualToArray (c, float_c)); } } }; @@ -481,8 +559,8 @@ public: MaskType array_ge [SIMDRegister::SIMDNumElements]; - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); // do check for (size_t j = 0; j < SIMDRegister::SIMDNumElements; ++j) @@ -495,7 +573,9 @@ public: array_ge [j] = (array_a[j] >= array_b[j]) ? static_cast (-1) : 0; } - SIMDRegister a, b; + SIMDRegister a (static_cast (0)); + SIMDRegister b (static_cast (0)); + vMaskType eq, neq, lt, le, gt, ge; copy (a, array_a); @@ -517,8 +597,8 @@ public: do { - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); } while (std::equal (array_a, array_a + SIMDRegister::SIMDNumElements, array_b)); copy (a, array_a); @@ -528,7 +608,7 @@ public: u.expect (! (a == b)); u.expect (! (b == a)); - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); copy (a, array_a); copy (b, array_a); @@ -537,7 +617,7 @@ public: u.expect (! (a != b)); u.expect (! (b != a)); - auto scalar = a[0]; + type scalar = a[0]; a = SIMDRegister::expand (scalar); u.expect (a == scalar); @@ -562,10 +642,10 @@ public: type array_c [SIMDRegister::SIMDNumElements]; type array_d [SIMDRegister::SIMDNumElements]; - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_c, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_d, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_c, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_d, SIMDRegister::SIMDNumElements, random); // check for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) @@ -607,7 +687,10 @@ public: array_max[j] = (array_a[j] > array_b[j]) ? 
array_a[j] : array_b[j]; } - SIMDRegister a, b, vMin, vMax; + SIMDRegister a (static_cast (0)); + SIMDRegister b (static_cast (0)); + SIMDRegister vMin (static_cast (0)); + SIMDRegister vMax (static_cast (0)); copy (a, array_a); copy (b, array_b); @@ -638,7 +721,7 @@ public: type array [SIMDRegister::SIMDNumElements]; type sumCheck = 0; - SIMDRegister_test_internal::fillRandom (array, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array, SIMDRegister::SIMDNumElements, random); for (size_t j = 0; j < SIMDRegister::SIMDNumElements; ++j) { @@ -674,14 +757,14 @@ public: u.expect (a != value); u.expect (! (a == value)); - SIMDRegister_test_internal::fillRandom (array, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array, SIMDRegister::SIMDNumElements, random); copy (a, array); copy (b, array); u.expect (a == b); u.expect (! (a != b)); - SIMDRegister_test_internal::fillRandom (array, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array, SIMDRegister::SIMDNumElements, random); copy (b, array); u.expect (a != b); diff --git a/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h b/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h index 1c20774346..642642fcf4 100644 --- a/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h +++ b/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h @@ -71,11 +71,11 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (float, kOne); //============================================================================== - static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast (a); } + static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return load (a); } + static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return _mm256_castsi256_ps (_mm256_load_si256 ((const __m256i*) a)); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); } - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); } @@ -95,6 +95,8 @@ struct SIMDNativeOps static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE swapevenodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); } + static forcedinline float JUCE_VECTOR_CALLTYPE get (__m256 v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256 JUCE_VECTOR_CALLTYPE set (__m256 v, size_t i, float s) noexcept { 
return SIMDFallbackOps::set (v, i, s); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE oddevensum (__m256 a) noexcept { a = _mm256_add_ps (_mm256_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); @@ -114,7 +116,12 @@ struct SIMDNativeOps __m256 retval = _mm256_dp_ps (a, vconst (kOne), 0xff); __m256 tmp = _mm256_permute2f128_ps (retval, retval, 1); retval = _mm256_add_ps (retval, tmp); - return ((float*) &retval)[0]; + + #if JUCE_GCC + return retval[0]; + #else + return _mm256_cvtss_f32 (retval); + #endif } }; @@ -134,8 +141,8 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (double, kOne); //============================================================================== - static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast (a); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return load (a); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return _mm256_castsi256_pd (_mm256_load_si256 ((const __m256i*) a)); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); } static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); } @@ -153,12 +160,15 @@ struct SIMDNativeOps static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); } - static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept { return (_mm256_movemask_pd (equal (a, b)) == 0xf); } + static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept { return (_mm256_movemask_pd (equal (a, b)) == 0xf); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); } + static forcedinline double JUCE_VECTOR_CALLTYPE get (__m256d v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE set (__m256d v, size_t i, double s) noexcept { return SIMDFallbackOps::set (v, i, s); } + //============================================================================== static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept @@ -173,7 +183,12 @@ struct SIMDNativeOps __m256d retval = _mm256_hadd_pd (a, a); __m256d tmp = 
_mm256_permute2f128_pd (retval, retval, 1); retval = _mm256_add_pd (retval, tmp); - return ((double*) &retval)[0]; + + #if JUCE_GCC + return retval[0]; + #else + return _mm256_cvtsd_f64 (retval); + #endif } }; @@ -190,15 +205,16 @@ struct SIMDNativeOps //============================================================================== DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet); - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); } @@ -207,22 +223,10 @@ struct SIMDNativeOps static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return _mm256_movemask_epi8 (equal (a, b)) == -1; } static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } + static forcedinline int8_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int8_t s) noexcept { return SIMDFallbackOps::set (v, i, s); } //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept - { - const auto* b = reinterpret_cast (a); - return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24], - b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16], - b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8], - b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); - } - - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept - { - 
SIMDFallbackOps::store (value, dest); - } - static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256()); @@ -234,10 +238,19 @@ struct SIMDNativeOps hi = _mm256_hadd_epi16 (hi, hi); } - const int8_t* lo_ptr = reinterpret_cast (&lo); - const int8_t* hi_ptr = reinterpret_cast (&hi); + #if JUCE_GCC + return (int8_t) ((lo[0] & 0xff) + + (hi[0] & 0xff) + + (lo[2] & 0xff) + + (hi[2] & 0xff)); + #else + constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6); - return (int8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]); + return (int8_t) ((_mm256_cvtsi256_si32 (lo) & 0xff) + + (_mm256_cvtsi256_si32 (hi) & 0xff) + + (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask)) & 0xff) + + (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask)) & 0xff)); + #endif } static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) @@ -266,16 +279,17 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (uint8_t, kHighBit); DECLARE_AVX_SIMD_CONST (uint8_t, kAllBitsSet); - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm256_set1_epi8 ((int8_t) s); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); } @@ -284,22 +298,10 @@ struct SIMDNativeOps static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } static 
forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } + static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint8_t s) noexcept { return SIMDFallbackOps::set (v, i, s); } //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept - { - const auto* b = reinterpret_cast (a); - return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24], - b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16], - b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8], - b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); - } - - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept - { - SIMDFallbackOps::store (value, dest); - } - static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256()); @@ -311,10 +313,19 @@ struct SIMDNativeOps hi = _mm256_hadd_epi16 (hi, hi); } - const uint8_t* lo_ptr = reinterpret_cast (&lo); - const uint8_t* hi_ptr = reinterpret_cast (&hi); + #if JUCE_GCC + return (uint8_t) ((static_cast (lo[0]) & 0xffu) + + (static_cast (hi[0]) & 0xffu) + + (static_cast (lo[2]) & 0xffu) + + (static_cast (hi[2]) & 0xffu)); + #else + constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6); - return (uint8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]); + return (uint8_t) ((static_cast (_mm256_cvtsi256_si32 (lo)) & 0xffu) + + (static_cast (_mm256_cvtsi256_si32 (hi)) & 0xffu) + + (static_cast (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask))) & 0xffu) + + (static_cast (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask))) & 0xffu)); + #endif } static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) @@ -343,8 +354,9 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (int16_t, kAllBitsSet); //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm256_set1_epi16 (s); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); } @@ -352,7 +364,7 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { 
return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi16 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi16 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); } @@ -361,26 +373,24 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); } + static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int16_t s) noexcept { return SIMDFallbackOps::set (v, i, s); } //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept - { - return _mm256_set_epi16 (a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8], - a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]); - } - - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept - { - SIMDFallbackOps::store (value, dest); - } - static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i tmp = _mm256_hadd_epi16 (a, a); tmp = _mm256_hadd_epi16 (tmp, tmp); tmp = _mm256_hadd_epi16 (tmp, tmp); - int16_t* ptr = reinterpret_cast (&tmp); - return (int16_t) (ptr[0] + ptr[8]); + + #if JUCE_GCC + return (int16_t) ((tmp[0] & 0xffff) + (tmp[2] & 0xffff)); + #else + constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6); + + return (int16_t) ((_mm256_cvtsi256_si32 (tmp) & 0xffff) + + (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)) & 0xffff)); + #endif } }; @@ -400,46 +410,45 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (uint16_t, kAllBitsSet); //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept 
{ return _mm256_xor_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } - static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, 
__m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } + static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); } + static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint16_t s) noexcept { return SIMDFallbackOps::set (v, i, s); } //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept - { - const auto* b = reinterpret_cast (a); - return _mm256_set_epi16 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8], - b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); - } - - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept - { - SIMDFallbackOps::store (value, dest); - } - static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i tmp = _mm256_hadd_epi16 (a, a); tmp = _mm256_hadd_epi16 (tmp, tmp); tmp = _mm256_hadd_epi16 (tmp, tmp); - uint16_t* ptr = reinterpret_cast (&tmp); - return (uint16_t) (ptr[0] + ptr[8]); + + #if JUCE_GCC + return (uint16_t) ((static_cast (tmp[0]) & 0xffffu) + + (static_cast (tmp[2]) & 0xffffu)); + #else + constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6); + + return (uint16_t) ((static_cast (_mm256_cvtsi256_si32 (tmp)) & 0xffffu) + + (static_cast (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask))) & 0xffffu)); + #endif } }; @@ -458,8 +467,9 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet); //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm256_set1_epi32 (s); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); } @@ -467,7 +477,7 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) 
noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi32 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi32 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); } @@ -476,24 +486,22 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); } + static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int32_t s) noexcept { return SIMDFallbackOps::set (v, i, s); } //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept - { - return _mm256_set_epi32 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]); - } - - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept - { - SIMDFallbackOps::store (value, dest); - } - static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i tmp = _mm256_hadd_epi32 (a, a); tmp = _mm256_hadd_epi32 (tmp, tmp); - int32_t* ptr = reinterpret_cast (&tmp); - return ptr[0] + ptr[4]; + + #if JUCE_GCC + return tmp[0] + tmp[2]; + #else + constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6); + + return _mm256_cvtsi256_si32 (tmp) + _mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)); + #endif } }; @@ -513,44 +521,43 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit); //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); 
@@ -513,44 +521,43 @@ struct SIMDNativeOps<uint32_t>
     DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit);
 
     //==============================================================================
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
+    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint32_t, __m256i>::get (v, i); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint32_t s) noexcept { return SIMDFallbackOps<uint32_t, __m256i>::set (v, i, s); }
 
     //==============================================================================
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const int32_t*> (a);
-        return _mm256_set_epi32 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint32_t, __m256i>::store (value, dest);
-    }
-
     static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i tmp = _mm256_hadd_epi32 (a, a);
         tmp = _mm256_hadd_epi32 (tmp, tmp);
-        uint32_t* ptr = reinterpret_cast<uint32_t*> (&tmp);
-        return ptr[0] + ptr[4];
+
+       #if JUCE_GCC
+        return static_cast<uint32_t> (tmp[0]) + static_cast<uint32_t> (tmp[2]);
+       #else
+        constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
+
+        return static_cast<uint32_t> (_mm256_cvtsi256_si32 (tmp))
+             + static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)));
+       #endif
    }
 };
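Note that _mm256_load_si256 / _mm256_store_si256 are the aligned forms, so the new load/store assume 32-byte-aligned pointers (presumably guaranteed for the DECLARE_AVX_SIMD_CONST data and for SIMDRegister's own storage). A minimal sketch of that alignment contract (names illustrative):

    #include <immintrin.h>
    #include <cstdint>

    alignas (32) static const uint32_t allBits[8] =
        { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
          0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff };

    static __m256i loadAllBits()
    {
        // aligned load: the pointer must be 32-byte aligned, which alignas guarantees
        return _mm256_load_si256 (reinterpret_cast<const __m256i*> (allBits));
    }

_mm256_loadu_si256 would be the unaligned alternative, at some cost on older hardware.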
@@ -568,14 +575,16 @@ struct SIMDNativeOps<int64_t>
     //==============================================================================
     DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
 
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { return _mm256_set1_epi64x ((int64_t) s); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
@@ -584,47 +593,10 @@ struct SIMDNativeOps<int64_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
-
-    //==============================================================================
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept
-    {
-        return _mm256_set_epi64x (a[3], a[2], a[1], a[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept
-    {
-        SIMDFallbackOps<int64_t, __m256i>::store (value, dest);
-    }
-
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
-    {
-       #ifdef _MSC_VER
-        __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
-        return *reinterpret_cast<const __m256i*> (&tmp);
-       #else
-        return _mm256_set1_epi64x ((int64_t) s);
-       #endif
-    }
-
-    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
-    {
-        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
-        return ptr[0] + ptr[1] + ptr[2] + ptr[3];
-    }
-
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
-    {
-        __m256i retval;
-
-        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
-        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
-        int64_t* dst = reinterpret_cast<int64_t*> (&retval);
-
-        for (int i = 0; i < 4; ++i)
-            dst[i] = aptr[i] * bptr[i];
-
-        return retval;
-    }
+    static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int64_t, __m256i>::get (v, i); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int64_t s) noexcept { return SIMDFallbackOps<int64_t, __m256i>::set (v, i, s); }
+    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { return SIMDFallbackOps<int64_t, __m256i>::sum (a); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return SIMDFallbackOps<int64_t, __m256i>::mul (a, b); }
 };
 
 //==============================================================================
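AVX2 provides no full 64-bit element multiply (_mm256_mul_epi32 widens 32-bit inputs, and a 64-bit mullo only arrives with AVX-512), so mul and sum for 64-bit elements now delegate to SIMDFallbackOps, which works element-wise. A sketch of what such a fallback multiply amounts to (assumes an AVX2 target; the helper name is illustrative):

    #include <immintrin.h>
    #include <cstdint>

    static __m256i mul64 (__m256i a, __m256i b)
    {
        alignas (32) int64_t x[4], y[4];
        _mm256_store_si256 (reinterpret_cast<__m256i*> (x), a);
        _mm256_store_si256 (reinterpret_cast<__m256i*> (y), b);

        for (int i = 0; i < 4; ++i)   // scalar multiply per 64-bit lane
            x[i] *= y[i];

        return _mm256_load_si256 (reinterpret_cast<const __m256i*> (x));
    }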
@@ -642,65 +614,29 @@ struct SIMDNativeOps<uint64_t>
     DECLARE_AVX_SIMD_CONST (uint64_t, kAllBitsSet);
     DECLARE_AVX_SIMD_CONST (uint64_t, kHighBit);
 
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
-
-    //==============================================================================
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const int64_t*> (a);
-        return _mm256_set_epi64x (b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint64_t, __m256i>::store (value, dest);
-    }
-
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
-    {
-       #ifdef _MSC_VER
-        __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
-        return *reinterpret_cast<const __m256i*> (&tmp);
-       #else
-        return _mm256_set1_epi64x ((int64_t) s);
-       #endif
-    }
-
-    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
-    {
-        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
-        return ptr[0] + ptr[1] + ptr[2] + ptr[3];
-    }
-
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
-    {
-        __m256i retval;
-
-        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
-        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
-        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);
-
-        for (int i = 0; i < 4; ++i)
-            dst[i] = aptr[i] * bptr[i];
-
-        return retval;
-    }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { return _mm256_set1_epi64x ((int64_t) s); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
+    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::get (v, i); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint64_t s) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::set (v, i, s); }
+    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::sum (a); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::mul (a, b); }
 };
 
 #endif
diff --git a/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h b/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h
index d7336c8818..1502d26cbe 100644
--- a/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h
@@ -63,8 +63,11 @@ struct SIMDFallbackOps
     static constexpr size_t mask = (sizeof (vSIMDType) / sizeof (ScalarType)) - 1;
     static constexpr size_t bits = SIMDInternal::Log2Helper<n>::value;
 
-    // corresponding mask type
-    typedef typename SIMDInternal::MaskTypeFor<ScalarType>::type MaskType;
+    // helper types
+    using MaskType = typename SIMDInternal::MaskTypeFor<ScalarType>::type;
+    union UnionType     { vSIMDType v; ScalarType s[n]; };
+    union UnionMaskType { vSIMDType v; MaskType   m[n]; };
+
     // fallback methods
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return apply<ScalarAdd> (a, b); }
@@ -82,69 +85,80 @@ struct SIMDFallbackOps
     static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return cmp<ScalarGt> (a, b); }
     static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return cmp<ScalarGeq> (a, b); }
 
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept
+    static forcedinline ScalarType get (vSIMDType v, size_t i) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<MaskType*> (&retval);
-        auto* aSrc = reinterpret_cast<const MaskType*> (&a);
+        UnionType u {v};
+        return u.s[i];
+    }
+
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, ScalarType s) noexcept
+    {
+        UnionType u {v};
+
+        u.s[i] = s;
+        return u.v;
+    }
+
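The UnionType/UnionMaskType members replace the previous reinterpret_cast pointer gymnastics: writing one union member and reading the other is the conventional type-punning idiom that the supported compilers define, and it sidesteps the aliasing concerns of casting the address of a vector to ScalarType*. A reduced sketch of the same get/set idiom (generic names, not the library's):

    #include <cstddef>

    template <typename Vec, typename Scalar, size_t N>
    struct LaneAccess
    {
        union U { Vec v; Scalar s[N]; };   // the same storage viewed two ways

        static Scalar get (Vec v, size_t i) noexcept        { U u {v}; return u.s[i]; }
        static Vec set (Vec v, size_t i, Scalar x) noexcept { U u {v}; u.s[i] = x; return u.v; }
    };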
+    static forcedinline vSIMDType bit_not (vSIMDType av) noexcept
+    {
+        UnionMaskType a {av};
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = ~aSrc [i];
+            a.m[i] = ~a.m[i];
 
-        return retval;
+        return a.v;
     }
 
-    static forcedinline ScalarType sum (vSIMDType a) noexcept
+    static forcedinline ScalarType sum (vSIMDType av) noexcept
     {
+        UnionType a {av};
         auto retval = static_cast<ScalarType> (0);
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
 
         for (size_t i = 0; i < n; ++i)
-            retval += aSrc [i];
+            retval += a.s[i];
 
         return retval;
     }
 
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept
+    static forcedinline vSIMDType multiplyAdd (vSIMDType av, vSIMDType bv, vSIMDType cv) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<ScalarType*> (&retval);
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
-        auto* bSrc = reinterpret_cast<const ScalarType*> (&b);
-        auto* cSrc = reinterpret_cast<const ScalarType*> (&c);
+        UnionType a {av}, b {bv}, c {cv};
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = aSrc [i] + (bSrc [i] * cSrc [i]);
+            a.s[i] += b.s[i] * c.s[i];
 
-        return retval;
+        return a.v;
     }
 
     //==============================================================================
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept
+    static forcedinline bool allEqual (vSIMDType av, vSIMDType bv) noexcept
     {
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
-        auto* bSrc = reinterpret_cast<const ScalarType*> (&b);
+        UnionType a {av}, b {bv};
 
         for (size_t i = 0; i < n; ++i)
-            if (aSrc[i] != bSrc[i])
+            if (a.s[i] != b.s[i])
                 return false;
 
         return true;
     }
 
     //==============================================================================
-    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept
+    static forcedinline vSIMDType cmplxmul (vSIMDType av, vSIMDType bv) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<std::complex<ScalarType>*> (&retval);
-        auto* aSrc = reinterpret_cast<const std::complex<ScalarType>*> (&a);
-        auto* bSrc = reinterpret_cast<const std::complex<ScalarType>*> (&b);
+        UnionType a {av}, b {bv}, r;
 
         const int m = n >> 1;
         for (int i = 0; i < m; ++i)
-            dst [i] = aSrc [i] * bSrc [i];
+        {
+            std::complex<ScalarType> result
+                  = std::complex<ScalarType> (a.s[i<<1], a.s[(i<<1)|1])
+                  * std::complex<ScalarType> (b.s[i<<1], b.s[(i<<1)|1]);
 
-        return retval;
+            r.s[i<<1]     = result.real();
+            r.s[(i<<1)|1] = result.imag();
+        }
+
+        return r.v;
     }
 
     struct ScalarAdd { static forcedinline ScalarType op (ScalarType a, ScalarType b) noexcept { return a + b; } };
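cmplxmul treats the vector as interleaved complex numbers (re, im, re, im, ...), so the scalar indices i<<1 and (i<<1)|1 address the real and imaginary parts of the i-th complex element. The same computation over a plain interleaved array, as a sketch:

    #include <complex>
    #include <cstddef>

    template <typename Scalar, std::size_t N>   // N scalars = N/2 complex values
    void complexMulInterleaved (const Scalar (&a)[N], const Scalar (&b)[N], Scalar (&r)[N])
    {
        for (std::size_t i = 0; i < N / 2; ++i)
        {
            auto c = std::complex<Scalar> (a[2 * i], a[2 * i + 1])
                   * std::complex<Scalar> (b[2 * i], b[2 * i + 1]);

            r[2 * i]     = c.real();
            r[2 * i + 1] = c.imag();
        }
    }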
@@ -163,90 +177,78 @@ struct SIMDFallbackOps
 
     // generic apply routines for operations above
     template <typename Op>
-    static forcedinline vSIMDType apply (vSIMDType a, vSIMDType b) noexcept
+    static forcedinline vSIMDType apply (vSIMDType av, vSIMDType bv) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<ScalarType*> (&retval);
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
-        auto* bSrc = reinterpret_cast<const ScalarType*> (&b);
+        UnionType a {av}, b {bv};
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = Op::op (aSrc [i], bSrc [i]);
+            a.s[i] = Op::op (a.s[i], b.s[i]);
 
-        return retval;
+        return a.v;
     }
 
     template <typename Op>
-    static forcedinline vSIMDType cmp (vSIMDType a, vSIMDType b) noexcept
+    static forcedinline vSIMDType cmp (vSIMDType av, vSIMDType bv) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<MaskType*> (&retval);
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
-        auto* bSrc = reinterpret_cast<const ScalarType*> (&b);
+        UnionType a {av}, b {bv};
+        UnionMaskType r;
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = Op::op (aSrc [i], bSrc [i]) ? static_cast<MaskType> (-1) : static_cast<MaskType> (0);
+            r.m[i] = Op::op (a.s[i], b.s[i]) ? static_cast<MaskType> (-1) : static_cast<MaskType> (0);
 
-        return retval;
+        return r.v;
     }
 
     template <typename Op>
-    static forcedinline vSIMDType bitapply (vSIMDType a, vSIMDType b) noexcept
+    static forcedinline vSIMDType bitapply (vSIMDType av, vSIMDType bv) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<MaskType*> (&retval);
-        auto* aSrc = reinterpret_cast<const MaskType*> (&a);
-        auto* bSrc = reinterpret_cast<const MaskType*> (&b);
+        UnionMaskType a {av}, b {bv};
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = Op::op (aSrc [i], bSrc [i]);
+            a.m[i] = Op::op (a.m[i], b.m[i]);
 
-        return retval;
+        return a.v;
     }
 
     static forcedinline vSIMDType expand (ScalarType s) noexcept
     {
-        vSIMDType retval;
-        auto* dst = reinterpret_cast<ScalarType*> (&retval);
+        UnionType r;
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = s;
+            r.s[i] = s;
 
-        return retval;
+        return r.v;
    }
 
     static forcedinline vSIMDType load (const ScalarType* a) noexcept
     {
-        vSIMDType retval;
-        auto* dst = reinterpret_cast<ScalarType*> (&retval);
+        UnionType r;
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = a[i];
+            r.s[i] = a[i];
 
-        return retval;
+        return r.v;
     }
 
-    static forcedinline void store (vSIMDType value, ScalarType* dest) noexcept
+    static forcedinline void store (vSIMDType av, ScalarType* dest) noexcept
     {
-        const auto* src = reinterpret_cast<const ScalarType*> (&value);
+        UnionType a {av};
 
         for (size_t i = 0; i < n; ++i)
-            dest[i] = src[i];
+            dest[i] = a.s[i];
     }
 
     template <unsigned int shuffle_idx>
-    static forcedinline vSIMDType shuffle (vSIMDType a) noexcept
+    static forcedinline vSIMDType shuffle (vSIMDType av) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<ScalarType*> (&retval);
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
+        UnionType a {av}, r;
 
         // the compiler will unroll this loop and the index can
         // be computed at compile-time, so this will be super fast
         for (size_t i = 0; i < n; ++i)
-            dst [i] = aSrc [(shuffle_idx >> (bits * i)) & mask];
+            r.s[i] = a.s[(shuffle_idx >> (bits * i)) & mask];
 
-        return retval;
+        return r.v;
     }
 };
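shuffle's template parameter packs one source index per destination element, `bits` bits each (bits being the log2 of the element count), in the same spirit as the SSE _MM_SHUFFLE immediate. Decoded for four elements (a sketch; names are illustrative):

    #include <cstddef>

    template <unsigned mask, typename T>
    void shuffle4 (const T (&src)[4], T (&dst)[4])
    {
        for (std::size_t i = 0; i < 4; ++i)
            dst[i] = src[(mask >> (2 * i)) & 3];   // 2 bits select each source element
    }

    // shuffle4<(2 << 0) | (3 << 2) | (0 << 4) | (1 << 6)> (a, r) yields
    // r = { a[2], a[3], a[0], a[1] } — the half-swap used by oddevensum.

Because the mask is a compile-time constant, the loop unrolls and every index folds to a constant, as the comment in the code promises.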
diff --git a/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h b/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
index af0b6cd6b7..5eef5051f6 100644
--- a/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
@@ -71,24 +71,26 @@ struct SIMDNativeOps<uint32_t>
     DECLARE_NEON_SIMD_CONST (uint32_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); }
-    static forcedinline vSIMDType load (const uint32_t* a) noexcept { return vld1q_u32 (a); }
-    static forcedinline void store (vSIMDType value, uint32_t* a) noexcept { vst1q_u32 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u32 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u32 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u32 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u32 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u32 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u32 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u32 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u32 ((uint32_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u32 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u32 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u32 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (sum (notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u32 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u32 (a, b); }
+    static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); }
+    static forcedinline vSIMDType load (const uint32_t* a) noexcept { return vld1q_u32 (a); }
+    static forcedinline void store (vSIMDType value, uint32_t* a) noexcept { vst1q_u32 (a, value); }
+    static forcedinline uint32_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, uint32_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u32 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u32 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u32 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u32 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u32 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u32 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u32 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u32 ((uint32_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u32 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u32 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u32 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (sum (notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u32 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u32 (a, b); }
     static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u32 (a, b, c); }
     static forcedinline uint32_t sum (vSIMDType a) noexcept
     {
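get and set can subscript the register directly because GCC and Clang expose the NEON types such as uint32x4_t as vector extensions, which support v[i] with a runtime index; the vgetq_lane_u32 / vsetq_lane_u32 intrinsics would otherwise require a compile-time lane number. A sketch (assumes a NEON target and one of those compilers; names illustrative):

    #include <arm_neon.h>
    #include <cstddef>

    static uint32_t getLane (uint32x4_t v, std::size_t i) noexcept             { return v[i]; }
    static uint32x4_t setLane (uint32x4_t v, std::size_t i, uint32_t s) noexcept { v[i] = s; return v; }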
@@ -113,24 +115,26 @@ struct SIMDNativeOps<int32_t>
     DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (int32_t s) noexcept { return vdupq_n_s32 (s); }
-    static forcedinline vSIMDType load (const int32_t* a) noexcept { return vld1q_s32 (a); }
-    static forcedinline void store (vSIMDType value, int32_t* a) noexcept { vst1q_s32 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s32 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s32 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s32 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s32 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s32 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s32 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s32 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s32 ((int32_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s32 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s32 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s32 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (sum (notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s32 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s32 (a, b); }
+    static forcedinline vSIMDType expand (int32_t s) noexcept { return vdupq_n_s32 (s); }
+    static forcedinline vSIMDType load (const int32_t* a) noexcept { return vld1q_s32 (a); }
+    static forcedinline void store (vSIMDType value, int32_t* a) noexcept { vst1q_s32 (a, value); }
+    static forcedinline int32_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, int32_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s32 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s32 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s32 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s32 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s32 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s32 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s32 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s32 ((int32_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s32 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s32 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s32 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (sum (notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s32 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s32 (a, b); }
     static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s32 (a, b, c); }
     static forcedinline int32_t sum (vSIMDType a) noexcept
     {
@@ -156,26 +160,28 @@ struct SIMDNativeOps<int8_t>
     DECLARE_NEON_SIMD_CONST (int8_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (int8_t s) noexcept { return vdupq_n_s8 (s); }
-    static forcedinline vSIMDType load (const int8_t* a) noexcept { return vld1q_s8 (a); }
-    static forcedinline void store (vSIMDType value, int8_t* a) noexcept { vst1q_s8 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s8 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s8 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s8 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s8 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s8 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s8 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s8 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s8 ((int8_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s8 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s8 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s8 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s8 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s8 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s8 (a, b, c); }
-    static forcedinline int8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
+    static forcedinline vSIMDType expand (int8_t s) noexcept { return vdupq_n_s8 (s); }
+    static forcedinline vSIMDType load (const int8_t* a) noexcept { return vld1q_s8 (a); }
+    static forcedinline void store (vSIMDType value, int8_t* a) noexcept { vst1q_s8 (a, value); }
+    static forcedinline int8_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, int8_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s8 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s8 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s8 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s8 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s8 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s8 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s8 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s8 ((int8_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s8 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s8 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s8 (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s8 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s8 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s8 (a, b, c); }
+    static forcedinline int8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
 
 //==============================================================================
@@ -194,26 +200,28 @@ struct SIMDNativeOps<uint8_t>
     DECLARE_NEON_SIMD_CONST (uint8_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); }
-    static forcedinline vSIMDType load (const uint8_t* a) noexcept { return vld1q_u8 (a); }
-    static forcedinline void store (vSIMDType value, uint8_t* a) noexcept { vst1q_u8 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u8 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u8 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u8 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u8 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u8 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u8 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u8 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u8 ((uint8_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u8 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u8 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u8 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u8 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u8 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u8 (a, b, c); }
-    static forcedinline uint8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
+    static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); }
+    static forcedinline vSIMDType load (const uint8_t* a) noexcept { return vld1q_u8 (a); }
+    static forcedinline void store (vSIMDType value, uint8_t* a) noexcept { vst1q_u8 (a, value); }
+    static forcedinline uint8_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, uint8_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u8 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u8 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u8 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u8 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u8 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u8 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u8 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u8 ((uint8_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u8 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u8 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u8 (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u8 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u8 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u8 (a, b, c); }
+    static forcedinline uint8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
 
 //==============================================================================
@@ -232,26 +240,28 @@ struct SIMDNativeOps<int16_t>
     DECLARE_NEON_SIMD_CONST (int16_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (int16_t s) noexcept { return vdupq_n_s16 (s); }
-    static forcedinline vSIMDType load (const int16_t* a) noexcept { return vld1q_s16 (a); }
-    static forcedinline void store (vSIMDType value, int16_t* a) noexcept { vst1q_s16 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s16 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s16 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s16 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s16 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s16 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s16 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s16 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s16 ((int16_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s16 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s16 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s16 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s16 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s16 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s16 (a, b, c); }
-    static forcedinline int16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
+    static forcedinline vSIMDType expand (int16_t s) noexcept { return vdupq_n_s16 (s); }
+    static forcedinline vSIMDType load (const int16_t* a) noexcept { return vld1q_s16 (a); }
+    static forcedinline void store (vSIMDType value, int16_t* a) noexcept { vst1q_s16 (a, value); }
+    static forcedinline int16_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, int16_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s16 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s16 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s16 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s16 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s16 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s16 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s16 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s16 ((int16_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s16 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s16 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s16 (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s16 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s16 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s16 (a, b, c); }
+    static forcedinline int16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
@@ -271,26 +281,28 @@ struct SIMDNativeOps<uint16_t>
     DECLARE_NEON_SIMD_CONST (uint16_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); }
-    static forcedinline vSIMDType load (const uint16_t* a) noexcept { return vld1q_u16 (a); }
-    static forcedinline void store (vSIMDType value, uint16_t* a) noexcept { vst1q_u16 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u16 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u16 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u16 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u16 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u16 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u16 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u16 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u16 ((uint16_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u16 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u16 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u16 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u16 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u16 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u16 (a, b, c); }
-    static forcedinline uint16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
+    static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); }
+    static forcedinline vSIMDType load (const uint16_t* a) noexcept { return vld1q_u16 (a); }
+    static forcedinline void store (vSIMDType value, uint16_t* a) noexcept { vst1q_u16 (a, value); }
+    static forcedinline uint16_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, uint16_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u16 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u16 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u16 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u16 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u16 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u16 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u16 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u16 ((uint16_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u16 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u16 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u16 (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u16 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u16 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u16 (a, b, c); }
+    static forcedinline uint16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
 
 //==============================================================================
@@ -309,26 +321,28 @@ struct SIMDNativeOps<int64_t>
     DECLARE_NEON_SIMD_CONST (int64_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (int64_t s) noexcept { return vdupq_n_s64 (s); }
-    static forcedinline vSIMDType load (const int64_t* a) noexcept { return vld1q_s64 (a); }
-    static forcedinline void store (vSIMDType value, int64_t* a) noexcept { vst1q_s64 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s64 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s64 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s64 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s64 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s64 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s64 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s64 ((int64_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return fb::min (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return fb::max (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return fb::equal (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return fb::notEqual (a, b); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThan (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThanOrEqual (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); }
-    static forcedinline int64_t sum (vSIMDType a) noexcept { return fb::sum (a); }
+    static forcedinline vSIMDType expand (int64_t s) noexcept { return vdupq_n_s64 (s); }
+    static forcedinline vSIMDType load (const int64_t* a) noexcept { return vld1q_s64 (a); }
+    static forcedinline void store (vSIMDType value, int64_t* a) noexcept { vst1q_s64 (a, value); }
+    static forcedinline int64_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, int64_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s64 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s64 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s64 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s64 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s64 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s64 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s64 ((int64_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return fb::min (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return fb::max (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return fb::equal (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return fb::notEqual (a, b); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThan (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThanOrEqual (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); }
+    static forcedinline int64_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
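NEON has no 64-bit integer min/max and (on ARMv7) no 64-bit compares, which is why those operations delegate to the fallback; the fallback builds an all-ones/all-zeros mask from the comparison and blends with it. The branch-free pattern per lane, as a scalar sketch (the helper name is illustrative):

    #include <cstdint>

    static int64_t selectMin (int64_t a, int64_t b)
    {
        const int64_t lt = (b > a) ? ~int64_t (0) : int64_t (0);  // mask: all ones when a < b
        return (lt & a) | (~lt & b);                              // pick a or b without branching
    }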
vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u64 (a, b); } - static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u64 (b, a); } - static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u64 ((uint64_t*) kAllBitsSet)); } - static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return fb::min (a, b); } - static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return fb::max (a, b); } - static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return fb::equal (a, b); } - static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return fb::notEqual (a, b); } - static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThan (a, b); } - static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThanOrEqual (a, b); } - static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); } + static forcedinline vSIMDType expand (uint64_t s) noexcept { return vdupq_n_u64 (s); } + static forcedinline vSIMDType load (const uint64_t* a) noexcept { return vld1q_u64 (a); } + static forcedinline void store (vSIMDType value, uint64_t* a) noexcept { vst1q_u64 (a, value); } + static forcedinline uint64_t get (vSIMDType v, size_t i) noexcept { return v[i]; } + static forcedinline vSIMDType set (vSIMDType v, size_t i, uint64_t s) noexcept { v[i] = s; return v; } + static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u64 (a, b); } + static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u64 (a, b); } + static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); } + static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u64 (a, b); } + static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u64 (a, b); } + static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u64 (a, b); } + static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u64 (b, a); } + static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u64 ((uint64_t*) kAllBitsSet)); } + static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return fb::min (a, b); } + static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return fb::max (a, b); } + static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return fb::equal (a, b); } + static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return fb::notEqual (a, b); } + static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThan (a, b); } + static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThanOrEqual (a, b); } + static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); } static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); } static forcedinline uint64_t sum (vSIMDType a) noexcept { return fb::sum (a); } }; @@ -389,29 +405,31 @@ struct SIMDNativeOps DECLARE_NEON_SIMD_CONST (float, kOne); 
@@ -389,29 +405,31 @@ struct SIMDNativeOps<float>
     DECLARE_NEON_SIMD_CONST (float, kOne);

     //==============================================================================
-    static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); }
-    static forcedinline vSIMDType load (const float* a) noexcept { return vld1q_f32 (a); }
-    static forcedinline void store (vSIMDType value, float* a) noexcept { vst1q_f32 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_f32 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_f32 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_f32 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_f32 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_f32 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps<uint32_t>::sum ((SIMDNativeOps<uint32_t>::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_f32 (a, b, c); }
-    static forcedinline vSIMDType dupeven (vSIMDType a) noexcept { return fb::shuffle<(0 << 0) | (0 << 2) | (2 << 4) | (2 << 6)> (a); }
-    static forcedinline vSIMDType dupodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (1 << 2) | (3 << 4) | (3 << 6)> (a); }
-    static forcedinline vSIMDType swapevenodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (0 << 2) | (3 << 4) | (2 << 6)> (a); }
-    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return add (fb::shuffle<(2 << 0) | (3 << 2) | (0 << 4) | (1 << 6)> (a), a); }
+    static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); }
+    static forcedinline vSIMDType load (const float* a) noexcept { return vld1q_f32 (a); }
+    static forcedinline float get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, float s) noexcept { v[i] = s; return v; }
+    static forcedinline void store (vSIMDType value, float* a) noexcept { vst1q_f32 (a, value); }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_f32 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_f32 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_f32 (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_f32 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_f32 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps<uint32_t>::sum ((SIMDNativeOps<uint32_t>::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_f32 (a, b, c); }
+    static forcedinline vSIMDType dupeven (vSIMDType a) noexcept { return fb::shuffle<(0 << 0) | (0 << 2) | (2 << 4) | (2 << 6)> (a); }
+    static forcedinline vSIMDType dupodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (1 << 2) | (3 << 4) | (3 << 6)> (a); }
+    static forcedinline vSIMDType swapevenodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (0 << 2) | (3 << 4) | (2 << 6)> (a); }
+    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return add (fb::shuffle<(2 << 0) | (3 << 2) | (0 << 4) | (1 << 6)> (a), a); }

     //==============================================================================
     static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept
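The dupeven/dupodd/swapevenodd helpers above exist so that cmplxmul can multiply interleaved (re, im) lanes using the identity (a + bi)(c + di) = (ac - bd) + (ad + bc)i: dupeven (y) broadcasts the real parts, dupodd (y) the imaginary parts, and swapevenodd (x) swaps each pair. A scalar reference model of that decomposition (illustrative only, not the patch's vector code):

    // Scalar model of the shuffle-based complex multiply on one (re, im) pair.
    #include <complex>

    inline std::complex<float> referenceCmplxmul (std::complex<float> x, std::complex<float> y)
    {
        // even lane: a*c - b*d,  odd lane: a*d + b*c
        return { x.real() * y.real() - x.imag() * y.imag(),
                 x.real() * y.imag() + x.imag() * y.real() };
    }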
@@ -438,15 +456,17 @@
 template <>
 struct SIMDNativeOps<double>
 {
     //==============================================================================
-    typedef struct { double values [2]; } vSIMDType;
+    typedef struct { double v[2]; } vSIMDType;
     typedef SIMDFallbackOps<double, vSIMDType> fb;

-    static forcedinline vSIMDType expand (double s) noexcept { return fb::expand (s); }
-    static forcedinline vSIMDType load (const double* a) noexcept { return fb::load (a); }
-    static forcedinline void store (vSIMDType value, double* a) noexcept { fb::store (value, a); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return fb::add (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return fb::sub (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
+    static forcedinline vSIMDType expand (double s) noexcept { return {{s, s}}; }
+    static forcedinline vSIMDType load (const double* a) noexcept { return {{a[0], a[1]}}; }
+    static forcedinline void store (vSIMDType v, double* a) noexcept { a[0] = v.v[0]; a[1] = v.v[1]; }
+    static forcedinline double get (vSIMDType v, size_t i) noexcept { return v.v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, double s) noexcept { v.v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return {{a.v[0] + b.v[0], a.v[1] + b.v[1]}}; }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return {{a.v[0] - b.v[0], a.v[1] - b.v[1]}}; }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return {{a.v[0] * b.v[0], a.v[1] * b.v[1]}}; }
     static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return fb::bit_and (a, b); }
     static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return fb::bit_or (a, b); }
     static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return fb::bit_xor (a, b); }
diff --git a/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h b/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
index f8ee84371b..dd80cd00db 100644
--- a/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
@@ -95,6 +95,8 @@ struct SIMDNativeOps<float>
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); }
+    static forcedinline float JUCE_VECTOR_CALLTYPE get (__m128 v, size_t i) noexcept { return SIMDFallbackOps<float, __m128>::get (v, i); }
+    static forcedinline __m128 JUCE_VECTOR_CALLTYPE set (__m128 v, size_t i, float s) noexcept { return SIMDFallbackOps<float, __m128>::set (v, i, s); }

     //==============================================================================
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
@@ -114,7 +116,7 @@ struct SIMDNativeOps<float>
         __m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
         retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
        #endif
-        return ((float*) &retval) [0];
+        return _mm_cvtss_f32 (retval);
     }
 };
@@ -135,8 +137,8 @@ struct SIMDNativeOps<double>
     DECLARE_SSE_SIMD_CONST (double, kOne);

     //==============================================================================
-    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
-    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
+    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return load (a); }
+    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return _mm_castsi128_pd (_mm_load_si128 ((const __m128i*) a)); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
     static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
@@ -159,7 +161,9 @@ struct SIMDNativeOps<double>
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); }
-    static forcedinline __m128d oddevensum (__m128d a) noexcept { return a; }
+    static forcedinline __m128d JUCE_VECTOR_CALLTYPE oddevensum (__m128d a) noexcept { return a; }
+    static forcedinline double JUCE_VECTOR_CALLTYPE get (__m128d v, size_t i) noexcept { return SIMDFallbackOps<double, __m128d>::get (v, i); }
+    static forcedinline __m128d JUCE_VECTOR_CALLTYPE set (__m128d v, size_t i, double s) noexcept { return SIMDFallbackOps<double, __m128d>::set (v, i, s); }

     //==============================================================================
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
@@ -178,7 +182,7 @@ struct SIMDNativeOps<double>
        #else
         __m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);
        #endif
-        return ((double*) &retval) [0];
+        return _mm_cvtsd_f64 (retval);
     }
 };
@@ -196,7 +200,9 @@ struct SIMDNativeOps<int8_t>
     //==============================================================================
     DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int8_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
@@ -218,20 +224,10 @@ struct SIMDNativeOps<int8_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline int8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int8_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int8_t s) noexcept { return SIMDFallbackOps<int8_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const char*> (a);
-        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
-                             b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int8_t* dest) noexcept
-    {
-        SIMDFallbackOps<int8_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
@@ -244,18 +240,9 @@ struct SIMDNativeOps<int8_t>
             hi = _mm_hadd_epi16 (hi, hi);
         }

-        const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
-        const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);
-
-        return lo_ptr[0] + hi_ptr[0];
+        return static_cast<int8_t> ((_mm_cvtsi128_si32 (lo) & 0xff) + (_mm_cvtsi128_si32 (hi) & 0xff));
       #else
-        int8_t sum = 0;
-        const int8_t* src = reinterpret_cast<const int8_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int8_t)); ++i)
-            sum += src [i];
-
-        return sum;
+        return SIMDFallbackOps<int8_t, __m128i>::sum (a);
       #endif
     }
@@ -285,8 +272,10 @@ struct SIMDNativeOps<uint8_t>
     DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
     DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return load (a); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint8_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
@@ -303,20 +292,10 @@ struct SIMDNativeOps<uint8_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint8_t s) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const char*> (a);
-        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
-                             b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint8_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint8_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
@@ -329,18 +308,10 @@ struct SIMDNativeOps<uint8_t>
             hi = _mm_hadd_epi16 (hi, hi);
         }

-        const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
-        const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);
-
-        return lo_ptr[0] + hi_ptr[0];
+        return static_cast<uint8_t> ((static_cast<uint32_t> (_mm_cvtsi128_si32 (lo)) & 0xffu)
+                                   + (static_cast<uint32_t> (_mm_cvtsi128_si32 (hi)) & 0xffu));
       #else
-        uint8_t sum = 0;
-        const uint8_t* src = reinterpret_cast<const uint8_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int8_t)); ++i)
-            sum += src [i];
-
-        return sum;
+        return SIMDFallbackOps<uint8_t, __m128i>::sum (a);
       #endif
     }
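A recurring fix in the hunks above: reading a lane by casting the register's address (e.g. ((float*) &retval)[0]) violates strict-aliasing rules, so the patch switches to the dedicated move intrinsics instead. A small sketch of the well-defined equivalents (illustrative only; lowLane is a hypothetical name):

    // Extracting lane 0 of an SSE register without aliasing it through a pointer.
    #include <emmintrin.h>

    inline float  lowLane (__m128  v) { return _mm_cvtss_f32 (v); }     // was ((float*)  &v)[0]
    inline double lowLane (__m128d v) { return _mm_cvtsd_f64 (v); }     // was ((double*) &v)[0]
    inline int    lowLane (__m128i v) { return _mm_cvtsi128_si32 (v); } // low 32 bits of the register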
@@ -370,7 +341,9 @@ struct SIMDNativeOps<int16_t>
     DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int16_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
@@ -388,34 +361,20 @@ struct SIMDNativeOps<int16_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int16_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int16_t s) noexcept { return SIMDFallbackOps<int16_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
-    {
-        return _mm_set_epi16 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int16_t* dest) noexcept
-    {
-        SIMDFallbackOps<int16_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
         __m128i tmp = _mm_hadd_epi16 (a, a);
         tmp = _mm_hadd_epi16 (tmp, tmp);
         tmp = _mm_hadd_epi16 (tmp, tmp);
-        return *reinterpret_cast<int16_t*> (&tmp);
+        return static_cast<int16_t> (_mm_cvtsi128_si32 (tmp) & 0xffff);
       #else
-        int16_t sum = 0;
-        const int16_t* src = reinterpret_cast<const int16_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int16_t)); ++i)
-            sum += src [i];
-
-        return sum;
+        return SIMDFallbackOps<int16_t, __m128i>::sum (a);
       #endif
     }
 };
@@ -436,58 +395,46 @@ struct SIMDNativeOps<uint16_t>
     DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint16_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    #if defined(__SSE4__)
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
    #else
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    #endif
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint16_t s) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const int16_t*> (a);
-        return _mm_set_epi16 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint16_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint16_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
         __m128i tmp = _mm_hadd_epi16 (a, a);
         tmp = _mm_hadd_epi16 (tmp, tmp);
         tmp = _mm_hadd_epi16 (tmp, tmp);
-        return *reinterpret_cast<uint16_t*> (&tmp);
-       #else
-        uint16_t sum = 0;
-        const uint16_t* src = reinterpret_cast<const uint16_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(uint16_t)); ++i)
-            sum += src [i];
-        return sum;
+        return static_cast<uint16_t> (static_cast<uint32_t> (_mm_cvtsi128_si32 (tmp)) & 0xffffu);
+       #else
+        return SIMDFallbackOps<uint16_t, __m128i>::sum (a);
       #endif
     }
 };
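The SSSE3 paths above fold every lane into lane 0 with repeated horizontal adds, then mask off the low bits. A self-contained sketch of the same idiom for eight int16 lanes (assumes SSSE3; horizontalSum16 is a hypothetical name):

    // Three _mm_hadd_epi16 rounds: 8 lanes -> 4 -> 2 -> 1 partial sum,
    // then the total is read from the low 16 bits of lane 0.
    #include <tmmintrin.h>  // SSSE3
    #include <cstdint>

    inline int16_t horizontalSum16 (__m128i v)
    {
        __m128i t = _mm_hadd_epi16 (v, v);
        t = _mm_hadd_epi16 (t, t);
        t = _mm_hadd_epi16 (t, t);
        return static_cast<int16_t> (_mm_cvtsi128_si32 (t) & 0xffff);
    }
    // e.g. horizontalSum16 (_mm_set1_epi16 (3)) == 24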
@@ -507,9 +454,10 @@ struct SIMDNativeOps<int32_t>
     DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int32_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_set_epi32 (a[3], a[2], a[1], a[0]); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
@@ -523,27 +471,17 @@ struct SIMDNativeOps<int32_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int32_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int32_t s) noexcept { return SIMDFallbackOps<int32_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept
-    {
-        SIMDFallbackOps<int32_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
         __m128i tmp = _mm_hadd_epi32 (a, a);
-        tmp = _mm_hadd_epi32 (tmp, tmp);
-        return *reinterpret_cast<int32_t*> (&tmp);
+        return _mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp));
       #else
-        int32_t sum = 0;
-        const int32_t* src = reinterpret_cast<const int32_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int32_t)); ++i)
-            sum += src [i];
-
-        return sum;
+        return SIMDFallbackOps<int32_t, __m128i>::sum (a);
       #endif
     }
@@ -596,49 +534,35 @@ struct SIMDNativeOps<uint32_t>
     DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint32_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint32_t s) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const int32_t*> (a);
-        return _mm_set_epi32 (b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint32_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint32_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
         __m128i tmp = _mm_hadd_epi32 (a, a);
-        tmp = _mm_hadd_epi32 (tmp, tmp);
-        return *reinterpret_cast<uint32_t*> (&tmp);
+        return static_cast<uint32_t> (_mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp)));
       #else
-        uint32_t sum = 0;
-        const uint32_t* src = reinterpret_cast<const uint32_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(uint32_t)); ++i)
-            sum += src [i];
-
-        return sum;
+        return SIMDFallbackOps<uint32_t, __m128i>::sum (a);
       #endif
     }
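SSE2 has no 64-bit lane multiply (a packed _mm_mullo_epi64 only arrives much later, with AVX-512), which is why mul for the 64-bit specialisations below delegates to SIMDFallbackOps. A hedged sketch of what such a scalar fallback looks like (the real helper may be organised differently; mul64Fallback is a hypothetical name):

    // Hypothetical scalar fallback for the missing 64-bit SSE multiply:
    // spill both registers, multiply per lane, reload the result.
    #include <emmintrin.h>
    #include <cstdint>

    inline __m128i mul64Fallback (__m128i a, __m128i b)
    {
        alignas (16) int64_t x[2], y[2];
        _mm_store_si128 (reinterpret_cast<__m128i*> (x), a);
        _mm_store_si128 (reinterpret_cast<__m128i*> (y), b);
        x[0] *= y[0];
        x[1] *= y[1];
        return _mm_load_si128 (reinterpret_cast<const __m128i*> (x));
    }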
@@ -689,16 +613,10 @@ struct SIMDNativeOps<int64_t>
     //==============================================================================
     DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
-    {
-        __m128i retval;
-        int64_t* ptr = reinterpret_cast<int64_t*> (&retval);
-        ptr[0] = ptr[1] = s;
-        return retval;
-    }
-
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_set_epi64x (a[1], a[0]); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { return _mm_set1_epi64x (s); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int64_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
@@ -708,36 +626,14 @@ struct SIMDNativeOps<int64_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
-    static forcedinline __m128i greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
-
-    //==============================================================================
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept
-    {
-        SIMDFallbackOps<int64_t, __m128i>::store (value, dest);
-    }
-
-    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
-    {
-        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
-        return ptr[0] + ptr[1];
-    }
-
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
-    {
-        __m128i retval;
-
-        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
-        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
-        int64_t* dst = reinterpret_cast<int64_t*> (&retval);
-
-        dst[0] = aptr[0] * bptr[0];
-        dst[1] = aptr[1] * bptr[1];
-
-        return retval;
-    }
+    static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int64_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int64_t s) noexcept { return SIMDFallbackOps<int64_t, __m128i>::set (v, i, s); }
+    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<int64_t, __m128i>::sum (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<int64_t, __m128i>::mul (a, b); }

     static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
     {
@@ -752,19 +648,10 @@ struct SIMDNativeOps<int64_t>

     static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
     {
-       #if defined(__SSE4_1__) && !defined(__clang__)
+       #if defined(__SSE4_1__)
         return _mm_cmpgt_epi64 (a, b);
       #else
-        __m128i retval;
-
-        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
-        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
-        int64_t* dst = reinterpret_cast<int64_t*> (&retval);
-
-        dst[0] = aptr[0] > bptr[0] ? -1LL : 0;
-        dst[1] = aptr[1] > bptr[1] ? -1LL : 0;
-
-        return retval;
+        return SIMDFallbackOps<int64_t, __m128i>::greaterThan (a, b);
       #endif
     }
 };
@@ -784,61 +671,28 @@ struct SIMDNativeOps<uint64_t>
     DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
     DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
-    {
-        __m128i retval;
-        uint64_t* ptr = reinterpret_cast<uint64_t*> (&retval);
-        ptr[0] = ptr[1] = s;
-        return retval;
-    }
-
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
-
-    //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const int64_t*> (a);
-        return _mm_set_epi64x (b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint64_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint64_t, __m128i>::store (value, dest);
-    }
-
-    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
-    {
-        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
-        return ptr[0] + ptr[1];
-    }
-
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
-    {
-        __m128i retval;
-
-        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
-        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
-        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);
-
-        dst[0] = aptr[0] * bptr[0];
-        dst[1] = aptr[1] * bptr[1];
-
-        return retval;
-    }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { return _mm_set1_epi64x ((int64_t) s); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint64_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint64_t s) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::set (v, i, s); }
+    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::sum (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::mul (a, b); }

     static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
     {
@@ -853,19 +707,10 @@ struct SIMDNativeOps<uint64_t>

     static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
     {
-       #if defined(__SSE4_1__) && !defined(__clang__)
-        return _mm_cmpgt_epi64 (a, b);
+       #if defined(__SSE4_1__)
+        return _mm_cmpgt_epi64 (ssign (a), ssign (b));
       #else
-        __m128i retval;
-
-        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
-        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
-        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);
-
-        dst[0] = aptr[0] > bptr[0] ? (uint64_t) -1LL : 0;
-        dst[1] = aptr[1] > bptr[1] ? (uint64_t) -1LL : 0;
-
-        return retval;
+        return SIMDFallbackOps<uint64_t, __m128i>::greaterThan (a, b);
       #endif
     }
 };
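Note the correctness fix in the final hunk: _mm_cmpgt_epi64 performs a signed comparison, which is why the uint64_t version now runs both operands through ssign(), XOR-ing in kHighBit first. Flipping the top bit maps unsigned order onto signed order, as this scalar sketch shows (illustrative only; the function name is hypothetical):

    // Sign-bias trick behind ssign(): after XOR-ing the top bit, a signed
    // comparison orders the values exactly as an unsigned comparison would.
    #include <cstdint>

    inline bool unsignedGreaterViaSigned (uint64_t a, uint64_t b)
    {
        constexpr uint64_t highBit = 0x8000000000000000ull; // analogous to kHighBit
        return (int64_t) (a ^ highBit) > (int64_t) (b ^ highBit);
    }
    // Example: a = 0xffffffffffffffff, b = 1 gives true, even though as signed
    // values a would be -1 and a raw signed compare would return false.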