Various additions to SIMDRegister

8 years ago · 8f02179bbf
--- a/modules/juce_dsp/containers/juce_SIMDRegister.h
+++ b/modules/juce_dsp/containers/juce_SIMDRegister.h
@@ -102,6 +102,18 @@ struct SIMDRegister

    vSIMDType value;

    /** Default constructor. */
    inline JUCE_VECTOR_CALLTYPE SIMDRegister() noexcept {}

    /** Constructs an object from the native SIMD type. */
    inline JUCE_VECTOR_CALLTYPE SIMDRegister (vSIMDType a) noexcept : value (a) {}

    /** Constructs an object from a scalar type by broadcasting it to all elements. */
    inline JUCE_VECTOR_CALLTYPE SIMDRegister (Type s) noexcept  { *this = s; }

    /** Destrutor. */
    inline JUCE_VECTOR_CALLTYPE ~SIMDRegister() noexcept {}

    //==============================================================================
    /** Returns the number of elements in this vector. */
    static constexpr size_t size() noexcept    { return SIMDNumElements; }
@@ -232,6 +244,19 @@ struct SIMDRegister
    /** Returns a vector where each element is the bit-xor'd value of the corresponding element in the receiver and the scalar s.*/
    inline SIMDRegister JUCE_VECTOR_CALLTYPE operator^ (MaskType s) const noexcept      { return { NativeOps::bit_xor (value, toVecType (s)) }; }

    //==============================================================================
    /** Returns true if all elements-wise comparisons return true. */
    inline bool JUCE_VECTOR_CALLTYPE operator== (SIMDRegister other) const noexcept    { return  NativeOps::allEqual (value, other.value); }

    /** Returns true if any elements-wise comparisons return false. */
    inline bool JUCE_VECTOR_CALLTYPE operator!= (SIMDRegister other) const noexcept    { return ! (*this == other); }

    /** Returns true if all elements are equal to the scalar. */
    inline bool JUCE_VECTOR_CALLTYPE operator== (Type s) const noexcept                { return *this == SIMDRegister::expand (s); }

    /** Returns true if any elements are not equal to the scalar. */
    inline bool JUCE_VECTOR_CALLTYPE operator!= (Type s) const noexcept                { return ! (*this == s); }

    //==============================================================================
    /** Returns a SIMDRegister of the corresponding integral type where each element has each bit set
        if the corresponding element of a is equal to the corresponding element of b, or zero otherwise.
--- a/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp
+++ b/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp
@@ -370,8 +370,10 @@ public:
                {
                    type array_a [SIMDRegister<type>::SIMDNumElements];

                    union
                    union ConversionUnion
                    {
                        inline ConversionUnion() {}
                        inline ~ConversionUnion() {}
                        SIMDRegister<type> floatVersion;
                        vMaskType intVersion;
                    } a, b;
@@ -512,6 +514,39 @@ public:
                u.expect (vecEqualToArray (le,  array_le ));
                u.expect (vecEqualToArray (gt,  array_gt ));
                u.expect (vecEqualToArray (ge,  array_ge ));

                do
                {
                    SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister<type>::SIMDNumElements, random);
                    SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister<type>::SIMDNumElements, random);
                } while (std::equal (array_a, array_a + SIMDRegister<type>::SIMDNumElements, array_b));

                copy (a, array_a);
                copy (b, array_b);
                u.expect (a != b);
                u.expect (b != a);
                u.expect (! (a == b));
                u.expect (! (b == a));

                SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister<type>::SIMDNumElements, random);
                copy (a, array_a);
                copy (b, array_a);

                u.expect (a == b);
                u.expect (b == a);
                u.expect (! (a != b));
                u.expect (! (b != a));

                auto scalar = a[0];
                a = SIMDRegister<type>::expand (scalar);

                u.expect (a == scalar);
                u.expect (! (a != scalar));

                scalar--;

                u.expect (a != scalar);
                u.expect (! (a == scalar));
            }
        }
    };
--- a/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h
@@ -82,6 +82,7 @@ struct SIMDNativeOps<float>
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE notEqual (__m256 a, __m256 b) noexcept               { return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThan (__m256 a, __m256 b) noexcept            { return _mm256_cmp_ps (a, b, _CMP_GT_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256 a, __m256 b) noexcept     { return _mm256_cmp_ps (a, b, _CMP_GE_OQ); }
    static forcedinline bool   JUCE_VECTOR_CALLTYPE allEqual (__m256 a, __m256 b) noexcept               { return (_mm256_movemask_ps (equal (a, b)) == 0xff); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE multiplyAdd (__m256 a, __m256 b, __m256 c) noexcept  { return _mm256_fmadd_ps (b, c, a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept                          { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept                           { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
@@ -141,6 +142,7 @@ struct SIMDNativeOps<double>
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept               { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept            { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept     { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept                { return (_mm256_movemask_pd (equal (a, b)) == 0xf); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept                           { return _mm256_shuffle_pd (a, a, 0); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept                            { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
@@ -261,6 +263,7 @@ struct SIMDNativeOps<uint8_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept                   { return _mm256_cmpeq_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept             { return _mm256_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }

@@ -336,6 +339,7 @@ struct SIMDNativeOps<int16_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
@@ -390,6 +394,7 @@ struct SIMDNativeOps<uint16_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
@@ -443,6 +448,7 @@ struct SIMDNativeOps<int32_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept
@@ -495,6 +501,7 @@ struct SIMDNativeOps<uint32_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
@@ -543,6 +550,7 @@ struct SIMDNativeOps<int64_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept
@@ -614,6 +622,7 @@ struct SIMDNativeOps<uint64_t>
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept                { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
--- a/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h
@@ -117,6 +117,19 @@ struct SIMDFallbackOps
        return retval;
    }

    //==============================================================================
    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept
    {
        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
        auto* bSrc = reinterpret_cast<const ScalarType*> (&b);

        for (size_t i = 0; i < n; ++i)
            if (aSrc[i] != bSrc[i])
                return false;

        return true;
    }

    //==============================================================================
    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept
    {
--- a/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
@@ -51,86 +51,82 @@ template <typename type>
 struct SIMDNativeOps;

 //==============================================================================
 /** Single-precision floating point NEON intrinsics. */
 /** Unsigned 32-bit integer NEON intrinsics. */
 template <>
 struct SIMDNativeOps<float>
 struct SIMDNativeOps<uint32_t>
 {
    //==============================================================================
    typedef float32x4_t vSIMDType;
    typedef uint32x4_t vMaskType;
    typedef SIMDFallbackOps<float, vSIMDType> fb;
    typedef uint32x4_t vSIMDType;
    typedef SIMDFallbackOps<uint32_t, vSIMDType> fb;

    //==============================================================================
    DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_NEON_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_NEON_SIMD_CONST (float, kOne);
    DECLARE_NEON_SIMD_CONST (uint32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline vSIMDType expand (float s) noexcept                         { return vdupq_n_f32 (s); }
    static forcedinline vSIMDType load (const float* a) noexcept                    { return vld1q_f32 (a); }
    static forcedinline void store (vSIMDType value, float* a) noexcept             { vst1q_f32 (a, value); }
    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept           { return vaddq_f32 (a, b); }
    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept           { return vsubq_f32 (a, b); }
    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept           { return vmulq_f32 (a, b); }
    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept       { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept       { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept       { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept    { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                    { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_f32 (a, b); }
    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_f32 (a, b); }
    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_f32 (a, b); }
    static forcedinline vSIMDType expand (uint32_t s) noexcept                   { return vdupq_n_u32 (s); }
    static forcedinline vSIMDType load (const uint32_t* a) noexcept              { return vld1q_u32 (a); }
    static forcedinline void store (vSIMDType value, uint32_t* a) noexcept       { vst1q_u32 (a, value); }
    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return vaddq_u32 (a, b); }
    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return vsubq_u32 (a, b); }
    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return vmulq_u32 (a, b); }
    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return vandq_u32 (a, b); }
    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return vorrq_u32  (a, b); }
    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return veorq_u32 (a, b); }
    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u32 (b, a); }
    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return bit_notand (a, vld1q_u32 ((uint32_t*) kAllBitsSet)); }
    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_u32 (a, b); }
    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_u32 (a, b); }
    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_u32 (a, b); }
    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (sum (notEqual (a, b)) == 0); }
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_f32 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_f32 (a, b); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_f32 (a, b, c); }
    static forcedinline vSIMDType dupeven (vSIMDType a) noexcept { return fb::shuffle<(0 << 0) | (0 << 2) | (2 << 4) | (2 << 6)>     (a); }
    static forcedinline vSIMDType dupodd  (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (1 << 2) | (3 << 4) | (3 << 6)>     (a); }
    static forcedinline vSIMDType swapevenodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (0 << 2) | (3 << 4) | (2 << 6)> (a); }
    static forcedinline float sum (vSIMDType a) noexcept { return fb::sum (a); }
    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return add (fb::shuffle<(2 << 0) | (3 << 2) | (0 << 4) | (1 << 6)> (a), a); }

    //==============================================================================
    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_u32 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_u32 (a, b); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_u32 (a, b, c); }
    static forcedinline uint32_t sum (vSIMDType a) noexcept
    {
        vSIMDType rr_ir = mul (a, dupeven (b));
        vSIMDType ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vld1q_f32 ((float*) kEvenHighBit)));
        auto rr = vadd_u32 (vget_high_u32 (a), vget_low_u32 (a));
        return vget_lane_u32 (vpadd_u32 (rr, rr), 0);
    }
 };

 //==============================================================================
 /** Double-precision floating point NEON intrinsics does not exist in NEON
    so we need to emulate this.
 */
 /** Signed 32-bit integer NEON intrinsics. */
 template <>
 struct SIMDNativeOps<double>
 struct SIMDNativeOps<int32_t>
 {
    //==============================================================================
    typedef struct { double values [2]; } vSIMDType;
    typedef SIMDFallbackOps<double, vSIMDType> fb;
    typedef int32x4_t vSIMDType;
    typedef SIMDFallbackOps<int32_t, vSIMDType> fb;

    static forcedinline vSIMDType expand (double s) noexcept                     { return fb::expand (s); }
    static forcedinline vSIMDType load (const double* a) noexcept                { return fb::load (a); }
    static forcedinline void store (vSIMDType value, double* a) noexcept         { fb::store (value, a); }
    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return fb::add (a, b); }
    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return fb::sub (a, b); }
    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return fb::mul (a, b); }
    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_and (a, b); }
    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_or  (a, b); }
    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_xor (a, b); }
    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return fb::bit_notand (a, b); }
    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return fb::bit_not (a); }
    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return fb::min (a, b); }
    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return fb::max (a, b); }
    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return fb::equal (a, b); }
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return fb::notEqual (a, b); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return fb::greaterThan (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return fb::greaterThanOrEqual (a, b); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); }
    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept { return fb::cmplxmul (a, b); }
    static forcedinline double sum (vSIMDType a) noexcept { return fb::sum (a); }
    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return a; }
    //==============================================================================
    DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline vSIMDType expand (int32_t s) noexcept                    { return vdupq_n_s32 (s); }
    static forcedinline vSIMDType load (const int32_t* a) noexcept               { return vld1q_s32 (a); }
    static forcedinline void store (vSIMDType value, int32_t* a) noexcept        { vst1q_s32 (a, value); }
    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return vaddq_s32 (a, b); }
    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return vsubq_s32 (a, b); }
    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return vmulq_s32 (a, b); }
    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return vandq_s32 (a, b); }
    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return vorrq_s32 (a, b); }
    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return veorq_s32 (a, b); }
    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s32 (b, a); }
    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return bit_notand (a, vld1q_s32 ((int32_t*) kAllBitsSet)); }
    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_s32 (a, b); }
    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_s32 (a, b); }
    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_s32 (a, b); }
    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (sum (notEqual (a, b)) == 0); }
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_s32 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_s32 (a, b); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_s32 (a, b, c); }
    static forcedinline int32_t sum (vSIMDType a) noexcept
    {
        auto rr = vadd_s32 (vget_high_s32 (a), vget_low_s32 (a));
        rr = vpadd_s32 (rr, rr);
        return vget_lane_s32 (rr, 0);
    }
 };

 //==============================================================================
@@ -163,6 +159,7 @@ struct SIMDNativeOps<int8_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_s8 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_s8 (a, b); }
    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<int32_t>::sum (notEqual (a, b)) == 0); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_s8 (a, b, c); }
    static forcedinline int8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
@@ -197,6 +194,7 @@ struct SIMDNativeOps<uint8_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_u8 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_u8 (a, b); }
    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<uint32_t>::sum (notEqual (a, b)) == 0); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_u8 (a, b, c); }
    static forcedinline uint8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
@@ -231,6 +229,7 @@ struct SIMDNativeOps<int16_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_s16 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_s16 (a, b); }
    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<int32_t>::sum (notEqual (a, b)) == 0); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_s16 (a, b, c); }
    static forcedinline int16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
@@ -266,79 +265,11 @@ struct SIMDNativeOps<uint16_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_u16 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_u16 (a, b); }
    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<uint32_t>::sum (notEqual (a, b)) == 0); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_u16 (a, b, c); }
    static forcedinline uint16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };

 //==============================================================================
 /** Signed 32-bit integer NEON intrinsics. */
 template <>
 struct SIMDNativeOps<int32_t>
 {
    //==============================================================================
    typedef int32x4_t vSIMDType;
    typedef SIMDFallbackOps<int32_t, vSIMDType> fb;

    //==============================================================================
    DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline vSIMDType expand (int32_t s) noexcept                    { return vdupq_n_s32 (s); }
    static forcedinline vSIMDType load (const int32_t* a) noexcept               { return vld1q_s32 (a); }
    static forcedinline void store (vSIMDType value, int32_t* a) noexcept        { vst1q_s32 (a, value); }
    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return vaddq_s32 (a, b); }
    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return vsubq_s32 (a, b); }
    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return vmulq_s32 (a, b); }
    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return vandq_s32 (a, b); }
    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return vorrq_s32 (a, b); }
    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return veorq_s32 (a, b); }
    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s32 (b, a); }
    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return bit_notand (a, vld1q_s32 ((int32_t*) kAllBitsSet)); }
    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_s32 (a, b); }
    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_s32 (a, b); }
    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_s32 (a, b); }
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_s32 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_s32 (a, b); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_s32 (a, b, c); }
    static forcedinline int32_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };


 //==============================================================================
 /** Unsigned 32-bit integer NEON intrinsics. */
 template <>
 struct SIMDNativeOps<uint32_t>
 {
    //==============================================================================
    typedef uint32x4_t vSIMDType;
    typedef SIMDFallbackOps<uint32_t, vSIMDType> fb;

    //==============================================================================
    DECLARE_NEON_SIMD_CONST (uint32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline vSIMDType expand (uint32_t s) noexcept                   { return vdupq_n_u32 (s); }
    static forcedinline vSIMDType load (const uint32_t* a) noexcept              { return vld1q_u32 (a); }
    static forcedinline void store (vSIMDType value, uint32_t* a) noexcept       { vst1q_u32 (a, value); }
    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return vaddq_u32 (a, b); }
    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return vsubq_u32 (a, b); }
    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return vmulq_u32 (a, b); }
    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return vandq_u32 (a, b); }
    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return vorrq_u32  (a, b); }
    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return veorq_u32 (a, b); }
    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u32 (b, a); }
    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return bit_notand (a, vld1q_u32 ((uint32_t*) kAllBitsSet)); }
    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_u32 (a, b); }
    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_u32 (a, b); }
    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_u32 (a, b); }
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_u32 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_u32 (a, b); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_u32 (a, b, c); }
    static forcedinline uint32_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };

 //==============================================================================
 /** Signed 64-bit integer NEON intrinsics. */
 template <>
@@ -369,6 +300,7 @@ struct SIMDNativeOps<int64_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return fb::notEqual (a, b); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return fb::greaterThan (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return fb::greaterThanOrEqual (a, b); }
    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<int32_t>::sum (notEqual (a, b)) == 0); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return fb::multiplyAdd (a, b, c); }
    static forcedinline int64_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
@@ -404,10 +336,101 @@ struct SIMDNativeOps<uint64_t>
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return fb::notEqual (a, b); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return fb::greaterThan (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return fb::greaterThanOrEqual (a, b); }
    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<uint32_t>::sum (notEqual (a, b)) == 0); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return fb::multiplyAdd (a, b, c); }
    static forcedinline uint64_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };

    //==============================================================================
 /** Single-precision floating point NEON intrinsics. */
 template <>
 struct SIMDNativeOps<float>
 {
    //==============================================================================
    typedef float32x4_t vSIMDType;
    typedef uint32x4_t vMaskType;
    typedef SIMDFallbackOps<float, vSIMDType> fb;

    //==============================================================================
    DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_NEON_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_NEON_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline vSIMDType expand (float s) noexcept                          { return vdupq_n_f32 (s); }
    static forcedinline vSIMDType load (const float* a) noexcept                     { return vld1q_f32 (a); }
    static forcedinline void store (vSIMDType value, float* a) noexcept              { vst1q_f32 (a, value); }
    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return vaddq_f32 (a, b); }
    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return vsubq_f32 (a, b); }
    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return vmulq_f32 (a, b); }
    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                   { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return vminq_f32 (a, b); }
    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return vmaxq_f32 (a, b); }
    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return (vSIMDType) vceqq_f32 (a, b); }
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return bit_not (equal (a, b)); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return (vSIMDType) vcgtq_f32 (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return (vSIMDType) vcgeq_f32 (a, b); }
    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return (SIMDNativeOps<uint32_t>::sum (notEqual (a, b)) == 0); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept  { return vmlaq_f32 (a, b, c); }
    static forcedinline vSIMDType dupeven (vSIMDType a) noexcept { return fb::shuffle<(0 << 0) | (0 << 2) | (2 << 4) | (2 << 6)>     (a); }
    static forcedinline vSIMDType dupodd  (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (1 << 2) | (3 << 4) | (3 << 6)>     (a); }
    static forcedinline vSIMDType swapevenodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (0 << 2) | (3 << 4) | (2 << 6)> (a); }
    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return add (fb::shuffle<(2 << 0) | (3 << 2) | (0 << 4) | (1 << 6)> (a), a); }

    //==============================================================================
    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept
    {
        vSIMDType rr_ir = mul (a, dupeven (b));
        vSIMDType ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vld1q_f32 ((float*) kEvenHighBit)));
    }

    static forcedinline float sum (vSIMDType a) noexcept
    {
        auto rr = vadd_f32 (vget_high_f32 (a), vget_low_f32 (a));
        return vget_lane_f32 (vpadd_f32 (rr, rr), 0);
    }
 };

 //==============================================================================
 /** Double-precision floating point NEON intrinsics does not exist in NEON
    so we need to emulate this.
 */
 template <>
 struct SIMDNativeOps<double>
 {
    //==============================================================================
    typedef struct { double values [2]; } vSIMDType;
    typedef SIMDFallbackOps<double, vSIMDType> fb;

    static forcedinline vSIMDType expand (double s) noexcept                     { return fb::expand (s); }
    static forcedinline vSIMDType load (const double* a) noexcept                { return fb::load (a); }
    static forcedinline void store (vSIMDType value, double* a) noexcept         { fb::store (value, a); }
    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept        { return fb::add (a, b); }
    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept        { return fb::sub (a, b); }
    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept        { return fb::mul (a, b); }
    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_and (a, b); }
    static forcedinline vSIMDType bit_or  (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_or  (a, b); }
    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept    { return fb::bit_xor (a, b); }
    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return fb::bit_notand (a, b); }
    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept                 { return fb::bit_not (a); }
    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept                    { return fb::min (a, b); }
    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept                    { return fb::max (a, b); }
    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept                  { return fb::equal (a, b); }
    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept               { return fb::notEqual (a, b); }
    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept            { return fb::greaterThan (a, b); }
    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept     { return fb::greaterThanOrEqual (a, b); }
    static forcedinline bool      allEqual (vSIMDType a, vSIMDType b) noexcept               { return fb::allEqual (a, b); }
    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); }
    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept { return fb::cmplxmul (a, b); }
    static forcedinline double sum (vSIMDType a) noexcept { return fb::sum (a); }
    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return a; }
 };

 #endif

 } // namespace dsp
--- a/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
@@ -81,6 +81,7 @@ struct SIMDNativeOps<float>
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept               { return _mm_cmpneq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept            { return _mm_cmpgt_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept     { return _mm_cmpge_ps (a, b); }
    static forcedinline bool   JUCE_VECTOR_CALLTYPE allEqual (__m128 a, __m128 b ) noexcept              { return (_mm_movemask_ps (equal (a, b)) == 0xf); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept  { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept                          { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept                           { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
@@ -142,6 +143,7 @@ struct SIMDNativeOps<double>
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept                { return _mm_cmpneq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept             { return _mm_cmpgt_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept      { return _mm_cmpge_pd (a, b); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128d a, __m128d b ) noexcept               { return (_mm_movemask_pd (equal (a, b)) == 0x3); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept  { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept                            { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept                             { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
@@ -201,6 +203,7 @@ struct SIMDNativeOps<int8_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
@@ -282,6 +285,7 @@ struct SIMDNativeOps<uint8_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
@@ -363,6 +367,7 @@ struct SIMDNativeOps<int16_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
@@ -431,6 +436,7 @@ struct SIMDNativeOps<uint16_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
@@ -490,6 +496,7 @@ struct SIMDNativeOps<int32_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept
@@ -575,6 +582,7 @@ struct SIMDNativeOps<uint32_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
@@ -671,6 +679,7 @@ struct SIMDNativeOps<int64_t>
    static forcedinline __m128i greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept
@@ -762,6 +771,7 @@ struct SIMDNativeOps<uint64_t>
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept      { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept  { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept                { return bit_not (equal (a, b)); }
    static forcedinline bool    JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept                { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
--- a/modules/juce_dsp/processors/juce_FIRFilter.h
+++ b/modules/juce_dsp/processors/juce_FIRFilter.h
@@ -172,7 +172,7 @@ namespace FIR
        static SampleType JUCE_VECTOR_CALLTYPE processSingleSample (SampleType sample, SampleType* buf,
                                                                    const NumericType* fir, size_t m, size_t& p) noexcept
        {
            SampleType out = {};
            SampleType out (0);

            buf[p] = sample;

--- a/modules/juce_dsp/processors/juce_FIRFilter_test.cpp
+++ b/modules/juce_dsp/processors/juce_FIRFilter_test.cpp
@@ -106,7 +106,7 @@ class FIRFilterTest : public UnitTest

            buffer[0] = input[i];

            SampleType sum{};
            SampleType sum (0);

            for (size_t j = 0; j < numCoefficients; ++j)
                sum += buffer[j] * firCoefficients[j];