diff --git a/modules/juce_dsp/containers/juce_SIMDRegister.h b/modules/juce_dsp/containers/juce_SIMDRegister.h
index ad9730894c..2e2b13a77d 100644
--- a/modules/juce_dsp/containers/juce_SIMDRegister.h
+++ b/modules/juce_dsp/containers/juce_SIMDRegister.h
@@ -95,6 +95,10 @@ struct SIMDRegister
         and scalar types (used internally). */
     typedef CmplxSIMDOps<ElementType> CmplxOps;
 
+    /** Type which is returned when using the subscript operator. The returned type
+        should be used just like the type ElementType. */
+    struct ElementAccess;
+
     //==============================================================================
     /** The size in bytes of this register. */
     static constexpr size_t SIMDRegisterSize = sizeof (vSIMDType);
@@ -146,18 +150,34 @@ struct SIMDRegister
     //==============================================================================
     /** Returns the idx-th element of the receiver. Note that this does not check if idx
        is larger than the native register size. */
-    inline ElementType JUCE_VECTOR_CALLTYPE operator[] (size_t idx) const noexcept
+    inline ElementType JUCE_VECTOR_CALLTYPE get (size_t idx) const noexcept
+    {
+        jassert (idx < SIMDNumElements);
+        return CmplxOps::get (value, idx);
+    }
+
+    /** Sets the idx-th element of the receiver. Note that this does not check if idx
+       is larger than the native register size. */
+    inline void JUCE_VECTOR_CALLTYPE set (size_t idx, ElementType v) noexcept
     {
         jassert (idx < SIMDNumElements);
-        return reinterpret_cast<const ElementType*> (&value) [idx];
+        value = CmplxOps::set (value, idx, v);
     }
 
+    //==============================================================================
     /** Returns the idx-th element of the receiver. Note that this does not check if idx
        is larger than the native register size. */
-    inline ElementType& JUCE_VECTOR_CALLTYPE operator[] (size_t idx) noexcept
+    inline ElementType JUCE_VECTOR_CALLTYPE operator[] (size_t idx) const noexcept
+    {
+        return get (idx);
+    }
+
+    /** Returns the idx-th element of the receiver. Note that this does not check if idx
+       is larger than the native register size. */
+    inline ElementAccess JUCE_VECTOR_CALLTYPE operator[] (size_t idx) noexcept
     {
         jassert (idx < SIMDNumElements);
-        return reinterpret_cast<ElementType*> (&value) [idx];
+        return ElementAccess (*this, idx);
     }
 
     //==============================================================================
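For orientation, a minimal usage sketch of the accessors this hunk introduces (illustrative only, not part of the patch): reads resolve through get(), and writes go through the ElementAccess proxy returned by the non-const operator[], so element access no longer type-puns through reinterpret_cast.

    // Sketch, assuming a float register (expand() and jassert are used elsewhere in this diff):
    auto r = juce::dsp::SIMDRegister<float>::expand (1.0f);
    r[0] = 42.0f;        // non-const operator[] -> ElementAccess::operator= -> r.set (0, 42.0f)
    float x = r[0];      // ElementAccess::operator Type() -> r.get (0)
    jassert (x == 42.0f);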
@@ -371,114 +391,9 @@ private:
     }
 };
 
-#ifndef DOXYGEN
-//==============================================================================
-/* This class is used internally by SIMDRegister to abstract away differences
-   in operations which are different for complex and pure floating point types. */
-
-// the pure floating-point version
-template <typename Scalar>
-struct CmplxSIMDOps
-{
-    typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const Scalar* a) noexcept
-    {
-        return SIMDNativeOps<Scalar>::load (a);
-    }
-
-    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, Scalar* dest) noexcept
-    {
-        SIMDNativeOps<Scalar>::store (value, dest);
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (Scalar s) noexcept
-    {
-        return SIMDNativeOps<Scalar>::expand (s);
-    }
-
-    static inline Scalar JUCE_VECTOR_CALLTYPE sum (vSIMDType a) noexcept
-    {
-        return SIMDNativeOps<Scalar>::sum (a);
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE mul (vSIMDType a, vSIMDType b) noexcept
-    {
-        return SIMDNativeOps<Scalar>::mul (a, b);
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE muladd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept
-    {
-        return SIMDNativeOps<Scalar>::multiplyAdd (a, b, c);
-    }
-};
-
-// The pure complex version
-template <typename Scalar>
-struct CmplxSIMDOps<std::complex<Scalar>>
-{
-    typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const std::complex<Scalar>* a) noexcept
-    {
-        return SIMDNativeOps<Scalar>::load (reinterpret_cast<const Scalar*> (a));
-    }
-
-    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, std::complex<Scalar>* dest) noexcept
-    {
-        SIMDNativeOps<Scalar>::store (value, reinterpret_cast<Scalar*> (dest));
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (std::complex<Scalar> s) noexcept
-    {
-        const int n = sizeof (vSIMDType) / sizeof (Scalar);
-
-        union
-        {
-            vSIMDType v;
-            Scalar floats[n];
-        } u;
-
-        for (int i = 0; i < n; ++i)
-            u.floats[i] = (i & 1) == 0 ? s.real() : s.imag();
-
-        return u.v;
-    }
-
-    static inline std::complex<Scalar> JUCE_VECTOR_CALLTYPE sum (vSIMDType a) noexcept
-    {
-        vSIMDType result = SIMDNativeOps<Scalar>::oddevensum (a);
-        auto* ptr = reinterpret_cast<const Scalar*> (&result);
-        return std::complex<Scalar> (ptr[0], ptr[1]);
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE mul (vSIMDType a, vSIMDType b) noexcept
-    {
-        return SIMDNativeOps<Scalar>::cmplxmul (a, b);
-    }
-
-    static inline vSIMDType JUCE_VECTOR_CALLTYPE muladd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept
-    {
-        return SIMDNativeOps<Scalar>::add (a, SIMDNativeOps<Scalar>::cmplxmul (b, c));
-    }
-};
-#endif
+} // namespace dsp
+} // namespace juce
 
-//==============================================================================
 #ifndef DOXYGEN
-    namespace util
-    {
-        template <typename Type>
-        inline void snapToZero (SIMDRegister<Type>&) noexcept {}
-    }
+ #include "juce_SIMDRegister_Impl.h"
 #endif
-
-} // namespace dsp
-
-// Extend some common used global functions to SIMDRegister types
-template <typename Type>
-inline dsp::SIMDRegister<Type> JUCE_VECTOR_CALLTYPE jmin (dsp::SIMDRegister<Type> a, dsp::SIMDRegister<Type> b) { return dsp::SIMDRegister<Type>::min (a, b); }
-template <typename Type>
-inline dsp::SIMDRegister<Type> JUCE_VECTOR_CALLTYPE jmax (dsp::SIMDRegister<Type> a, dsp::SIMDRegister<Type> b) { return dsp::SIMDRegister<Type>::max (a, b); }
-
-} // namespace juce
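The include added above pulls in the implementation header introduced by the next file. The operator[] machinery it implements is the classic element-proxy idiom; here is a self-contained sketch of that idiom, with illustrative names that are not JUCE API:

    // Generic element-proxy pattern (sketch only): reads convert, writes forward to set().
    template <typename Container, typename T>
    class ElementProxy
    {
    public:
        ElementProxy (Container& c, size_t i) noexcept : owner (c), index (i) {}

        operator T() const noexcept             { return owner.get (index); }            // read path
        ElementProxy& operator= (T v) noexcept  { owner.set (index, v); return *this; }  // write path

    private:
        Container& owner;
        size_t index;
    };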
diff --git a/modules/juce_dsp/containers/juce_SIMDRegister_Impl.h b/modules/juce_dsp/containers/juce_SIMDRegister_Impl.h
new file mode 100644
index 0000000000..6d11240e3d
--- /dev/null
+++ b/modules/juce_dsp/containers/juce_SIMDRegister_Impl.h
@@ -0,0 +1,176 @@
+/*
+  ==============================================================================
+
+   This file is part of the JUCE library.
+   Copyright (c) 2017 - ROLI Ltd.
+
+   JUCE is an open source library subject to commercial or open-source
+   licensing.
+
+   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
+   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
+   27th April 2017).
+
+   End User License Agreement: www.juce.com/juce-5-licence
+   Privacy Policy: www.juce.com/juce-5-privacy-policy
+
+   Or: You may also use this code under the terms of the GPL v3 (see
+   www.gnu.org/licenses).
+
+   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
+   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
+   DISCLAIMED.
+
+  ==============================================================================
+*/
+
+namespace juce
+{
+namespace dsp
+{
+
+
+//==============================================================================
+template <typename Type>
+struct SIMDRegister<Type>::ElementAccess
+{
+    operator Type() const                                { return simd.get (idx); }
+    ElementAccess& operator= (Type scalar) noexcept      { simd.set (idx, scalar); return *this; }
+    ElementAccess& operator= (ElementAccess& o) noexcept { return operator= ((Type) o); }
+
+private:
+    friend struct SIMDRegister;
+    ElementAccess (SIMDRegister& owner, size_t index) noexcept : simd (owner), idx (index) {}
+    SIMDRegister& simd;
+    size_t idx;
+};
+
+//==============================================================================
+/* This class is used internally by SIMDRegister to abstract away differences
+   in operations which are different for complex and pure floating point types. */
+
+// the pure floating-point version
+template <typename Scalar>
+struct CmplxSIMDOps
+{
+    typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const Scalar* a) noexcept
+    {
+        return SIMDNativeOps<Scalar>::load (a);
+    }
+
+    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, Scalar* dest) noexcept
+    {
+        SIMDNativeOps<Scalar>::store (value, dest);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (Scalar s) noexcept
+    {
+        return SIMDNativeOps<Scalar>::expand (s);
+    }
+
+    static inline Scalar JUCE_VECTOR_CALLTYPE get (vSIMDType v, std::size_t i) noexcept
+    {
+        return SIMDNativeOps<Scalar>::get (v, i);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE set (vSIMDType v, std::size_t i, Scalar s) noexcept
+    {
+        return SIMDNativeOps<Scalar>::set (v, i, s);
+    }
+
+    static inline Scalar JUCE_VECTOR_CALLTYPE sum (vSIMDType a) noexcept
+    {
+        return SIMDNativeOps<Scalar>::sum (a);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE mul (vSIMDType a, vSIMDType b) noexcept
+    {
+        return SIMDNativeOps<Scalar>::mul (a, b);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE muladd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept
+    {
+        return SIMDNativeOps<Scalar>::multiplyAdd (a, b, c);
+    }
+};
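The complex variant that follows is selected purely by partial specialisation: SIMDRegister<float> instantiates the primary template above, while SIMDRegister<std::complex<float>> picks the specialisation below. One consequence, sketched as a compile-time check (assumes <type_traits> is visible; not part of the patch):

    // Both element types resolve to the same native vector type: a complex
    // register packs re/im pairs into the scalar lanes of the same register.
    static_assert (std::is_same<CmplxSIMDOps<float>::vSIMDType,
                                CmplxSIMDOps<std::complex<float>>::vSIMDType>::value,
                   "complex registers reuse the scalar vector type");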
+
+// The pure complex version
+template <typename Scalar>
+struct CmplxSIMDOps<std::complex<Scalar>>
+{
+    typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const std::complex<Scalar>* a) noexcept
+    {
+        return SIMDNativeOps<Scalar>::load (reinterpret_cast<const Scalar*> (a));
+    }
+
+    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, std::complex<Scalar>* dest) noexcept
+    {
+        SIMDNativeOps<Scalar>::store (value, reinterpret_cast<Scalar*> (dest));
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (std::complex<Scalar> s) noexcept
+    {
+        const int n = sizeof (vSIMDType) / sizeof (Scalar);
+
+        union
+        {
+            vSIMDType v;
+            Scalar floats[n];
+        } u;
+
+        for (int i = 0; i < n; ++i)
+            u.floats[i] = (i & 1) == 0 ? s.real() : s.imag();
+
+        return u.v;
+    }
+
+    static inline std::complex<Scalar> JUCE_VECTOR_CALLTYPE get (vSIMDType v, std::size_t i) noexcept
+    {
+        auto j = i << 1;
+        return std::complex<Scalar> (SIMDNativeOps<Scalar>::get (v, j), SIMDNativeOps<Scalar>::get (v, j + 1));
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE set (vSIMDType v, std::size_t i, std::complex<Scalar> s) noexcept
+    {
+        auto j = i << 1;
+        return SIMDNativeOps<Scalar>::set (SIMDNativeOps<Scalar>::set (v, j, s.real()), j + 1, s.imag());
+    }
+
+    static inline std::complex<Scalar> JUCE_VECTOR_CALLTYPE sum (vSIMDType a) noexcept
+    {
+        vSIMDType result = SIMDNativeOps<Scalar>::oddevensum (a);
+        auto* ptr = reinterpret_cast<const Scalar*> (&result);
+        return std::complex<Scalar> (ptr[0], ptr[1]);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE mul (vSIMDType a, vSIMDType b) noexcept
+    {
+        return SIMDNativeOps<Scalar>::cmplxmul (a, b);
+    }
+
+    static inline vSIMDType JUCE_VECTOR_CALLTYPE muladd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept
+    {
+        return SIMDNativeOps<Scalar>::add (a, SIMDNativeOps<Scalar>::cmplxmul (b, c));
+    }
+};
+
+//==============================================================================
+ namespace util
+ {
+     template <typename Type>
+     inline void snapToZero (SIMDRegister<Type>&) noexcept {}
+ }
+
+} // namespace dsp
+
+// Extend some commonly used global functions to SIMDRegister types
+template <typename Type>
+inline dsp::SIMDRegister<Type> JUCE_VECTOR_CALLTYPE jmin (dsp::SIMDRegister<Type> a, dsp::SIMDRegister<Type> b) { return dsp::SIMDRegister<Type>::min (a, b); }
+template <typename Type>
+inline dsp::SIMDRegister<Type> JUCE_VECTOR_CALLTYPE jmax (dsp::SIMDRegister<Type> a, dsp::SIMDRegister<Type> b) { return dsp::SIMDRegister<Type>::max (a, b); }
+
+} // namespace juce
diff --git a/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp b/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp
index 5b13b63a21..8213aa3439 100644
--- a/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp
+++ b/modules/juce_dsp/containers/juce_SIMDRegister_test.cpp
@@ -31,37 +31,78 @@ namespace dsp
 namespace SIMDRegister_test_internal
 {
+    template <typename type, typename = void> struct RandomPrimitive {};
+
     template <typename type>
-    static void fillRandom (type* dst, const int size, Random& random)
+    struct RandomPrimitive<type, typename std::enable_if<std::is_floating_point<type>::value>::type>
     {
-        bool is_signed = std::is_signed<type>::value;
+        static type next (Random& random)
+        {
+            return static_cast<type> (std::is_signed<type>::value ? (random.nextFloat() * 16.0) - 8.0
+                                                                  : (random.nextFloat() * 8.0));
+
+        }
+    };
 
-        for (int i = 0; i < size; ++i)
+    template <typename type>
+    struct RandomPrimitive<type, typename std::enable_if<std::is_integral<type>::value>::type>
+    {
+        static type next (Random& random)
         {
-            if (is_signed)
-            {
-                *dst++ = static_cast<type> ((random.nextFloat() * 16.0) - 8.0);
-            }
-            else
-            {
-                *dst++ = static_cast<type> (random.nextFloat() * 8.0);
-            }
+            return static_cast<type> (random.nextInt64());
+
         }
-    }
+    };
 
+    template <typename type> struct RandomValue { static type next (Random& random) { return RandomPrimitive<type>::next (random); } };
+
     template <typename type>
-    static void fillRandom (std::complex<type>* dst, const int size, Random& random)
+    struct RandomValue<std::complex<type>>
     {
-        for (int i = 0; i < size; ++i)
+        static std::complex<type> next (Random& random)
         {
-            type real, imag;
+            return {RandomPrimitive<type>::next (random), RandomPrimitive<type>::next (random)};
+        }
+    };
 
-            real = static_cast<type> ((random.nextFloat() * 16.0) - 8.0);
-            imag = static_cast<type> ((random.nextFloat() * 16.0) - 8.0);
-            *dst++ = std::complex<type> (real, imag);
+    template <typename type>
+    struct VecFiller
+    {
+        static void fill (type* dst, const int size, Random& random)
+        {
+            for (int i = 0; i < size; ++i)
+                dst[i] = RandomValue<type>::next (random);
         }
-    }
+    };
+
+    // We need to specialise for complex types: otherwise GCC 6 gives
+    // us an ICE internal compiler error after which the compiler seg faults.
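+    // (Illustrative note: complex elements are stored interleaved, so element i
+    // of a complex register occupies scalar lanes 2*i and 2*i + 1 — which is why
+    // CmplxSIMDOps::get/set in the implementation header compute j = i << 1.)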
+ template + struct VecFiller> + { + static void fill (std::complex* dst, const int size, Random& random) + { + for (int i = 0; i < size; ++i) + dst[i] = std::complex (RandomValue::next (random), RandomValue::next (random)); + } + }; + + template + struct VecFiller> + { + static SIMDRegister fill(Random& random) + { + constexpr int size = (int) SIMDRegister::SIMDNumElements; + #ifdef _MSC_VER + __declspec(align(sizeof (SIMDRegister))) type elements[size]; + #else + type elements[size] __attribute__((aligned(sizeof (SIMDRegister)))); + #endif + + VecFiller::fill (elements, size, random); + return SIMDRegister::fromRawArray (elements); + } + }; // Avoid visual studio warning template @@ -102,10 +143,17 @@ public: template static bool allValuesEqualTo (const SIMDRegister& vec, const type scalar) { + #ifdef _MSC_VER + __declspec(align(sizeof (SIMDRegister))) type elements[SIMDRegister::SIMDNumElements]; + #else + type elements[SIMDRegister::SIMDNumElements] __attribute__((aligned(sizeof (SIMDRegister)))); + #endif + + vec.copyToRawArray (elements); + // as we do not want to rely on the access operator we cast this to a primitive pointer - const type* ptr = reinterpret_cast (&vec); for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) - if (ptr[i] != scalar) return false; + if (elements[i] != scalar) return false; return true; } @@ -246,12 +294,20 @@ public: u.expect (allValuesEqualTo (SIMDRegister::expand (static_cast (23)), 23)); { - SIMDRegister a; + #ifdef _MSC_VER + __declspec(align(sizeof (SIMDRegister))) type elements[SIMDRegister::SIMDNumElements]; + #else + type elements[SIMDRegister::SIMDNumElements] __attribute__((aligned(sizeof (SIMDRegister)))); + #endif + SIMDRegister_test_internal::VecFiller::fill (elements, SIMDRegister::SIMDNumElements, random); + SIMDRegister a (SIMDRegister::fromRawArray (elements)); + + u.expect (vecEqualToArray (a, elements)); - type* ptr = reinterpret_cast(&a); - SIMDRegister_test_internal::fillRandom (ptr, SIMDRegister::SIMDNumElements, random); + SIMDRegister b (a); + a *= static_cast (2); - u.expect (vecEqualToArray (SIMDRegister (a), ptr)); + u.expect (vecEqualToArray (b, elements)); } } }; @@ -265,7 +321,7 @@ public: SIMDRegister a; type array [SIMDRegister::SIMDNumElements]; - SIMDRegister_test_internal::fillRandom (array, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array, SIMDRegister::SIMDNumElements, random); // Test non-const access operator for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) @@ -290,14 +346,17 @@ public: for (int n = 0; n < 100; ++n) { // set-up - SIMDRegister a, b, c; + SIMDRegister a (static_cast (0)); + SIMDRegister b (static_cast (0)); + SIMDRegister c (static_cast (0)); + type array_a [SIMDRegister::SIMDNumElements]; type array_b [SIMDRegister::SIMDNumElements]; type array_c [SIMDRegister::SIMDNumElements]; - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_c, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_c, SIMDRegister::SIMDNumElements, random); copy (a, array_a); copy (b, array_b); copy (c, array_c); @@ -310,9 +369,9 @@ public: u.expect (vecEqualToArray (a, 
array_a)); u.expect (vecEqualToArray (b, array_b)); - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_c, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_c, SIMDRegister::SIMDNumElements, random); copy (a, array_a); copy (b, array_b); copy (c, array_c); @@ -326,9 +385,9 @@ public: u.expect (vecEqualToArray (b, array_b)); // set-up again - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_c, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_c, SIMDRegister::SIMDNumElements, random); copy (a, array_a); copy (b, array_b); copy (c, array_c); // test out-of-place with both params being vectors @@ -363,7 +422,6 @@ public: typedef typename SIMDRegister::vMaskType vMaskType; typedef typename SIMDRegister::MaskType MaskType; - for (int n = 0; n < 100; ++n) { // Check flip sign bit and using as a union @@ -372,21 +430,28 @@ public: union ConversionUnion { - inline ConversionUnion() {} + inline ConversionUnion() : floatVersion (static_cast (0)) {} inline ~ConversionUnion() {} SIMDRegister floatVersion; vMaskType intVersion; } a, b; vMaskType bitmask = vMaskType::expand (static_cast (1) << (sizeof (MaskType) - 1)); - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); copy (a.floatVersion, array_a); copy (b.floatVersion, array_a); Operation::template inplace, vMaskType> (a.floatVersion, bitmask); Operation::template inplace (b.intVersion, bitmask); - u.expect (vecEqualToArray (a.floatVersion, reinterpret_cast (&b.floatVersion))); + #ifdef _MSC_VER + __declspec(align(sizeof (SIMDRegister))) type elements[SIMDRegister::SIMDNumElements]; + #else + type elements[SIMDRegister::SIMDNumElements] __attribute__((aligned(sizeof (SIMDRegister)))); + #endif + b.floatVersion.copyToRawArray (elements); + + u.expect (vecEqualToArray (a.floatVersion, elements)); } // set-up @@ -397,42 +462,51 @@ public: MaskType array_b [SIMDRegister::SIMDNumElements]; MaskType array_c [SIMDRegister::SIMDNumElements]; - type* conv_a = reinterpret_cast (array_a); - type* conv_c = reinterpret_cast (array_c); + type float_a [SIMDRegister::SIMDNumElements]; + type float_c [SIMDRegister::SIMDNumElements]; - SIMDRegister_test_internal::fillRandom (conv_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (conv_c, SIMDRegister::SIMDNumElements, random); - copy (a, conv_a); copy (b, array_b); copy (c, conv_c); + SIMDRegister_test_internal::VecFiller::fill (float_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, 
SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (float_c, SIMDRegister::SIMDNumElements, random); + + memcpy (array_a, float_a, sizeof (type) * SIMDRegister::SIMDNumElements); + memcpy (array_c, float_c, sizeof (type) * SIMDRegister::SIMDNumElements); + copy (a, float_a); copy (b, array_b); copy (c, float_c); // test in-place with both params being vectors for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) Operation::template inplace (array_a[i], array_b[i]); + memcpy (float_a, array_a, sizeof (type) * SIMDRegister::SIMDNumElements); Operation::template inplace, vMaskType> (a, b); - u.expect (vecEqualToArray (a, conv_a)); + u.expect (vecEqualToArray (a, float_a)); u.expect (vecEqualToArray (b, array_b)); - SIMDRegister_test_internal::fillRandom (conv_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (conv_c, SIMDRegister::SIMDNumElements, random); - copy (a, conv_a); copy (b, array_b); copy (c, conv_c); + SIMDRegister_test_internal::VecFiller::fill (float_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (float_c, SIMDRegister::SIMDNumElements, random); + memcpy (array_a, float_a, sizeof (type) * SIMDRegister::SIMDNumElements); + memcpy (array_c, float_c, sizeof (type) * SIMDRegister::SIMDNumElements); + copy (a, float_a); copy (b, array_b); copy (c, float_c); // test in-place with one param being scalar for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) Operation::template inplace (array_a[i], static_cast (9)); + memcpy (float_a, array_a, sizeof (type) * SIMDRegister::SIMDNumElements); Operation::template inplace, MaskType> (a, static_cast (9)); - u.expect (vecEqualToArray (a, conv_a)); + u.expect (vecEqualToArray (a, float_a)); u.expect (vecEqualToArray (b, array_b)); // set-up again - SIMDRegister_test_internal::fillRandom (conv_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (conv_c, SIMDRegister::SIMDNumElements, random); - copy (a, conv_a); copy (b, array_b); copy (c, conv_c); + SIMDRegister_test_internal::VecFiller::fill (float_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (float_c, SIMDRegister::SIMDNumElements, random); + memcpy (array_a, float_a, sizeof (type) * SIMDRegister::SIMDNumElements); + memcpy (array_c, float_c, sizeof (type) * SIMDRegister::SIMDNumElements); + copy (a, float_a); copy (b, array_b); copy (c, float_c); // test out-of-place with both params being vectors for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) @@ -440,22 +514,26 @@ public: array_c[i] = Operation::template outofplace (array_a[i], array_b[i]); } + memcpy (float_a, array_a, sizeof (type) * SIMDRegister::SIMDNumElements); + memcpy (float_c, array_c, sizeof (type) * SIMDRegister::SIMDNumElements); c = Operation::template outofplace, vMaskType> (a, b); - u.expect (vecEqualToArray (a, conv_a)); + u.expect (vecEqualToArray (a, float_a)); u.expect (vecEqualToArray (b, array_b)); - u.expect (vecEqualToArray (c, conv_c)); + u.expect (vecEqualToArray (c, float_c)); // test out-of-place with one param being scalar for (size_t i = 0; 
i < SIMDRegister::SIMDNumElements; ++i) array_c[i] = Operation::template outofplace (array_a[i], static_cast (9)); + memcpy (float_a, array_a, sizeof (type) * SIMDRegister::SIMDNumElements); + memcpy (float_c, array_c, sizeof (type) * SIMDRegister::SIMDNumElements); c = Operation::template outofplace, MaskType> (a, static_cast (9)); - u.expect (vecEqualToArray (a, conv_a)); + u.expect (vecEqualToArray (a, float_a)); u.expect (vecEqualToArray (b, array_b)); - u.expect (vecEqualToArray (c, conv_c)); + u.expect (vecEqualToArray (c, float_c)); } } }; @@ -481,8 +559,8 @@ public: MaskType array_ge [SIMDRegister::SIMDNumElements]; - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); // do check for (size_t j = 0; j < SIMDRegister::SIMDNumElements; ++j) @@ -495,7 +573,9 @@ public: array_ge [j] = (array_a[j] >= array_b[j]) ? static_cast (-1) : 0; } - SIMDRegister a, b; + SIMDRegister a (static_cast (0)); + SIMDRegister b (static_cast (0)); + vMaskType eq, neq, lt, le, gt, ge; copy (a, array_a); @@ -517,8 +597,8 @@ public: do { - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); } while (std::equal (array_a, array_a + SIMDRegister::SIMDNumElements, array_b)); copy (a, array_a); @@ -528,7 +608,7 @@ public: u.expect (! (a == b)); u.expect (! (b == a)); - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); copy (a, array_a); copy (b, array_a); @@ -537,7 +617,7 @@ public: u.expect (! (a != b)); u.expect (! (b != a)); - auto scalar = a[0]; + type scalar = a[0]; a = SIMDRegister::expand (scalar); u.expect (a == scalar); @@ -562,10 +642,10 @@ public: type array_c [SIMDRegister::SIMDNumElements]; type array_d [SIMDRegister::SIMDNumElements]; - SIMDRegister_test_internal::fillRandom (array_a, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_b, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_c, SIMDRegister::SIMDNumElements, random); - SIMDRegister_test_internal::fillRandom (array_d, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_a, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_b, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_c, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array_d, SIMDRegister::SIMDNumElements, random); // check for (size_t i = 0; i < SIMDRegister::SIMDNumElements; ++i) @@ -607,7 +687,10 @@ public: array_max[j] = (array_a[j] > array_b[j]) ? 
array_a[j] : array_b[j]; } - SIMDRegister a, b, vMin, vMax; + SIMDRegister a (static_cast (0)); + SIMDRegister b (static_cast (0)); + SIMDRegister vMin (static_cast (0)); + SIMDRegister vMax (static_cast (0)); copy (a, array_a); copy (b, array_b); @@ -638,7 +721,7 @@ public: type array [SIMDRegister::SIMDNumElements]; type sumCheck = 0; - SIMDRegister_test_internal::fillRandom (array, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array, SIMDRegister::SIMDNumElements, random); for (size_t j = 0; j < SIMDRegister::SIMDNumElements; ++j) { @@ -674,14 +757,14 @@ public: u.expect (a != value); u.expect (! (a == value)); - SIMDRegister_test_internal::fillRandom (array, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array, SIMDRegister::SIMDNumElements, random); copy (a, array); copy (b, array); u.expect (a == b); u.expect (! (a != b)); - SIMDRegister_test_internal::fillRandom (array, SIMDRegister::SIMDNumElements, random); + SIMDRegister_test_internal::VecFiller::fill (array, SIMDRegister::SIMDNumElements, random); copy (b, array); u.expect (a != b); diff --git a/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h b/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h index 1c20774346..642642fcf4 100644 --- a/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h +++ b/modules/juce_dsp/native/juce_avx_SIMDNativeOps.h @@ -71,11 +71,11 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (float, kOne); //============================================================================== - static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast (a); } + static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return load (a); } + static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return _mm256_castsi256_ps (_mm256_load_si256 ((const __m256i*) a)); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); } - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); } @@ -95,6 +95,8 @@ struct SIMDNativeOps static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE swapevenodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); } + static forcedinline float JUCE_VECTOR_CALLTYPE get (__m256 v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256 JUCE_VECTOR_CALLTYPE set (__m256 v, size_t i, float s) noexcept { 
return SIMDFallbackOps::set (v, i, s); } static forcedinline __m256 JUCE_VECTOR_CALLTYPE oddevensum (__m256 a) noexcept { a = _mm256_add_ps (_mm256_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); @@ -114,7 +116,12 @@ struct SIMDNativeOps __m256 retval = _mm256_dp_ps (a, vconst (kOne), 0xff); __m256 tmp = _mm256_permute2f128_ps (retval, retval, 1); retval = _mm256_add_ps (retval, tmp); - return ((float*) &retval)[0]; + + #if JUCE_GCC + return retval[0]; + #else + return _mm256_cvtss_f32 (retval); + #endif } }; @@ -134,8 +141,8 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (double, kOne); //============================================================================== - static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast (a); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return load (a); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return _mm256_castsi256_pd (_mm256_load_si256 ((const __m256i*) a)); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); } static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); } @@ -153,12 +160,15 @@ struct SIMDNativeOps static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); } - static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept { return (_mm256_movemask_pd (equal (a, b)) == 0xf); } + static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept { return (_mm256_movemask_pd (equal (a, b)) == 0xf); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); } static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); } + static forcedinline double JUCE_VECTOR_CALLTYPE get (__m256d v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256d JUCE_VECTOR_CALLTYPE set (__m256d v, size_t i, double s) noexcept { return SIMDFallbackOps::set (v, i, s); } + //============================================================================== static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept @@ -173,7 +183,12 @@ struct SIMDNativeOps __m256d retval = _mm256_hadd_pd (a, a); __m256d tmp = 
_mm256_permute2f128_pd (retval, retval, 1); retval = _mm256_add_pd (retval, tmp); - return ((double*) &retval)[0]; + + #if JUCE_GCC + return retval[0]; + #else + return _mm256_cvtsd_f64 (retval); + #endif } }; @@ -190,15 +205,16 @@ struct SIMDNativeOps //============================================================================== DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet); - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); } @@ -207,22 +223,10 @@ struct SIMDNativeOps static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return _mm256_movemask_epi8 (equal (a, b)) == -1; } static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } + static forcedinline int8_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int8_t s) noexcept { return SIMDFallbackOps::set (v, i, s); } //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept - { - const auto* b = reinterpret_cast (a); - return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24], - b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16], - b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8], - b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); - } - - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept - { - 
SIMDFallbackOps::store (value, dest); - } - static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256()); @@ -234,10 +238,19 @@ struct SIMDNativeOps hi = _mm256_hadd_epi16 (hi, hi); } - const int8_t* lo_ptr = reinterpret_cast (&lo); - const int8_t* hi_ptr = reinterpret_cast (&hi); + #if JUCE_GCC + return (int8_t) ((lo[0] & 0xff) + + (hi[0] & 0xff) + + (lo[2] & 0xff) + + (hi[2] & 0xff)); + #else + constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6); - return (int8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]); + return (int8_t) ((_mm256_cvtsi256_si32 (lo) & 0xff) + + (_mm256_cvtsi256_si32 (hi) & 0xff) + + (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask)) & 0xff) + + (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask)) & 0xff)); + #endif } static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) @@ -266,16 +279,17 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (uint8_t, kHighBit); DECLARE_AVX_SIMD_CONST (uint8_t, kAllBitsSet); - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm256_set1_epi8 ((int8_t) s); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu8 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); } @@ -284,22 +298,10 @@ struct SIMDNativeOps static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } static 
forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } + static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint8_t s) noexcept { return SIMDFallbackOps::set (v, i, s); } //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept - { - const auto* b = reinterpret_cast (a); - return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24], - b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16], - b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8], - b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); - } - - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept - { - SIMDFallbackOps::store (value, dest); - } - static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256()); @@ -311,10 +313,19 @@ struct SIMDNativeOps hi = _mm256_hadd_epi16 (hi, hi); } - const uint8_t* lo_ptr = reinterpret_cast (&lo); - const uint8_t* hi_ptr = reinterpret_cast (&hi); + #if JUCE_GCC + return (uint8_t) ((static_cast (lo[0]) & 0xffu) + + (static_cast (hi[0]) & 0xffu) + + (static_cast (lo[2]) & 0xffu) + + (static_cast (hi[2]) & 0xffu)); + #else + constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6); - return (uint8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]); + return (uint8_t) ((static_cast (_mm256_cvtsi256_si32 (lo)) & 0xffu) + + (static_cast (_mm256_cvtsi256_si32 (hi)) & 0xffu) + + (static_cast (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (lo, mask))) & 0xffu) + + (static_cast (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (hi, mask))) & 0xffu)); + #endif } static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) @@ -343,8 +354,9 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (int16_t, kAllBitsSet); //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm256_set1_epi16 (s); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); } @@ -352,7 +364,7 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { 
return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi16 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi16 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); } @@ -361,26 +373,24 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); } + static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int16_t s) noexcept { return SIMDFallbackOps::set (v, i, s); } //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept - { - return _mm256_set_epi16 (a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8], - a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]); - } - - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept - { - SIMDFallbackOps::store (value, dest); - } - static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i tmp = _mm256_hadd_epi16 (a, a); tmp = _mm256_hadd_epi16 (tmp, tmp); tmp = _mm256_hadd_epi16 (tmp, tmp); - int16_t* ptr = reinterpret_cast (&tmp); - return (int16_t) (ptr[0] + ptr[8]); + + #if JUCE_GCC + return (int16_t) ((tmp[0] & 0xffff) + (tmp[2] & 0xffff)); + #else + constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6); + + return (int16_t) ((_mm256_cvtsi256_si32 (tmp) & 0xffff) + + (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)) & 0xffff)); + #endif } }; @@ -400,46 +410,45 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (uint16_t, kAllBitsSet); //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept 
{ return _mm256_xor_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } - static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, 
__m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } + static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); } + static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint16_t s) noexcept { return SIMDFallbackOps::set (v, i, s); } //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept - { - const auto* b = reinterpret_cast (a); - return _mm256_set_epi16 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8], - b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]); - } - - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept - { - SIMDFallbackOps::store (value, dest); - } - static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i tmp = _mm256_hadd_epi16 (a, a); tmp = _mm256_hadd_epi16 (tmp, tmp); tmp = _mm256_hadd_epi16 (tmp, tmp); - uint16_t* ptr = reinterpret_cast (&tmp); - return (uint16_t) (ptr[0] + ptr[8]); + + #if JUCE_GCC + return (uint16_t) ((static_cast (tmp[0]) & 0xffffu) + + (static_cast (tmp[2]) & 0xffffu)); + #else + constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6); + + return (uint16_t) ((static_cast (_mm256_cvtsi256_si32 (tmp)) & 0xffffu) + + (static_cast (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask))) & 0xffffu)); + #endif } }; @@ -458,8 +467,9 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet); //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast (a); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm256_set1_epi32 (s); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); } + static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); } @@ -467,7 +477,7 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) 
noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi32 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi32 (a, b); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); } @@ -476,24 +486,22 @@ struct SIMDNativeOps static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); } static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); } static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); } + static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps::get (v, i); } + static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int32_t s) noexcept { return SIMDFallbackOps::set (v, i, s); } //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept - { - return _mm256_set_epi32 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]); - } - - static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept - { - SIMDFallbackOps::store (value, dest); - } - static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { __m256i tmp = _mm256_hadd_epi32 (a, a); tmp = _mm256_hadd_epi32 (tmp, tmp); - int32_t* ptr = reinterpret_cast (&tmp); - return ptr[0] + ptr[4]; + + #if JUCE_GCC + return tmp[0] + tmp[2]; + #else + constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6); + + return _mm256_cvtsi256_si32 (tmp) + _mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)); + #endif } }; @@ -513,44 +521,43 @@ struct SIMDNativeOps DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit); //============================================================================== - static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast (a); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); } - static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); 
@@ -513,44 +521,43 @@ struct SIMDNativeOps<uint32_t>
     DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit);
 
     //==============================================================================
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
+    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint32_t, __m256i>::get (v, i); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint32_t s) noexcept { return SIMDFallbackOps<uint32_t, __m256i>::set (v, i, s); }
 
     //==============================================================================
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const int32_t*> (a);
-        return _mm256_set_epi32 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint32_t, __m256i>::store (value, dest);
-    }
-
     static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i tmp = _mm256_hadd_epi32 (a, a);
         tmp = _mm256_hadd_epi32 (tmp, tmp);
-        uint32_t* ptr = reinterpret_cast<uint32_t*> (&tmp);
-        return ptr[0] + ptr[4];
+
+       #if JUCE_GCC
+        return static_cast<uint32_t> (tmp[0]) + static_cast<uint32_t> (tmp[2]);
+       #else
+        constexpr int mask = (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6);
+
+        return static_cast<uint32_t> (_mm256_cvtsi256_si32 (tmp))
+             + static_cast<uint32_t> (_mm256_cvtsi256_si32 (_mm256_permute4x64_epi64 (tmp, mask)));
+       #endif
    }
 };
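Note that _mm256_load_si256 / _mm256_store_si256 are the aligned forms, so the new load/store assume 32-byte-aligned pointers (presumably guaranteed for the DECLARE_AVX_SIMD_CONST data and for SIMDRegister's own storage). A minimal sketch of that alignment contract (names illustrative):

    #include <immintrin.h>
    #include <cstdint>

    alignas (32) static const uint32_t allBits[8] =
        { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
          0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff };

    static __m256i loadAllBits()
    {
        // aligned load: the pointer must be 32-byte aligned, which alignas guarantees
        return _mm256_load_si256 (reinterpret_cast<const __m256i*> (allBits));
    }

_mm256_loadu_si256 would be the unaligned alternative, at some cost on older hardware.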
@@ -568,14 +575,16 @@ struct SIMDNativeOps<int64_t>
     //==============================================================================
     DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
 
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { return _mm256_set1_epi64x ((int64_t) s); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
@@ -584,47 +593,10 @@ struct SIMDNativeOps<int64_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
-
-    //==============================================================================
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept
-    {
-        return _mm256_set_epi64x (a[3], a[2], a[1], a[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept
-    {
-        SIMDFallbackOps<int64_t, __m256i>::store (value, dest);
-    }
-
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
-    {
-       #ifdef _MSC_VER
-        __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
-        return *reinterpret_cast<const __m256i*> (&tmp);
-       #else
-        return _mm256_set1_epi64x ((int64_t) s);
-       #endif
-    }
-
-    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
-    {
-        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
-        return ptr[0] + ptr[1] + ptr[2] + ptr[3];
-    }
-
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
-    {
-        __m256i retval;
-
-        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
-        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
-        int64_t* dst = reinterpret_cast<int64_t*> (&retval);
-
-        for (int i = 0; i < 4; ++i)
-            dst[i] = aptr[i] * bptr[i];
-
-        return retval;
-    }
+    static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<int64_t, __m256i>::get (v, i); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, int64_t s) noexcept { return SIMDFallbackOps<int64_t, __m256i>::set (v, i, s); }
+    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { return SIMDFallbackOps<int64_t, __m256i>::sum (a); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return SIMDFallbackOps<int64_t, __m256i>::mul (a, b); }
 };
 
 //==============================================================================
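AVX2 provides no full 64-bit element multiply (_mm256_mul_epi32 widens 32-bit inputs, and a 64-bit mullo only arrives with AVX-512), so mul and sum for 64-bit elements now delegate to SIMDFallbackOps, which works element-wise. A sketch of what such a fallback multiply amounts to (assumes an AVX2 target; the helper name is illustrative):

    #include <immintrin.h>
    #include <cstdint>

    static __m256i mul64 (__m256i a, __m256i b)
    {
        alignas (32) int64_t x[4], y[4];
        _mm256_store_si256 (reinterpret_cast<__m256i*> (x), a);
        _mm256_store_si256 (reinterpret_cast<__m256i*> (y), b);

        for (int i = 0; i < 4; ++i)   // scalar multiply per 64-bit lane
            x[i] *= y[i];

        return _mm256_load_si256 (reinterpret_cast<const __m256i*> (x));
    }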
@@ -642,65 +614,29 @@ struct SIMDNativeOps<uint64_t>
     DECLARE_AVX_SIMD_CONST (uint64_t, kAllBitsSet);
     DECLARE_AVX_SIMD_CONST (uint64_t, kHighBit);
 
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
-
-    //==============================================================================
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const int64_t*> (a);
-        return _mm256_set_epi64x (b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint64_t, __m256i>::store (value, dest);
-    }
-
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
-    {
-       #ifdef _MSC_VER
-        __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
-        return *reinterpret_cast<const __m256i*> (&tmp);
-       #else
-        return _mm256_set1_epi64x ((int64_t) s);
-       #endif
-    }
-
-    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
-    {
-        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
-        return ptr[0] + ptr[1] + ptr[2] + ptr[3];
-    }
-
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
-    {
-        __m256i retval;
-
-        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
-        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
-        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);
-
-        for (int i = 0; i < 4; ++i)
-            dst[i] = aptr[i] * bptr[i];
-
-        return retval;
-    }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { return _mm256_set1_epi64x ((int64_t) s); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* p) noexcept { return _mm256_load_si256 ((const __m256i*) p); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept { _mm256_store_si256 ((__m256i*) dest, value); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, load (kHighBit)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, load (kAllBitsSet)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
+    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m256i v, size_t i) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::get (v, i); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE set (__m256i v, size_t i, uint64_t s) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::set (v, i, s); }
+    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::sum (a); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return SIMDFallbackOps<uint64_t, __m256i>::mul (a, b); }
 };
 
 #endif
diff --git a/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h b/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h
index d7336c8818..1502d26cbe 100644
--- a/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h
@@ -63,8 +63,11 @@ struct SIMDFallbackOps
     static constexpr size_t mask = (sizeof (vSIMDType) / sizeof (ScalarType)) - 1;
     static constexpr size_t bits = SIMDInternal::Log2Helper<n>::value;
 
-    // corresponding mask type
-    typedef typename SIMDInternal::MaskTypeFor<ScalarType>::type MaskType;
+    // helper types
+    using MaskType = typename SIMDInternal::MaskTypeFor<ScalarType>::type;
+    union UnionType     { vSIMDType v; ScalarType s[n]; };
+    union UnionMaskType { vSIMDType v; MaskType   m[n]; };
+
     // fallback methods
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return apply<ScalarAdd> (a, b); }
@@ -82,69 +85,80 @@ struct SIMDFallbackOps
     static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return cmp<ScalarGt> (a, b); }
     static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return cmp<ScalarGeq> (a, b); }
 
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept
+    static forcedinline ScalarType get (vSIMDType v, size_t i) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<MaskType*> (&retval);
-        auto* aSrc = reinterpret_cast<const MaskType*> (&a);
+        UnionType u {v};
+        return u.s[i];
+    }
+
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, ScalarType s) noexcept
+    {
+        UnionType u {v};
+
+        u.s[i] = s;
+        return u.v;
+    }
+
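The UnionType/UnionMaskType members replace the previous reinterpret_cast pointer gymnastics: writing one union member and reading the other is the conventional type-punning idiom that the supported compilers define, and it sidesteps the aliasing concerns of casting the address of a vector to ScalarType*. A reduced sketch of the same get/set idiom (generic names, not the library's):

    #include <cstddef>

    template <typename Vec, typename Scalar, size_t N>
    struct LaneAccess
    {
        union U { Vec v; Scalar s[N]; };   // the same storage viewed two ways

        static Scalar get (Vec v, size_t i) noexcept        { U u {v}; return u.s[i]; }
        static Vec set (Vec v, size_t i, Scalar x) noexcept { U u {v}; u.s[i] = x; return u.v; }
    };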
+    static forcedinline vSIMDType bit_not (vSIMDType av) noexcept
+    {
+        UnionMaskType a {av};
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = ~aSrc [i];
+            a.m[i] = ~a.m[i];
 
-        return retval;
+        return a.v;
     }
 
-    static forcedinline ScalarType sum (vSIMDType a) noexcept
+    static forcedinline ScalarType sum (vSIMDType av) noexcept
     {
+        UnionType a {av};
         auto retval = static_cast<ScalarType> (0);
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
 
         for (size_t i = 0; i < n; ++i)
-            retval += aSrc [i];
+            retval += a.s[i];
 
         return retval;
     }
 
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept
+    static forcedinline vSIMDType multiplyAdd (vSIMDType av, vSIMDType bv, vSIMDType cv) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<ScalarType*> (&retval);
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
-        auto* bSrc = reinterpret_cast<const ScalarType*> (&b);
-        auto* cSrc = reinterpret_cast<const ScalarType*> (&c);
+        UnionType a {av}, b {bv}, c {cv};
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = aSrc [i] + (bSrc [i] * cSrc [i]);
+            a.s[i] += b.s[i] * c.s[i];
 
-        return retval;
+        return a.v;
     }
 
     //==============================================================================
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept
+    static forcedinline bool allEqual (vSIMDType av, vSIMDType bv) noexcept
     {
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
-        auto* bSrc = reinterpret_cast<const ScalarType*> (&b);
+        UnionType a {av}, b {bv};
 
         for (size_t i = 0; i < n; ++i)
-            if (aSrc[i] != bSrc[i])
+            if (a.s[i] != b.s[i])
                 return false;
 
         return true;
     }
 
     //==============================================================================
-    static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept
+    static forcedinline vSIMDType cmplxmul (vSIMDType av, vSIMDType bv) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<std::complex<ScalarType>*> (&retval);
-        auto* aSrc = reinterpret_cast<const std::complex<ScalarType>*> (&a);
-        auto* bSrc = reinterpret_cast<const std::complex<ScalarType>*> (&b);
+        UnionType a {av}, b {bv}, r;
 
         const int m = n >> 1;
         for (int i = 0; i < m; ++i)
-            dst [i] = aSrc [i] * bSrc [i];
+        {
+            std::complex<ScalarType> result
+                  = std::complex<ScalarType> (a.s[i<<1], a.s[(i<<1)|1])
+                  * std::complex<ScalarType> (b.s[i<<1], b.s[(i<<1)|1]);
 
-        return retval;
+            r.s[i<<1]     = result.real();
+            r.s[(i<<1)|1] = result.imag();
+        }
+
+        return r.v;
     }
 
     struct ScalarAdd { static forcedinline ScalarType op (ScalarType a, ScalarType b) noexcept { return a + b; } };
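cmplxmul treats the vector as interleaved complex numbers (re, im, re, im, ...), so the scalar indices i<<1 and (i<<1)|1 address the real and imaginary parts of the i-th complex element. The same computation over a plain interleaved array, as a sketch:

    #include <complex>
    #include <cstddef>

    template <typename Scalar, std::size_t N>   // N scalars = N/2 complex values
    void complexMulInterleaved (const Scalar (&a)[N], const Scalar (&b)[N], Scalar (&r)[N])
    {
        for (std::size_t i = 0; i < N / 2; ++i)
        {
            auto c = std::complex<Scalar> (a[2 * i], a[2 * i + 1])
                   * std::complex<Scalar> (b[2 * i], b[2 * i + 1]);

            r[2 * i]     = c.real();
            r[2 * i + 1] = c.imag();
        }
    }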
@@ -163,90 +177,78 @@ struct SIMDFallbackOps
 
     // generic apply routines for operations above
     template <typename Op>
-    static forcedinline vSIMDType apply (vSIMDType a, vSIMDType b) noexcept
+    static forcedinline vSIMDType apply (vSIMDType av, vSIMDType bv) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<ScalarType*> (&retval);
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
-        auto* bSrc = reinterpret_cast<const ScalarType*> (&b);
+        UnionType a {av}, b {bv};
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = Op::op (aSrc [i], bSrc [i]);
+            a.s[i] = Op::op (a.s[i], b.s[i]);
 
-        return retval;
+        return a.v;
     }
 
     template <typename Op>
-    static forcedinline vSIMDType cmp (vSIMDType a, vSIMDType b) noexcept
+    static forcedinline vSIMDType cmp (vSIMDType av, vSIMDType bv) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<MaskType*> (&retval);
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
-        auto* bSrc = reinterpret_cast<const ScalarType*> (&b);
+        UnionType a {av}, b {bv};
+        UnionMaskType r;
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = Op::op (aSrc [i], bSrc [i]) ? static_cast<MaskType> (-1) : static_cast<MaskType> (0);
+            r.m[i] = Op::op (a.s[i], b.s[i]) ? static_cast<MaskType> (-1) : static_cast<MaskType> (0);
 
-        return retval;
+        return r.v;
     }
 
     template <typename Op>
-    static forcedinline vSIMDType bitapply (vSIMDType a, vSIMDType b) noexcept
+    static forcedinline vSIMDType bitapply (vSIMDType av, vSIMDType bv) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<MaskType*> (&retval);
-        auto* aSrc = reinterpret_cast<const MaskType*> (&a);
-        auto* bSrc = reinterpret_cast<const MaskType*> (&b);
+        UnionMaskType a {av}, b {bv};
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = Op::op (aSrc [i], bSrc [i]);
+            a.m[i] = Op::op (a.m[i], b.m[i]);
 
-        return retval;
+        return a.v;
     }
 
     static forcedinline vSIMDType expand (ScalarType s) noexcept
     {
-        vSIMDType retval;
-        auto* dst = reinterpret_cast<ScalarType*> (&retval);
+        UnionType r;
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = s;
+            r.s[i] = s;
 
-        return retval;
+        return r.v;
    }
 
     static forcedinline vSIMDType load (const ScalarType* a) noexcept
     {
-        vSIMDType retval;
-        auto* dst = reinterpret_cast<ScalarType*> (&retval);
+        UnionType r;
 
         for (size_t i = 0; i < n; ++i)
-            dst [i] = a[i];
+            r.s[i] = a[i];
 
-        return retval;
+        return r.v;
     }
 
-    static forcedinline void store (vSIMDType value, ScalarType* dest) noexcept
+    static forcedinline void store (vSIMDType av, ScalarType* dest) noexcept
     {
-        const auto* src = reinterpret_cast<const ScalarType*> (&value);
+        UnionType a {av};
 
         for (size_t i = 0; i < n; ++i)
-            dest[i] = src[i];
+            dest[i] = a.s[i];
     }
 
     template <unsigned int shuffle_idx>
-    static forcedinline vSIMDType shuffle (vSIMDType a) noexcept
+    static forcedinline vSIMDType shuffle (vSIMDType av) noexcept
     {
-        vSIMDType retval;
-        auto* dst  = reinterpret_cast<ScalarType*> (&retval);
-        auto* aSrc = reinterpret_cast<const ScalarType*> (&a);
+        UnionType a {av}, r;
 
         // the compiler will unroll this loop and the index can
         // be computed at compile-time, so this will be super fast
         for (size_t i = 0; i < n; ++i)
-            dst [i] = aSrc [(shuffle_idx >> (bits * i)) & mask];
+            r.s[i] = a.s[(shuffle_idx >> (bits * i)) & mask];
 
-        return retval;
+        return r.v;
     }
 };
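shuffle's template parameter packs one source index per destination element, `bits` bits each (bits being the log2 of the element count), in the same spirit as the SSE _MM_SHUFFLE immediate. Decoded for four elements (a sketch; names are illustrative):

    #include <cstddef>

    template <unsigned mask, typename T>
    void shuffle4 (const T (&src)[4], T (&dst)[4])
    {
        for (std::size_t i = 0; i < 4; ++i)
            dst[i] = src[(mask >> (2 * i)) & 3];   // 2 bits select each source element
    }

    // shuffle4<(2 << 0) | (3 << 2) | (0 << 4) | (1 << 6)> (a, r) yields
    // r = { a[2], a[3], a[0], a[1] } — the half-swap used by oddevensum.

Because the mask is a compile-time constant, the loop unrolls and every index folds to a constant, as the comment in the code promises.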
diff --git a/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h b/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
index af0b6cd6b7..5eef5051f6 100644
--- a/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_neon_SIMDNativeOps.h
@@ -71,24 +71,26 @@ struct SIMDNativeOps<uint32_t>
     DECLARE_NEON_SIMD_CONST (uint32_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); }
-    static forcedinline vSIMDType load (const uint32_t* a) noexcept { return vld1q_u32 (a); }
-    static forcedinline void store (vSIMDType value, uint32_t* a) noexcept { vst1q_u32 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u32 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u32 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u32 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u32 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u32 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u32 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u32 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u32 ((uint32_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u32 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u32 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u32 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (sum (notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u32 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u32 (a, b); }
+    static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); }
+    static forcedinline vSIMDType load (const uint32_t* a) noexcept { return vld1q_u32 (a); }
+    static forcedinline void store (vSIMDType value, uint32_t* a) noexcept { vst1q_u32 (a, value); }
+    static forcedinline uint32_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, uint32_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u32 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u32 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u32 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u32 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u32 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u32 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u32 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u32 ((uint32_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u32 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u32 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u32 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (sum (notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u32 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u32 (a, b); }
     static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u32 (a, b, c); }
     static forcedinline uint32_t sum (vSIMDType a) noexcept
     {
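get and set can subscript the register directly because GCC and Clang expose the NEON types such as uint32x4_t as vector extensions, which support v[i] with a runtime index; the vgetq_lane_u32 / vsetq_lane_u32 intrinsics would otherwise require a compile-time lane number. A sketch (assumes a NEON target and one of those compilers; names illustrative):

    #include <arm_neon.h>
    #include <cstddef>

    static uint32_t getLane (uint32x4_t v, std::size_t i) noexcept             { return v[i]; }
    static uint32x4_t setLane (uint32x4_t v, std::size_t i, uint32_t s) noexcept { v[i] = s; return v; }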
@@ -113,24 +115,26 @@ struct SIMDNativeOps<int32_t>
     DECLARE_NEON_SIMD_CONST (int32_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (int32_t s) noexcept { return vdupq_n_s32 (s); }
-    static forcedinline vSIMDType load (const int32_t* a) noexcept { return vld1q_s32 (a); }
-    static forcedinline void store (vSIMDType value, int32_t* a) noexcept { vst1q_s32 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s32 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s32 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s32 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s32 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s32 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s32 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s32 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s32 ((int32_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s32 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s32 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s32 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (sum (notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s32 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s32 (a, b); }
+    static forcedinline vSIMDType expand (int32_t s) noexcept { return vdupq_n_s32 (s); }
+    static forcedinline vSIMDType load (const int32_t* a) noexcept { return vld1q_s32 (a); }
+    static forcedinline void store (vSIMDType value, int32_t* a) noexcept { vst1q_s32 (a, value); }
+    static forcedinline int32_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, int32_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s32 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s32 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s32 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s32 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s32 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s32 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s32 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s32 ((int32_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s32 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s32 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s32 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (sum (notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s32 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s32 (a, b); }
     static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s32 (a, b, c); }
     static forcedinline int32_t sum (vSIMDType a) noexcept
     {
@@ -156,26 +160,28 @@ struct SIMDNativeOps<int8_t>
     DECLARE_NEON_SIMD_CONST (int8_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (int8_t s) noexcept { return vdupq_n_s8 (s); }
-    static forcedinline vSIMDType load (const int8_t* a) noexcept { return vld1q_s8 (a); }
-    static forcedinline void store (vSIMDType value, int8_t* a) noexcept { vst1q_s8 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s8 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s8 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s8 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s8 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s8 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s8 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s8 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s8 ((int8_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s8 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s8 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s8 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s8 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s8 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s8 (a, b, c); }
-    static forcedinline int8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
+    static forcedinline vSIMDType expand (int8_t s) noexcept { return vdupq_n_s8 (s); }
+    static forcedinline vSIMDType load (const int8_t* a) noexcept { return vld1q_s8 (a); }
+    static forcedinline void store (vSIMDType value, int8_t* a) noexcept { vst1q_s8 (a, value); }
+    static forcedinline int8_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, int8_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s8 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s8 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s8 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s8 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s8 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s8 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s8 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s8 ((int8_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s8 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s8 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s8 (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s8 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s8 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s8 (a, b, c); }
+    static forcedinline int8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
 
 //==============================================================================
@@ -194,26 +200,28 @@ struct SIMDNativeOps<uint8_t>
     DECLARE_NEON_SIMD_CONST (uint8_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); }
-    static forcedinline vSIMDType load (const uint8_t* a) noexcept { return vld1q_u8 (a); }
-    static forcedinline void store (vSIMDType value, uint8_t* a) noexcept { vst1q_u8 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u8 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u8 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u8 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u8 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u8 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u8 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u8 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u8 ((uint8_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u8 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u8 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u8 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u8 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u8 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u8 (a, b, c); }
-    static forcedinline uint8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
+    static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); }
+    static forcedinline vSIMDType load (const uint8_t* a) noexcept { return vld1q_u8 (a); }
+    static forcedinline void store (vSIMDType value, uint8_t* a) noexcept { vst1q_u8 (a, value); }
+    static forcedinline uint8_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, uint8_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u8 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u8 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u8 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u8 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u8 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u8 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u8 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u8 ((uint8_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u8 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u8 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u8 (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u8 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u8 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u8 (a, b, c); }
+    static forcedinline uint8_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
 
 //==============================================================================
@@ -232,26 +240,28 @@ struct SIMDNativeOps<int16_t>
     DECLARE_NEON_SIMD_CONST (int16_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (int16_t s) noexcept { return vdupq_n_s16 (s); }
-    static forcedinline vSIMDType load (const int16_t* a) noexcept { return vld1q_s16 (a); }
-    static forcedinline void store (vSIMDType value, int16_t* a) noexcept { vst1q_s16 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s16 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s16 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s16 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s16 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s16 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s16 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s16 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s16 ((int16_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s16 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s16 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s16 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s16 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s16 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s16 (a, b, c); }
-    static forcedinline int16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
+    static forcedinline vSIMDType expand (int16_t s) noexcept { return vdupq_n_s16 (s); }
+    static forcedinline vSIMDType load (const int16_t* a) noexcept { return vld1q_s16 (a); }
+    static forcedinline void store (vSIMDType value, int16_t* a) noexcept { vst1q_s16 (a, value); }
+    static forcedinline int16_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, int16_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s16 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s16 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s16 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s16 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s16 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s16 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s16 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s16 ((int16_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_s16 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_s16 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_s16 (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_s16 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_s16 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_s16 (a, b, c); }
+    static forcedinline int16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
@@ -271,26 +281,28 @@ struct SIMDNativeOps<uint16_t>
     DECLARE_NEON_SIMD_CONST (uint16_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); }
-    static forcedinline vSIMDType load (const uint16_t* a) noexcept { return vld1q_u16 (a); }
-    static forcedinline void store (vSIMDType value, uint16_t* a) noexcept { vst1q_u16 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u16 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u16 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u16 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u16 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u16 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u16 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u16 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u16 ((uint16_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u16 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u16 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u16 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u16 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u16 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u16 (a, b, c); }
-    static forcedinline uint16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
+    static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); }
+    static forcedinline vSIMDType load (const uint16_t* a) noexcept { return vld1q_u16 (a); }
+    static forcedinline void store (vSIMDType value, uint16_t* a) noexcept { vst1q_u16 (a, value); }
+    static forcedinline uint16_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, uint16_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u16 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u16 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u16 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u16 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u16 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u16 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u16 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u16 ((uint16_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_u16 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_u16 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_u16 (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_u16 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_u16 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_u16 (a, b, c); }
+    static forcedinline uint16_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
 
 //==============================================================================
@@ -309,26 +321,28 @@ struct SIMDNativeOps<int64_t>
     DECLARE_NEON_SIMD_CONST (int64_t, kAllBitsSet);
 
     //==============================================================================
-    static forcedinline vSIMDType expand (int64_t s) noexcept { return vdupq_n_s64 (s); }
-    static forcedinline vSIMDType load (const int64_t* a) noexcept { return vld1q_s64 (a); }
-    static forcedinline void store (vSIMDType value, int64_t* a) noexcept { vst1q_s64 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s64 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s64 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s64 (a, b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s64 (a, b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s64 (a, b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s64 (b, a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s64 ((int64_t*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return fb::min (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return fb::max (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return fb::equal (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return fb::notEqual (a, b); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThan (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThanOrEqual (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); }
-    static forcedinline int64_t sum (vSIMDType a) noexcept { return fb::sum (a); }
+    static forcedinline vSIMDType expand (int64_t s) noexcept { return vdupq_n_s64 (s); }
+    static forcedinline vSIMDType load (const int64_t* a) noexcept { return vld1q_s64 (a); }
+    static forcedinline void store (vSIMDType value, int64_t* a) noexcept { vst1q_s64 (a, value); }
+    static forcedinline int64_t get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, int64_t s) noexcept { v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s64 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s64 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_s64 (a, b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_s64 (a, b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_s64 (a, b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_s64 (b, a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_s64 ((int64_t*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return fb::min (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return fb::max (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return fb::equal (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return fb::notEqual (a, b); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThan (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThanOrEqual (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); }
+    static forcedinline int64_t sum (vSIMDType a) noexcept { return fb::sum (a); }
 };
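NEON has no 64-bit integer min/max and (on ARMv7) no 64-bit compares, which is why those operations delegate to the fallback; the fallback builds an all-ones/all-zeros mask from the comparison and blends with it. The branch-free pattern per lane, as a scalar sketch (the helper name is illustrative):

    #include <cstdint>

    static int64_t selectMin (int64_t a, int64_t b)
    {
        const int64_t lt = (b > a) ? ~int64_t (0) : int64_t (0);  // mask: all ones when a < b
        return (lt & a) | (~lt & b);                              // pick a or b without branching
    }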
vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u64 (a, b); } - static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u64 (b, a); } - static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u64 ((uint64_t*) kAllBitsSet)); } - static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return fb::min (a, b); } - static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return fb::max (a, b); } - static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return fb::equal (a, b); } - static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return fb::notEqual (a, b); } - static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThan (a, b); } - static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThanOrEqual (a, b); } - static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); } + static forcedinline vSIMDType expand (uint64_t s) noexcept { return vdupq_n_u64 (s); } + static forcedinline vSIMDType load (const uint64_t* a) noexcept { return vld1q_u64 (a); } + static forcedinline void store (vSIMDType value, uint64_t* a) noexcept { vst1q_u64 (a, value); } + static forcedinline uint64_t get (vSIMDType v, size_t i) noexcept { return v[i]; } + static forcedinline vSIMDType set (vSIMDType v, size_t i, uint64_t s) noexcept { v[i] = s; return v; } + static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u64 (a, b); } + static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u64 (a, b); } + static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); } + static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return vandq_u64 (a, b); } + static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return vorrq_u64 (a, b); } + static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return veorq_u64 (a, b); } + static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return vbicq_u64 (b, a); } + static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_u64 ((uint64_t*) kAllBitsSet)); } + static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return fb::min (a, b); } + static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return fb::max (a, b); } + static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return fb::equal (a, b); } + static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return fb::notEqual (a, b); } + static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThan (a, b); } + static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return fb::greaterThanOrEqual (a, b); } + static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps::sum ((SIMDNativeOps::vSIMDType) notEqual (a, b)) == 0); } static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return fb::multiplyAdd (a, b, c); } static forcedinline uint64_t sum (vSIMDType a) noexcept { return fb::sum (a); } }; @@ -389,29 +405,31 @@ struct SIMDNativeOps DECLARE_NEON_SIMD_CONST (float, kOne); 
@@ -389,29 +405,31 @@ struct SIMDNativeOps<float>
     DECLARE_NEON_SIMD_CONST (float, kOne);

     //==============================================================================
-    static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); }
-    static forcedinline vSIMDType load (const float* a) noexcept { return vld1q_f32 (a); }
-    static forcedinline void store (vSIMDType value, float* a) noexcept { vst1q_f32 (a, value); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
-    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_f32 (a, b); }
-    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_f32 (a, b); }
-    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_f32 (a, b); }
-    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_f32 (a, b); }
-    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_f32 (a, b); }
-    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps<uint32_t>::sum ((SIMDNativeOps<uint32_t>::vSIMDType) notEqual (a, b)) == 0); }
-    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_f32 (a, b, c); }
-    static forcedinline vSIMDType dupeven (vSIMDType a) noexcept { return fb::shuffle<(0 << 0) | (0 << 2) | (2 << 4) | (2 << 6)> (a); }
-    static forcedinline vSIMDType dupodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (1 << 2) | (3 << 4) | (3 << 6)> (a); }
-    static forcedinline vSIMDType swapevenodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (0 << 2) | (3 << 4) | (2 << 6)> (a); }
-    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return add (fb::shuffle<(2 << 0) | (3 << 2) | (0 << 4) | (1 << 6)> (a), a); }
+    static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); }
+    static forcedinline vSIMDType load (const float* a) noexcept { return vld1q_f32 (a); }
+    static forcedinline float get (vSIMDType v, size_t i) noexcept { return v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, float s) noexcept { v[i] = s; return v; }
+    static forcedinline void store (vSIMDType value, float* a) noexcept { vst1q_f32 (a, value); }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
+    static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_f32 (a, b); }
+    static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_f32 (a, b); }
+    static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_f32 (a, b); }
+    static forcedinline vSIMDType notEqual (vSIMDType a, vSIMDType b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline vSIMDType greaterThan (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgtq_f32 (a, b); }
+    static forcedinline vSIMDType greaterThanOrEqual (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vcgeq_f32 (a, b); }
+    static forcedinline bool allEqual (vSIMDType a, vSIMDType b) noexcept { return (SIMDNativeOps<uint32_t>::sum ((SIMDNativeOps<uint32_t>::vSIMDType) notEqual (a, b)) == 0); }
+    static forcedinline vSIMDType multiplyAdd (vSIMDType a, vSIMDType b, vSIMDType c) noexcept { return vmlaq_f32 (a, b, c); }
+    static forcedinline vSIMDType dupeven (vSIMDType a) noexcept { return fb::shuffle<(0 << 0) | (0 << 2) | (2 << 4) | (2 << 6)> (a); }
+    static forcedinline vSIMDType dupodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (1 << 2) | (3 << 4) | (3 << 6)> (a); }
+    static forcedinline vSIMDType swapevenodd (vSIMDType a) noexcept { return fb::shuffle<(1 << 0) | (0 << 2) | (3 << 4) | (2 << 6)> (a); }
+    static forcedinline vSIMDType oddevensum (vSIMDType a) noexcept { return add (fb::shuffle<(2 << 0) | (3 << 2) | (0 << 4) | (1 << 6)> (a), a); }

     //==============================================================================
     static forcedinline vSIMDType cmplxmul (vSIMDType a, vSIMDType b) noexcept
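The dupeven/dupodd/swapevenodd helpers above exist so that cmplxmul can multiply interleaved (re, im) lanes using the identity (a + bi)(c + di) = (ac - bd) + (ad + bc)i: dupeven (y) broadcasts the real parts, dupodd (y) the imaginary parts, and swapevenodd (x) swaps each pair. A scalar reference model of that decomposition (illustrative only, not the patch's vector code):

    // Scalar model of the shuffle-based complex multiply on one (re, im) pair.
    #include <complex>

    inline std::complex<float> referenceCmplxmul (std::complex<float> x, std::complex<float> y)
    {
        // even lane: a*c - b*d,  odd lane: a*d + b*c
        return { x.real() * y.real() - x.imag() * y.imag(),
                 x.real() * y.imag() + x.imag() * y.real() };
    }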
@@ -438,15 +456,17 @@
 template <>
 struct SIMDNativeOps<double>
 {
     //==============================================================================
-    typedef struct { double values [2]; } vSIMDType;
+    typedef struct { double v[2]; } vSIMDType;
     typedef SIMDFallbackOps<double, vSIMDType> fb;

-    static forcedinline vSIMDType expand (double s) noexcept { return fb::expand (s); }
-    static forcedinline vSIMDType load (const double* a) noexcept { return fb::load (a); }
-    static forcedinline void store (vSIMDType value, double* a) noexcept { fb::store (value, a); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return fb::add (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return fb::sub (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
+    static forcedinline vSIMDType expand (double s) noexcept { return {{s, s}}; }
+    static forcedinline vSIMDType load (const double* a) noexcept { return {{a[0], a[1]}}; }
+    static forcedinline void store (vSIMDType v, double* a) noexcept { a[0] = v.v[0]; a[1] = v.v[1]; }
+    static forcedinline double get (vSIMDType v, size_t i) noexcept { return v.v[i]; }
+    static forcedinline vSIMDType set (vSIMDType v, size_t i, double s) noexcept { v.v[i] = s; return v; }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return {{a.v[0] + b.v[0], a.v[1] + b.v[1]}}; }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return {{a.v[0] - b.v[0], a.v[1] - b.v[1]}}; }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return {{a.v[0] * b.v[0], a.v[1] * b.v[1]}}; }
     static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return fb::bit_and (a, b); }
     static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return fb::bit_or (a, b); }
     static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return fb::bit_xor (a, b); }
diff --git a/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h b/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
index f8ee84371b..dd80cd00db 100644
--- a/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
+++ b/modules/juce_dsp/native/juce_sse_SIMDNativeOps.h
@@ -95,6 +95,8 @@ struct SIMDNativeOps<float>
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); }
+    static forcedinline float JUCE_VECTOR_CALLTYPE get (__m128 v, size_t i) noexcept { return SIMDFallbackOps<float, __m128>::get (v, i); }
+    static forcedinline __m128 JUCE_VECTOR_CALLTYPE set (__m128 v, size_t i, float s) noexcept { return SIMDFallbackOps<float, __m128>::set (v, i, s); }

     //==============================================================================
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
@@ -114,7 +116,7 @@ struct SIMDNativeOps<float>
         __m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
         retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
        #endif
-        return ((float*) &retval) [0];
+        return _mm_cvtss_f32 (retval);
     }
 };
@@ -135,8 +137,8 @@ struct SIMDNativeOps<double>
     DECLARE_SSE_SIMD_CONST (double, kOne);

     //==============================================================================
-    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
-    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
+    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return load (a); }
+    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return _mm_castsi128_pd (_mm_load_si128 ((const __m128i*) a)); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
     static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
@@ -159,7 +161,9 @@ struct SIMDNativeOps<double>
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); }
-    static forcedinline __m128d oddevensum (__m128d a) noexcept { return a; }
+    static forcedinline __m128d JUCE_VECTOR_CALLTYPE oddevensum (__m128d a) noexcept { return a; }
+    static forcedinline double JUCE_VECTOR_CALLTYPE get (__m128d v, size_t i) noexcept { return SIMDFallbackOps<double, __m128d>::get (v, i); }
+    static forcedinline __m128d JUCE_VECTOR_CALLTYPE set (__m128d v, size_t i, double s) noexcept { return SIMDFallbackOps<double, __m128d>::set (v, i, s); }

     //==============================================================================
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
@@ -178,7 +182,7 @@ struct SIMDNativeOps<double>
        #else
         __m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);
        #endif
-        return ((double*) &retval) [0];
+        return _mm_cvtsd_f64 (retval);
     }
 };
@@ -196,7 +200,9 @@ struct SIMDNativeOps<int8_t>
     //==============================================================================
     DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int8_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
@@ -218,20 +224,10 @@ struct SIMDNativeOps<int8_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline int8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int8_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int8_t s) noexcept { return SIMDFallbackOps<int8_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const char*> (a);
-        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
-                             b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int8_t* dest) noexcept
-    {
-        SIMDFallbackOps<int8_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
@@ -244,18 +240,9 @@ struct SIMDNativeOps<int8_t>
             hi = _mm_hadd_epi16 (hi, hi);
         }

-        const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
-        const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);
-
-        return lo_ptr[0] + hi_ptr[0];
+        return static_cast<int8_t> ((_mm_cvtsi128_si32 (lo) & 0xff) + (_mm_cvtsi128_si32 (hi) & 0xff));
       #else
-        int8_t sum = 0;
-        const int8_t* src = reinterpret_cast<const int8_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int8_t)); ++i)
-            sum += src [i];
-
-        return sum;
+        return SIMDFallbackOps<int8_t, __m128i>::sum (a);
       #endif
     }
@@ -285,8 +272,10 @@ struct SIMDNativeOps<uint8_t>
     DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
     DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return load (a); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint8_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
@@ -303,20 +292,10 @@ struct SIMDNativeOps<uint8_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint8_t s) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const char*> (a);
-        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
-                             b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint8_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint8_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
@@ -329,18 +308,10 @@ struct SIMDNativeOps<uint8_t>
             hi = _mm_hadd_epi16 (hi, hi);
         }

-        const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
-        const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);
-
-        return lo_ptr[0] + hi_ptr[0];
+        return static_cast<uint8_t> ((static_cast<uint32_t> (_mm_cvtsi128_si32 (lo)) & 0xffu)
+                                   + (static_cast<uint32_t> (_mm_cvtsi128_si32 (hi)) & 0xffu));
       #else
-        uint8_t sum = 0;
-        const uint8_t* src = reinterpret_cast<const uint8_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int8_t)); ++i)
-            sum += src [i];
-
-        return sum;
+        return SIMDFallbackOps<uint8_t, __m128i>::sum (a);
       #endif
     }
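A recurring fix in the hunks above: reading a lane by casting the register's address (e.g. ((float*) &retval)[0]) violates strict-aliasing rules, so the patch switches to the dedicated move intrinsics instead. A small sketch of the well-defined equivalents (illustrative only; lowLane is a hypothetical name):

    // Extracting lane 0 of an SSE register without aliasing it through a pointer.
    #include <emmintrin.h>

    inline float  lowLane (__m128  v) { return _mm_cvtss_f32 (v); }     // was ((float*)  &v)[0]
    inline double lowLane (__m128d v) { return _mm_cvtsd_f64 (v); }     // was ((double*) &v)[0]
    inline int    lowLane (__m128i v) { return _mm_cvtsi128_si32 (v); } // low 32 bits of the register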
@@ -370,7 +341,9 @@ struct SIMDNativeOps<int16_t>
     DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int16_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
@@ -388,34 +361,20 @@ struct SIMDNativeOps<int16_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int16_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int16_t s) noexcept { return SIMDFallbackOps<int16_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
-    {
-        return _mm_set_epi16 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int16_t* dest) noexcept
-    {
-        SIMDFallbackOps<int16_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
         __m128i tmp = _mm_hadd_epi16 (a, a);
         tmp = _mm_hadd_epi16 (tmp, tmp);
         tmp = _mm_hadd_epi16 (tmp, tmp);
-        return *reinterpret_cast<int16_t*> (&tmp);
+        return static_cast<int16_t> (_mm_cvtsi128_si32 (tmp) & 0xffff);
       #else
-        int16_t sum = 0;
-        const int16_t* src = reinterpret_cast<const int16_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int16_t)); ++i)
-            sum += src [i];
-
-        return sum;
+        return SIMDFallbackOps<int16_t, __m128i>::sum (a);
       #endif
     }
 };
@@ -436,58 +395,46 @@ struct SIMDNativeOps<uint16_t>
     DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint16_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    #if defined(__SSE4__)
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
    #else
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    #endif
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint16_t s) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const int16_t*> (a);
-        return _mm_set_epi16 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint16_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint16_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
         __m128i tmp = _mm_hadd_epi16 (a, a);
         tmp = _mm_hadd_epi16 (tmp, tmp);
         tmp = _mm_hadd_epi16 (tmp, tmp);
-        return *reinterpret_cast<uint16_t*> (&tmp);
-       #else
-        uint16_t sum = 0;
-        const uint16_t* src = reinterpret_cast<const uint16_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(uint16_t)); ++i)
-            sum += src [i];
-        return sum;
+        return static_cast<uint16_t> (static_cast<uint32_t> (_mm_cvtsi128_si32 (tmp)) & 0xffffu);
+       #else
+        return SIMDFallbackOps<uint16_t, __m128i>::sum (a);
       #endif
     }
 };
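The SSSE3 paths above fold every lane into lane 0 with repeated horizontal adds, then mask off the low bits. A self-contained sketch of the same idiom for eight int16 lanes (assumes SSSE3; horizontalSum16 is a hypothetical name):

    // Three _mm_hadd_epi16 rounds: 8 lanes -> 4 -> 2 -> 1 partial sum,
    // then the total is read from the low 16 bits of lane 0.
    #include <tmmintrin.h>  // SSSE3
    #include <cstdint>

    inline int16_t horizontalSum16 (__m128i v)
    {
        __m128i t = _mm_hadd_epi16 (v, v);
        t = _mm_hadd_epi16 (t, t);
        t = _mm_hadd_epi16 (t, t);
        return static_cast<int16_t> (_mm_cvtsi128_si32 (t) & 0xffff);
    }
    // e.g. horizontalSum16 (_mm_set1_epi16 (3)) == 24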
@@ -507,9 +454,10 @@ struct SIMDNativeOps<int32_t>
     DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int32_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_set_epi32 (a[3], a[2], a[1], a[0]); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
@@ -523,27 +471,17 @@ struct SIMDNativeOps<int32_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int32_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int32_t s) noexcept { return SIMDFallbackOps<int32_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept
-    {
-        SIMDFallbackOps<int32_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
         __m128i tmp = _mm_hadd_epi32 (a, a);
-        tmp = _mm_hadd_epi32 (tmp, tmp);
-        return *reinterpret_cast<int32_t*> (&tmp);
+        return _mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp));
       #else
-        int32_t sum = 0;
-        const int32_t* src = reinterpret_cast<const int32_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int32_t)); ++i)
-            sum += src [i];
-
-        return sum;
+        return SIMDFallbackOps<int32_t, __m128i>::sum (a);
       #endif
     }
@@ -596,49 +534,35 @@ struct SIMDNativeOps<uint32_t>
     DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint32_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint32_t s) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::set (v, i, s); }

     //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const int32_t*> (a);
-        return _mm_set_epi32 (b[3], b[2], b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint32_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint32_t, __m128i>::store (value, dest);
-    }
-
     static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
       #ifdef __SSSE3__
         __m128i tmp = _mm_hadd_epi32 (a, a);
-        tmp = _mm_hadd_epi32 (tmp, tmp);
-        return *reinterpret_cast<uint32_t*> (&tmp);
+        return static_cast<uint32_t> (_mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp)));
       #else
-        uint32_t sum = 0;
-        const uint32_t* src = reinterpret_cast<const uint32_t*> (&a);
-
-        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(uint32_t)); ++i)
-            sum += src [i];
-
-        return sum;
+        return SIMDFallbackOps<uint32_t, __m128i>::sum (a);
       #endif
     }
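SSE2 has no 64-bit lane multiply (a packed _mm_mullo_epi64 only arrives much later, with AVX-512), which is why mul for the 64-bit specialisations below delegates to SIMDFallbackOps. A hedged sketch of what such a scalar fallback looks like (the real helper may be organised differently; mul64Fallback is a hypothetical name):

    // Hypothetical scalar fallback for the missing 64-bit SSE multiply:
    // spill both registers, multiply per lane, reload the result.
    #include <emmintrin.h>
    #include <cstdint>

    inline __m128i mul64Fallback (__m128i a, __m128i b)
    {
        alignas (16) int64_t x[2], y[2];
        _mm_store_si128 (reinterpret_cast<__m128i*> (x), a);
        _mm_store_si128 (reinterpret_cast<__m128i*> (y), b);
        x[0] *= y[0];
        x[1] *= y[1];
        return _mm_load_si128 (reinterpret_cast<const __m128i*> (x));
    }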
@@ -689,16 +613,10 @@ struct SIMDNativeOps<int64_t>
     //==============================================================================
     DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
-    {
-        __m128i retval;
-        int64_t* ptr = reinterpret_cast<int64_t*> (&retval);
-        ptr[0] = ptr[1] = s;
-        return retval;
-    }
-
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_set_epi64x (a[1], a[0]); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { return _mm_set1_epi64x (s); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int64_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
@@ -708,36 +626,14 @@ struct SIMDNativeOps<int64_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
-    static forcedinline __m128i greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
     static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
-
-    //==============================================================================
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept
-    {
-        SIMDFallbackOps<int64_t, __m128i>::store (value, dest);
-    }
-
-    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
-    {
-        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
-        return ptr[0] + ptr[1];
-    }
-
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
-    {
-        __m128i retval;
-
-        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
-        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
-        int64_t* dst = reinterpret_cast<int64_t*> (&retval);
-
-        dst[0] = aptr[0] * bptr[0];
-        dst[1] = aptr[1] * bptr[1];
-
-        return retval;
-    }
+    static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int64_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int64_t s) noexcept { return SIMDFallbackOps<int64_t, __m128i>::set (v, i, s); }
+    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<int64_t, __m128i>::sum (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<int64_t, __m128i>::mul (a, b); }

     static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
     {
@@ -752,19 +648,10 @@ struct SIMDNativeOps<int64_t>

     static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
     {
-       #if defined(__SSE4_1__) && !defined(__clang__)
+       #if defined(__SSE4_1__)
         return _mm_cmpgt_epi64 (a, b);
       #else
-        __m128i retval;
-
-        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
-        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
-        int64_t* dst = reinterpret_cast<int64_t*> (&retval);
-
-        dst[0] = aptr[0] > bptr[0] ? -1LL : 0;
-        dst[1] = aptr[1] > bptr[1] ? -1LL : 0;
-
-        return retval;
+        return SIMDFallbackOps<int64_t, __m128i>::greaterThan (a, b);
       #endif
     }
 };
@@ -784,61 +671,28 @@ struct SIMDNativeOps<uint64_t>
     DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
     DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
-    {
-        __m128i retval;
-        uint64_t* ptr = reinterpret_cast<uint64_t*> (&retval);
-        ptr[0] = ptr[1] = s;
-        return retval;
-    }
-
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
-    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
-
-    //==============================================================================
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
-    {
-        const auto* b = reinterpret_cast<const int64_t*> (a);
-        return _mm_set_epi64x (b[1], b[0]);
-    }
-
-    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint64_t* dest) noexcept
-    {
-        SIMDFallbackOps<uint64_t, __m128i>::store (value, dest);
-    }
-
-    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
-    {
-        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
-        return ptr[0] + ptr[1];
-    }
-
-    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
-    {
-        __m128i retval;
-
-        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
-        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
-        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);
-
-        dst[0] = aptr[0] * bptr[0];
-        dst[1] = aptr[1] * bptr[1];
-
-        return retval;
-    }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return load (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { return _mm_set1_epi64x ((int64_t) s); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept { return _mm_load_si128 ((const __m128i*) a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint64_t* p) noexcept { _mm_store_si128 ((__m128i*) p, v); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
+    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::get (v, i); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint64_t s) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::set (v, i, s); }
+    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::sum (a); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::mul (a, b); }

     static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
     {
@@ -853,19 +707,10 @@ struct SIMDNativeOps<uint64_t>

     static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
     {
-       #if defined(__SSE4_1__) && !defined(__clang__)
-        return _mm_cmpgt_epi64 (a, b);
+       #if defined(__SSE4_1__)
+        return _mm_cmpgt_epi64 (ssign (a), ssign (b));
       #else
-        __m128i retval;
-
-        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
-        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
-        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);
-
-        dst[0] = aptr[0] > bptr[0] ? (uint64_t) -1LL : 0;
-        dst[1] = aptr[1] > bptr[1] ? (uint64_t) -1LL : 0;
-
-        return retval;
+        return SIMDFallbackOps<uint64_t, __m128i>::greaterThan (a, b);
       #endif
     }
 };
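Note the correctness fix in the final hunk: _mm_cmpgt_epi64 performs a signed comparison, which is why the uint64_t version now runs both operands through ssign(), XOR-ing in kHighBit first. Flipping the top bit maps unsigned order onto signed order, as this scalar sketch shows (illustrative only; the function name is hypothetical):

    // Sign-bias trick behind ssign(): after XOR-ing the top bit, a signed
    // comparison orders the values exactly as an unsigned comparison would.
    #include <cstdint>

    inline bool unsignedGreaterViaSigned (uint64_t a, uint64_t b)
    {
        constexpr uint64_t highBit = 0x8000000000000000ull; // analogous to kHighBit
        return (int64_t) (a ^ highBit) > (int64_t) (b ^ highBit);
    }
    // Example: a = 0xffffffffffffffff, b = 1 gives true, even though as signed
    // values a would be -1 and a raw signed compare would return false.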