
DSP: Added SIMDRegister::copyToRawArray and SIMDRegister::fromRawArray to easily convert between raw arrays and SIMD registers

hogliux committed 7 years ago (commit eb8400b366, tag 2021-05-28)
6 changed files with 326 additions and 56 deletions:

1. modules/juce_dsp/containers/juce_SIMDRegister.h (+42 -1)
2. modules/juce_dsp/containers/juce_SIMDRegister_test.cpp (+14 -4)
3. modules/juce_dsp/native/juce_avx_SIMDNativeOps.h (+136 -39)
4. modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h (+19 -0)
5. modules/juce_dsp/native/juce_neon_SIMDNativeOps.h (+32 -12)
6. modules/juce_dsp/native/juce_sse_SIMDNativeOps.h (+83 -0)
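Note: a minimal usage sketch of the two new methods (not part of the commit; the float element type, the gain value, and the include path are illustrative, and data must satisfy isSIMDAligned):

#include <juce_dsp/juce_dsp.h>

// Round-trips a block of floats through a SIMD register.
// 'data' must be SIMD-aligned and hold at least SIMDNumElements values.
void applyGain (float* data) noexcept
{
    using Reg = juce::dsp::SIMDRegister<float>;

    auto r = Reg::fromRawArray (data);   // aligned load from the raw array
    r *= Reg::expand (0.5f);             // scale every lane at once
    r.copyToRawArray (data);             // aligned store back to memory
}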

modules/juce_dsp/containers/juce_SIMDRegister.h (+42 -1)

@@ -115,6 +115,20 @@ struct SIMDRegister
__m128 for single-precision floating point on SSE architectures). */
inline static SIMDRegister JUCE_VECTOR_CALLTYPE fromNative (vSIMDType a) noexcept { return {a}; }
/** Creates a new SIMDRegister from the first SIMDNumElements elements of a scalar array. */
inline static SIMDRegister JUCE_VECTOR_CALLTYPE fromRawArray (const ElementType* a) noexcept
{
jassert (isSIMDAligned (a));
return {CmplxOps::load (a)};
}
/** Copies the elements of the SIMDRegister to a scalar array in memory. */
inline void JUCE_VECTOR_CALLTYPE copyToRawArray (ElementType* a) const noexcept
{
jassert (isSIMDAligned (a));
CmplxOps::store (value, a);
}
//==============================================================================
/** Returns the idx-th element of the receiver. Note that this does not check if idx
is larger than the native register size. */
@@ -269,7 +283,7 @@ struct SIMDRegister
//==============================================================================
/** Checks if the given pointer is sufficiently aligned for using SIMD operations. */
- static inline bool isSIMDAligned (ElementType* ptr) noexcept
+ static inline bool isSIMDAligned (const ElementType* ptr) noexcept
{
uintptr_t bitmask = SIMDRegisterSize - 1;
return (reinterpret_cast<uintptr_t> (ptr) & bitmask) == 0;
@@ -285,6 +299,13 @@ struct SIMDRegister
return snapPointerToAlignment (ptr, SIMDRegisterSize);
}
#ifndef DOXYGEN
static inline const ElementType* getNextSIMDAlignedPtr (const ElementType* ptr) noexcept
{
return snapPointerToAlignment (ptr, SIMDRegisterSize);
}
#endif
private:
static inline vMaskType JUCE_VECTOR_CALLTYPE toMaskType (vSIMDType a) noexcept
{
@@ -333,6 +354,16 @@ struct CmplxSIMDOps
{
typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;
static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const Scalar* a) noexcept
{
return SIMDNativeOps<Scalar>::load (a);
}
static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, Scalar* dest) noexcept
{
SIMDNativeOps<Scalar>::store (value, dest);
}
static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (Scalar s) noexcept
{
return SIMDNativeOps<Scalar>::expand (s);
@@ -360,6 +391,16 @@ struct CmplxSIMDOps<std::complex<Scalar>>
{
typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;
static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const std::complex<Scalar>* a) noexcept
{
return SIMDNativeOps<Scalar>::load (reinterpret_cast<const Scalar*> (a));
}
static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, std::complex<Scalar>* dest) noexcept
{
SIMDNativeOps<Scalar>::store (value, reinterpret_cast<Scalar*> (dest));
}
static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (std::complex<Scalar> s) noexcept
{
const int n = sizeof (vSIMDType) / sizeof (Scalar);
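The std::complex specialisation above is valid because std::complex guarantees array-oriented storage: an array of std::complex<Scalar> may be reinterpreted as a flat array of interleaved real/imaginary scalars, which is the layout these load/store overloads rely on. A self-contained illustration, not part of the commit:

#include <cassert>
#include <complex>

int main()
{
    std::complex<float> c[2] = { { 1.0f, 2.0f }, { 3.0f, 4.0f } };

    // Reinterpreting as float* yields { re0, im0, re1, im1 } -- the flat
    // layout that CmplxSIMDOps<std::complex<Scalar>>::load/store assume.
    const auto* f = reinterpret_cast<const float*> (c);
    assert (f[0] == 1.0f && f[1] == 2.0f && f[2] == 3.0f && f[3] == 4.0f);
}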


modules/juce_dsp/containers/juce_SIMDRegister_test.cpp (+14 -4)

@@ -95,7 +95,7 @@ namespace SIMDRegister_test_internal
class SIMDRegisterUnitTests : public UnitTest
{
public:
- SIMDRegisterUnitTests() : UnitTest ("SIMDRegister UnitTests") {}
+ SIMDRegisterUnitTests() : UnitTest ("SIMDRegister UnitTests", "DSP") {}
//==============================================================================
// Some helper classes
@@ -113,7 +113,10 @@ public:
template <typename type>
static bool vecEqualToArray (const SIMDRegister<type>& vec, const type* array)
{
- const type* ptr = reinterpret_cast<const type*> (&vec);
+ HeapBlock<type> vecElementsStorage (SIMDRegister<type>::SIMDNumElements * 2);
+ auto* ptr = SIMDRegister<type>::getNextSIMDAlignedPtr (vecElementsStorage.getData());
+ vec.copyToRawArray (ptr);
for (size_t i = 0; i < SIMDRegister<type>::SIMDNumElements; ++i)
{
double delta = SIMDRegister_test_internal::difference (ptr[i], array[i]);
@@ -130,8 +133,15 @@ public:
template <typename type>
static void copy (SIMDRegister<type>& vec, const type* ptr)
{
- for (size_t i = 0; i < SIMDRegister<type>::SIMDNumElements; ++i)
-     vec[i] = ptr[i];
+ if (SIMDRegister<type>::isSIMDAligned (ptr))
+ {
+     vec = SIMDRegister<type>::fromRawArray (ptr);
+ }
+ else
+ {
+     for (size_t i = 0; i < SIMDRegister<type>::SIMDNumElements; ++i)
+         vec[i] = ptr[i];
+ }
}
//==============================================================================
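The updated vecEqualToArray helper shows the portable way to obtain SIMD-aligned scratch space from a plain heap allocation: over-allocate by one register width and snap the pointer forward with getNextSIMDAlignedPtr. A standalone sketch of the same pattern (the function name is illustrative, not part of the commit):

template <typename T>
juce::dsp::SIMDRegister<T> loadFromPossiblyUnaligned (const T* src)
{
    using Reg = juce::dsp::SIMDRegister<T>;

    if (Reg::isSIMDAligned (src))
        return Reg::fromRawArray (src);   // fast path: direct aligned load

    // Slow path: allocating twice the element count guarantees that an
    // aligned run of SIMDNumElements values fits inside the block.
    juce::HeapBlock<T> storage (Reg::SIMDNumElements * 2);
    T* aligned = Reg::getNextSIMDAlignedPtr (storage.getData());

    for (size_t i = 0; i < Reg::SIMDNumElements; ++i)
        aligned[i] = src[i];

    return Reg::fromRawArray (aligned);
}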


modules/juce_dsp/native/juce_avx_SIMDNativeOps.h (+136 -39)

@@ -66,6 +66,8 @@ struct SIMDNativeOps<float>
static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); }
static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); }
static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); }
static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); }
static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); }
@@ -120,28 +122,30 @@ struct SIMDNativeOps<double>
DECLARE_AVX_SIMD_CONST (double, kOne);
//==============================================================================
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
- static forcedinline __m256d expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
- static forcedinline __m256d bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
- static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); }
+ static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
+ static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }
//==============================================================================
static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
@@ -170,24 +174,38 @@ struct SIMDNativeOps<int8_t>
//==============================================================================
DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
- static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
+ static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
{
const auto* b = reinterpret_cast<const char*> (a);
return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept
{
SIMDFallbackOps<int8_t, __m256i>::store (value, dest);
}
//==============================================================================
static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
{
__m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
@@ -247,6 +265,20 @@ struct SIMDNativeOps<uint8_t>
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
{
const auto* b = reinterpret_cast<const char*> (a);
return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept
{
SIMDFallbackOps<uint8_t, __m256i>::store (value, dest);
}
static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
{
__m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
@@ -306,6 +338,17 @@ struct SIMDNativeOps<int16_t>
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
{
return _mm256_set_epi16 (a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8],
a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept
{
SIMDFallbackOps<int16_t, __m256i>::store (value, dest);
}
static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
{
__m256i tmp = _mm256_hadd_epi16 (a, a);
@@ -349,6 +392,18 @@ struct SIMDNativeOps<uint16_t>
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
{
const auto* b = reinterpret_cast<const int16_t*> (a);
return _mm256_set_epi16 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept
{
SIMDFallbackOps<uint16_t, __m256i>::store (value, dest);
}
static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
{
__m256i tmp = _mm256_hadd_epi16 (a, a);
@@ -390,6 +445,16 @@ struct SIMDNativeOps<int32_t>
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept
{
return _mm256_set_epi32 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept
{
SIMDFallbackOps<int32_t, __m256i>::store (value, dest);
}
static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
{
__m256i tmp = _mm256_hadd_epi32 (a, a);
@@ -432,6 +497,17 @@ struct SIMDNativeOps<uint32_t>
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
{
const auto* b = reinterpret_cast<const int32_t*> (a);
return _mm256_set_epi32 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept
{
SIMDFallbackOps<uint32_t, __m256i>::store (value, dest);
}
static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
{
__m256i tmp = _mm256_hadd_epi32 (a, a);
@@ -469,6 +545,16 @@ struct SIMDNativeOps<int64_t>
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept
{
return _mm256_set_epi64x (a[3], a[2], a[1], a[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept
{
SIMDFallbackOps<int64_t, __m256i>::store (value, dest);
}
static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
{
#ifdef _MSC_VER
@@ -530,6 +616,17 @@ struct SIMDNativeOps<uint64_t>
static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
{
const auto* b = reinterpret_cast<const int64_t*> (a);
return _mm256_set_epi64x (b[3], b[2], b[1], b[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept
{
SIMDFallbackOps<uint64_t, __m256i>::store (value, dest);
}
static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
{
#ifdef _MSC_VER


modules/juce_dsp/native/juce_fallback_SIMDNativeOps.h (+19 -0)

@@ -200,6 +200,25 @@ struct SIMDFallbackOps
return retval;
}
static forcedinline vSIMDType load (const ScalarType* a) noexcept
{
vSIMDType retval;
auto* dst = reinterpret_cast<ScalarType*> (&retval);
for (size_t i = 0; i < n; ++i)
dst [i] = a[i];
return retval;
}
static forcedinline void store (vSIMDType value, ScalarType* dest) noexcept
{
const auto* src = reinterpret_cast<const ScalarType*> (&value);
for (size_t i = 0; i < n; ++i)
dest[i] = src[i];
}
template <unsigned int shuffle_idx>
static forcedinline vSIMDType shuffle (vSIMDType a) noexcept
{


modules/juce_dsp/native/juce_neon_SIMDNativeOps.h (+32 -12)

@@ -66,15 +66,17 @@ struct SIMDNativeOps<float>
DECLARE_NEON_SIMD_CONST (float, kOne);
//==============================================================================
- static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); }
- static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); }
- static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); }
- static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); }
- static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
- static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
- static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
- static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
- static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
+ static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); }
+ static forcedinline vSIMDType load (const float* a) noexcept { return vld1q_f32 (a); }
+ static forcedinline void store (vSIMDType value, float* a) noexcept { vst1q_f32 (a, value); }
+ static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); }
+ static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); }
+ static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); }
+ static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
+ static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
+ static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
+ static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
+ static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_f32 (a, b); }
static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_f32 (a, b); }
static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_f32 (a, b); }
@@ -109,6 +111,8 @@ struct SIMDNativeOps<double>
typedef SIMDFallbackOps<double, vSIMDType> fb;
static forcedinline vSIMDType expand (double s) noexcept { return fb::expand (s); }
static forcedinline vSIMDType load (const double* a) noexcept { return fb::load (a); }
static forcedinline void store (vSIMDType value, double* a) noexcept { fb::store (value, a); }
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return fb::add (a, b); }
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return fb::sub (a, b); }
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
@@ -143,6 +147,8 @@ struct SIMDNativeOps<int8_t>
//==============================================================================
static forcedinline vSIMDType expand (int8_t s) noexcept { return vdupq_n_s8 (s); }
static forcedinline vSIMDType load (const int8_t* a) noexcept { return vld1q_s8 (a); }
static forcedinline void store (vSIMDType value, int8_t* a) noexcept { vst1q_s8 (a, value); }
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s8 (a, b); }
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s8 (a, b); }
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s8 (a, b); }
@@ -174,7 +180,9 @@ struct SIMDNativeOps<uint8_t>
DECLARE_NEON_SIMD_CONST (uint8_t, kAllBitsSet);
//==============================================================================
- static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); }
+ static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); }
+ static forcedinline vSIMDType load (const uint8_t* a) noexcept { return vld1q_u8 (a); }
+ static forcedinline void store (vSIMDType value, uint8_t* a) noexcept { vst1q_u8 (a, value); }
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u8 (a, b); }
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u8 (a, b); }
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u8 (a, b); }
@@ -207,6 +215,8 @@ struct SIMDNativeOps<int16_t>
//==============================================================================
static forcedinline vSIMDType expand (int16_t s) noexcept { return vdupq_n_s16 (s); }
static forcedinline vSIMDType load (const int16_t* a) noexcept { return vld1q_s16 (a); }
static forcedinline void store (vSIMDType value, int16_t* a) noexcept { vst1q_s16 (a, value); }
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s16 (a, b); }
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s16 (a, b); }
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s16 (a, b); }
@@ -239,7 +249,9 @@ struct SIMDNativeOps<uint16_t>
DECLARE_NEON_SIMD_CONST (uint16_t, kAllBitsSet);
//==============================================================================
- static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); }
+ static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); }
+ static forcedinline vSIMDType load (const uint16_t* a) noexcept { return vld1q_u16 (a); }
+ static forcedinline void store (vSIMDType value, uint16_t* a) noexcept { vst1q_u16 (a, value); }
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u16 (a, b); }
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u16 (a, b); }
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u16 (a, b); }
@@ -272,6 +284,8 @@ struct SIMDNativeOps<int32_t>
//==============================================================================
static forcedinline vSIMDType expand (int32_t s) noexcept { return vdupq_n_s32 (s); }
static forcedinline vSIMDType load (const int32_t* a) noexcept { return vld1q_s32 (a); }
static forcedinline void store (vSIMDType value, int32_t* a) noexcept { vst1q_s32 (a, value); }
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s32 (a, b); }
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s32 (a, b); }
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s32 (a, b); }
@@ -304,7 +318,9 @@ struct SIMDNativeOps<uint32_t>
DECLARE_NEON_SIMD_CONST (uint32_t, kAllBitsSet);
//==============================================================================
- static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); }
+ static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); }
+ static forcedinline vSIMDType load (const uint32_t* a) noexcept { return vld1q_u32 (a); }
+ static forcedinline void store (vSIMDType value, uint32_t* a) noexcept { vst1q_u32 (a, value); }
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u32 (a, b); }
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u32 (a, b); }
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u32 (a, b); }
@@ -337,6 +353,8 @@ struct SIMDNativeOps<int64_t>
//==============================================================================
static forcedinline vSIMDType expand (int64_t s) noexcept { return vdupq_n_s64 (s); }
static forcedinline vSIMDType load (const int64_t* a) noexcept { return vld1q_s64 (a); }
static forcedinline void store (vSIMDType value, int64_t* a) noexcept { vst1q_s64 (a, value); }
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s64 (a, b); }
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s64 (a, b); }
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
@@ -370,6 +388,8 @@ struct SIMDNativeOps<uint64_t>
//==============================================================================
static forcedinline vSIMDType expand (uint64_t s) noexcept { return vdupq_n_u64 (s); }
static forcedinline vSIMDType load (const uint64_t* a) noexcept { return vld1q_u64 (a); }
static forcedinline void store (vSIMDType value, uint64_t* a) noexcept { vst1q_u64 (a, value); }
static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u64 (a, b); }
static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u64 (a, b); }
static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }


modules/juce_dsp/native/juce_sse_SIMDNativeOps.h (+83 -0)

@@ -65,6 +65,8 @@ struct SIMDNativeOps<float>
//==============================================================================
static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); }
static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
@@ -124,6 +126,8 @@ struct SIMDNativeOps<double>
static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
@@ -199,6 +203,18 @@ struct SIMDNativeOps<int8_t>
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
{
const auto* b = reinterpret_cast<const char*> (a);
return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int8_t* dest) noexcept
{
SIMDFallbackOps<int8_t, __m128i>::store (value, dest);
}
static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
@@ -268,6 +284,18 @@ struct SIMDNativeOps<uint8_t>
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
{
const auto* b = reinterpret_cast<const char*> (a);
return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint8_t* dest) noexcept
{
SIMDFallbackOps<uint8_t, __m128i>::store (value, dest);
}
static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
@@ -337,6 +365,16 @@ struct SIMDNativeOps<int16_t>
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
{
return _mm_set_epi16 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int16_t* dest) noexcept
{
SIMDFallbackOps<int16_t, __m128i>::store (value, dest);
}
static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
@@ -395,6 +433,17 @@ struct SIMDNativeOps<uint16_t>
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
{
const auto* b = reinterpret_cast<const int16_t*> (a);
return _mm_set_epi16 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint16_t* dest) noexcept
{
SIMDFallbackOps<uint16_t, __m128i>::store (value, dest);
}
static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
@@ -428,6 +477,7 @@ struct SIMDNativeOps<int32_t>
//==============================================================================
static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_set_epi32 (a[3], a[2], a[1], a[0]); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
@@ -442,6 +492,11 @@ struct SIMDNativeOps<int32_t>
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept
{
SIMDFallbackOps<int32_t, __m128i>::store (value, dest);
}
static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
@@ -522,6 +577,17 @@ struct SIMDNativeOps<uint32_t>
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
{
const auto* b = reinterpret_cast<const int32_t*> (a);
return _mm_set_epi32 (b[3], b[2], b[1], b[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint32_t* dest) noexcept
{
SIMDFallbackOps<uint32_t, __m128i>::store (value, dest);
}
static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
#ifdef __SSSE3__
@@ -591,6 +657,7 @@ struct SIMDNativeOps<int64_t>
return retval;
}
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_set_epi64x (a[1], a[0]); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
@@ -606,6 +673,11 @@ struct SIMDNativeOps<int64_t>
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept
{
SIMDFallbackOps<int64_t, __m128i>::store (value, dest);
}
static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
@@ -692,6 +764,17 @@ struct SIMDNativeOps<uint64_t>
static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
//==============================================================================
static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
{
const auto* b = reinterpret_cast<const int64_t*> (a);
return _mm_set_epi64x (b[1], b[0]);
}
static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint64_t* dest) noexcept
{
SIMDFallbackOps<uint64_t, __m128i>::store (value, dest);
}
static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
{
const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);

