@@ -115,6 +115,20 @@ struct SIMDRegister
         __m128 for single-precision floating point on SSE architectures). */
     inline static SIMDRegister JUCE_VECTOR_CALLTYPE fromNative (vSIMDType a) noexcept { return {a}; }

+    /** Creates a new SIMDRegister from the first SIMDNumElements of a scalar array. */
+    inline static SIMDRegister JUCE_VECTOR_CALLTYPE fromRawArray (const ElementType* a) noexcept
+    {
+        jassert (isSIMDAligned (a));
+        return {CmplxOps::load (a)};
+    }
+
+    /** Copies the elements of the SIMDRegister to a scalar array in memory. */
+    inline void JUCE_VECTOR_CALLTYPE copyToRawArray (ElementType* a) const noexcept
+    {
+        jassert (isSIMDAligned (a));
+        CmplxOps::store (value, a);
+    }
+
     //==============================================================================
     /** Returns the idx-th element of the receiver. Note that this does not check if idx
         is larger than the native register size. */
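The new pair gives callers a bulk load/store path that asserts alignment instead of silently falling back to element-wise access. A minimal usage sketch (assumes JUCE's juce_dsp module; the function name and the 0.5f offset are hypothetical):

    #include <juce_dsp/juce_dsp.h>

    using juce::dsp::SIMDRegister;

    // 'buffer' must point at SIMDRegister<float>::SIMDNumElements floats and
    // satisfy isSIMDAligned(), otherwise the jasserts above will fire.
    void addOffset (float* buffer)
    {
        auto v = SIMDRegister<float>::fromRawArray (buffer);
        v = v + SIMDRegister<float>::expand (0.5f);
        v.copyToRawArray (buffer);
    }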
@@ -269,7 +283,7 @@ struct SIMDRegister
     //==============================================================================
     /** Checks if the given pointer is sufficiently aligned for using SIMD operations. */
-    static inline bool isSIMDAligned (ElementType* ptr) noexcept
+    static inline bool isSIMDAligned (const ElementType* ptr) noexcept
     {
         uintptr_t bitmask = SIMDRegisterSize - 1;
         return (reinterpret_cast<uintptr_t> (ptr) & bitmask) == 0;
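The test works because SIMDRegisterSize is a power of two, so an address is aligned exactly when its low bits are zero. A standalone illustration of the same bitmask trick (not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // An address is 16-byte aligned iff its low four bits are all zero.
    static bool isAligned16 (const void* p)
    {
        return (reinterpret_cast<std::uintptr_t> (p) & (16 - 1)) == 0;
    }

    int main()
    {
        alignas (16) float buf[8];
        std::printf ("%d %d\n", isAligned16 (buf), isAligned16 (buf + 1)); // prints "1 0"
    }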
@@ -285,6 +299,13 @@ struct SIMDRegister
         return snapPointerToAlignment (ptr, SIMDRegisterSize);
     }

+   #ifndef DOXYGEN
+    static inline const ElementType* getNextSIMDAlignedPtr (const ElementType* ptr) noexcept
+    {
+        return snapPointerToAlignment (ptr, SIMDRegisterSize);
+    }
+   #endif
+
 private:
     static inline vMaskType JUCE_VECTOR_CALLTYPE toMaskType (vSIMDType a) noexcept
     {
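getNextSIMDAlignedPtr supports the usual scalar-head / vector-body / scalar-tail decomposition for buffers with no alignment guarantee. A sketch of that pattern under the same assumptions as the earlier example (the function name is hypothetical):

    #include <cstddef>
    #include <juce_dsp/juce_dsp.h>

    void halveInPlace (float* data, size_t numSamples)
    {
        using Reg = juce::dsp::SIMDRegister<float>;
        auto* alignedStart = Reg::getNextSIMDAlignedPtr (data);

        // scalar head: samples before the first SIMD boundary
        for (; data < alignedStart && numSamples > 0; ++data, --numSamples)
            *data *= 0.5f;

        // vector body: one full register per iteration
        for (; numSamples >= Reg::SIMDNumElements;
               data += Reg::SIMDNumElements, numSamples -= Reg::SIMDNumElements)
            (Reg::fromRawArray (data) * Reg::expand (0.5f)).copyToRawArray (data);

        // scalar tail: whatever is left over
        for (; numSamples > 0; ++data, --numSamples)
            *data *= 0.5f;
    }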
@@ -333,6 +354,16 @@ struct CmplxSIMDOps
 {
     typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;

+    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const Scalar* a) noexcept
+    {
+        return SIMDNativeOps<Scalar>::load (a);
+    }
+
+    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, Scalar* dest) noexcept
+    {
+        SIMDNativeOps<Scalar>::store (value, dest);
+    }
+
     static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (Scalar s) noexcept
     {
         return SIMDNativeOps<Scalar>::expand (s);
@@ -360,6 +391,16 @@ struct CmplxSIMDOps<std::complex<Scalar>>
 {
     typedef typename SIMDNativeOps<Scalar>::vSIMDType vSIMDType;

+    static inline vSIMDType JUCE_VECTOR_CALLTYPE load (const std::complex<Scalar>* a) noexcept
+    {
+        return SIMDNativeOps<Scalar>::load (reinterpret_cast<const Scalar*> (a));
+    }
+
+    static inline void JUCE_VECTOR_CALLTYPE store (vSIMDType value, std::complex<Scalar>* dest) noexcept
+    {
+        SIMDNativeOps<Scalar>::store (value, reinterpret_cast<Scalar*> (dest));
+    }
+
     static inline vSIMDType JUCE_VECTOR_CALLTYPE expand (std::complex<Scalar> s) noexcept
     {
         const int n = sizeof (vSIMDType) / sizeof (Scalar);
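The complex specialisation simply reinterprets std::complex<Scalar> as interleaved scalars; that is legal because the standard guarantees std::complex is layout-compatible with an array of two scalars (array-oriented access, C++11 [complex.numbers]). A standalone check of the layout assumption the casts above rely on:

    #include <cassert>
    #include <complex>

    int main()
    {
        std::complex<float> c[2] = { { 1.0f, 2.0f }, { 3.0f, 4.0f } };

        // re0, im0, re1, im1 — the order load()/store() rely on
        auto* f = reinterpret_cast<const float*> (c);
        assert (f[0] == 1.0f && f[1] == 2.0f && f[2] == 3.0f && f[3] == 4.0f);
    }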
@@ -95,7 +95,7 @@ namespace SIMDRegister_test_internal
 class SIMDRegisterUnitTests : public UnitTest
 {
 public:
-    SIMDRegisterUnitTests() : UnitTest ("SIMDRegister UnitTests") {}
+    SIMDRegisterUnitTests() : UnitTest ("SIMDRegister UnitTests", "DSP") {}

     //==============================================================================
     // Some helper classes
@@ -113,7 +113,10 @@ public:
     template <typename type>
     static bool vecEqualToArray (const SIMDRegister<type>& vec, const type* array)
     {
-        const type* ptr = reinterpret_cast<const type*> (&vec);
+        HeapBlock<type> vecElementsStorage (SIMDRegister<type>::SIMDNumElements * 2);
+        auto* ptr = SIMDRegister<type>::getNextSIMDAlignedPtr (vecElementsStorage.getData());
+        vec.copyToRawArray (ptr);
+
         for (size_t i = 0; i < SIMDRegister<type>::SIMDNumElements; ++i)
         {
             double delta = SIMDRegister_test_internal::difference (ptr[i], array[i]);
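Allocating twice SIMDNumElements is what makes the snapped pointer safe: HeapBlock promises nothing about SIMD alignment, but snapping forward can skip at most SIMDRegisterSize - 1 bytes, so an aligned run of SIMDNumElements elements always fits inside a block of twice that size. The same arithmetic in isolation (hypothetical helper; alignment must be a power of two):

    #include <cstdint>

    // Returns the first 'alignment'-byte boundary at or after 'base'. With a
    // buffer of 2 * alignment bytes, this boundary plus one full register
    // never runs past the end of the buffer.
    const float* snapForward (const float* base, std::uintptr_t alignment)
    {
        auto addr    = reinterpret_cast<std::uintptr_t> (base);
        auto snapped = (addr + alignment - 1) & ~(alignment - 1);
        return reinterpret_cast<const float*> (snapped);
    }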
@@ -130,8 +133,15 @@ public:
     template <typename type>
     static void copy (SIMDRegister<type>& vec, const type* ptr)
     {
-        for (size_t i = 0; i < SIMDRegister<type>::SIMDNumElements; ++i)
-            vec[i] = ptr[i];
+        if (SIMDRegister<type>::isSIMDAligned (ptr))
+        {
+            vec = SIMDRegister<type>::fromRawArray (ptr);
+        }
+        else
+        {
+            for (size_t i = 0; i < SIMDRegister<type>::SIMDNumElements; ++i)
+                vec[i] = ptr[i];
+        }
     }

     //==============================================================================
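With this change the test helper exercises the new aligned fast path whenever it can, keeping the element-wise loop as the unaligned fallback. A usage sketch (treating copy() as a free function for illustration):

    alignas (32) float data[9] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
    juce::dsp::SIMDRegister<float> a, b;

    copy (a, data);       // SIMD-aligned source -> single fromRawArray() load
    copy (b, data + 1);   // unaligned source    -> element-by-element loop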
@@ -66,6 +66,8 @@ struct SIMDNativeOps<float>
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); }
+    static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); }
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); }
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); }
     static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); }
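_mm256_load_ps and _mm256_store_ps are the aligned AVX forms; handing them a pointer that is not 32-byte aligned faults at runtime, which is exactly what the jasserts in fromRawArray/copyToRawArray guard against. A minimal standalone illustration (requires an AVX-enabled build):

    #include <immintrin.h>

    // 'p' must be 32-byte aligned, e.g. declared alignas (32).
    void scaleEight (float* p)
    {
        __m256 v = _mm256_load_ps (p);
        v = _mm256_mul_ps (v, _mm256_set1_ps (0.5f));
        _mm256_store_ps (p, v);
    }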
@@ -120,28 +122,30 @@ struct SIMDNativeOps<double>
     DECLARE_AVX_SIMD_CONST (double, kOne);

     //==============================================================================
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
-    static forcedinline __m256d expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
-    static forcedinline __m256d bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
-    static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
+    static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }

     //==============================================================================
     static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
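The cmplxmul body falls outside this hunk; what follows is a hedged sketch of the standard shuffle-based complex multiply that the helpers above (dupeven, dupodd, swapevenodd) are built for, using (a + bi)(c + di) = (ac - bd) + (ad + bc)i over interleaved (re, im) pairs:

    // Sketch only — not necessarily the exact body of cmplxmul.
    static __m256d cmplxmulSketch (__m256d a, __m256d b) noexcept
    {
        __m256d t1 = _mm256_mul_pd (a, dupeven (b));               // (a*c, b*c)
        __m256d t2 = _mm256_mul_pd (swapevenodd (a), dupodd (b));  // (b*d, a*d)
        return _mm256_addsub_pd (t1, t2);                          // (ac - bd, bc + ad)
    }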
@@ -170,24 +174,38 @@ struct SIMDNativeOps<int8_t>
     //==============================================================================
     DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);

-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
-    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const char*> (a);
+        return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
+                                b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
+                                b[15], b[14], b[13], b[12], b[11], b[10], b[9],  b[8],
+                                b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1],  b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept
+    {
+        SIMDFallbackOps<int8_t, __m256i>::store (value, dest);
+    }
+
+    //==============================================================================
     static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
@@ -247,6 +265,20 @@ struct SIMDNativeOps<uint8_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const char*> (a);
+        return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
+                                b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
+                                b[15], b[14], b[13], b[12], b[11], b[10], b[9],  b[8],
+                                b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1],  b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint8_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
@@ -306,6 +338,17 @@ struct SIMDNativeOps<int16_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
+    {
+        return _mm256_set_epi16 (a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8],
+                                 a[7],  a[6],  a[5],  a[4],  a[3],  a[2],  a[1], a[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept
+    {
+        SIMDFallbackOps<int16_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i tmp = _mm256_hadd_epi16 (a, a);
@@ -349,6 +392,18 @@ struct SIMDNativeOps<uint16_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int16_t*> (a);
+        return _mm256_set_epi16 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
+                                 b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint16_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i tmp = _mm256_hadd_epi16 (a, a);
@@ -390,6 +445,16 @@ struct SIMDNativeOps<int32_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept
+    {
+        return _mm256_set_epi32 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept
+    {
+        SIMDFallbackOps<int32_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i tmp = _mm256_hadd_epi32 (a, a);
@@ -432,6 +497,17 @@ struct SIMDNativeOps<uint32_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int32_t*> (a);
+        return _mm256_set_epi32 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint32_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
     {
         __m256i tmp = _mm256_hadd_epi32 (a, a);
@@ -469,6 +545,16 @@ struct SIMDNativeOps<int64_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept
+    {
+        return _mm256_set_epi64x (a[3], a[2], a[1], a[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept
+    {
+        SIMDFallbackOps<int64_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
     {
        #ifdef _MSC_VER
@@ -530,6 +616,17 @@ struct SIMDNativeOps<uint64_t>
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int64_t*> (a);
+        return _mm256_set_epi64x (b[3], b[2], b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint64_t, __m256i>::store (value, dest);
+    }
+
     static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
     {
        #ifdef _MSC_VER
@@ -200,6 +200,25 @@ struct SIMDFallbackOps
         return retval;
     }

+    static forcedinline vSIMDType load (const ScalarType* a) noexcept
+    {
+        vSIMDType retval;
+        auto* dst = reinterpret_cast<ScalarType*> (&retval);
+
+        for (size_t i = 0; i < n; ++i)
+            dst[i] = a[i];
+
+        return retval;
+    }
+
+    static forcedinline void store (vSIMDType value, ScalarType* dest) noexcept
+    {
+        const auto* src = reinterpret_cast<const ScalarType*> (&value);
+
+        for (size_t i = 0; i < n; ++i)
+            dest[i] = src[i];
+    }
+
     template <unsigned int shuffle_idx>
     static forcedinline vSIMDType shuffle (vSIMDType a) noexcept
     {
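Because the fallback load/store copy element by element through the vector's in-memory representation, they impose no alignment requirement, which is why the x86 integer store() overloads in this patch can safely delegate to SIMDFallbackOps for arbitrary destinations. A standalone round-trip illustration with a hypothetical 4-float vector type:

    #include <cstddef>

    struct Vec4 { float v[4]; };   // stand-in for an opaque vSIMDType

    void roundTrip (const float* src, float* dst)
    {
        Vec4 r;
        auto* scalars = reinterpret_cast<float*> (&r);

        for (size_t i = 0; i < 4; ++i) scalars[i] = src[i];   // what load() does
        for (size_t i = 0; i < 4; ++i) dst[i] = scalars[i];   // what store() does
    }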
@@ -66,15 +66,17 @@ struct SIMDNativeOps<float>
     DECLARE_NEON_SIMD_CONST (float, kOne);

     //==============================================================================
-    static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); }
-    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); }
-    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); }
-    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); }
-    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
-    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
-    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
+    static forcedinline vSIMDType expand (float s) noexcept { return vdupq_n_f32 (s); }
+    static forcedinline vSIMDType load (const float* a) noexcept { return vld1q_f32 (a); }
+    static forcedinline void store (vSIMDType value, float* a) noexcept { vst1q_f32 (a, value); }
+    static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_f32 (a, b); }
+    static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_f32 (a, b); }
+    static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_f32 (a, b); }
+    static forcedinline vSIMDType bit_and (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vandq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_or (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vorrq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_xor (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) veorq_u32 ((vMaskType) a, (vMaskType) b); }
+    static forcedinline vSIMDType bit_notand (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vbicq_u32 ((vMaskType) b, (vMaskType) a); }
+    static forcedinline vSIMDType bit_not (vSIMDType a) noexcept { return bit_notand (a, vld1q_f32 ((float*) kAllBitsSet)); }
     static forcedinline vSIMDType min (vSIMDType a, vSIMDType b) noexcept { return vminq_f32 (a, b); }
     static forcedinline vSIMDType max (vSIMDType a, vSIMDType b) noexcept { return vmaxq_f32 (a, b); }
     static forcedinline vSIMDType equal (vSIMDType a, vSIMDType b) noexcept { return (vSIMDType) vceqq_f32 (a, b); }
@@ -109,6 +111,8 @@ struct SIMDNativeOps<double>
     typedef SIMDFallbackOps<double, vSIMDType> fb;

     static forcedinline vSIMDType expand (double s) noexcept { return fb::expand (s); }
+    static forcedinline vSIMDType load (const double* a) noexcept { return fb::load (a); }
+    static forcedinline void store (vSIMDType value, double* a) noexcept { fb::store (value, a); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return fb::add (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return fb::sub (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
@@ -143,6 +147,8 @@ struct SIMDNativeOps<int8_t>
     //==============================================================================
     static forcedinline vSIMDType expand (int8_t s) noexcept { return vdupq_n_s8 (s); }
+    static forcedinline vSIMDType load (const int8_t* a) noexcept { return vld1q_s8 (a); }
+    static forcedinline void store (vSIMDType value, int8_t* a) noexcept { vst1q_s8 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s8 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s8 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s8 (a, b); }
@@ -174,7 +180,9 @@ struct SIMDNativeOps<uint8_t>
     DECLARE_NEON_SIMD_CONST (uint8_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); }
+    static forcedinline vSIMDType expand (uint8_t s) noexcept { return vdupq_n_u8 (s); }
+    static forcedinline vSIMDType load (const uint8_t* a) noexcept { return vld1q_u8 (a); }
+    static forcedinline void store (vSIMDType value, uint8_t* a) noexcept { vst1q_u8 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u8 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u8 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u8 (a, b); }
@@ -207,6 +215,8 @@ struct SIMDNativeOps<int16_t>
     //==============================================================================
     static forcedinline vSIMDType expand (int16_t s) noexcept { return vdupq_n_s16 (s); }
+    static forcedinline vSIMDType load (const int16_t* a) noexcept { return vld1q_s16 (a); }
+    static forcedinline void store (vSIMDType value, int16_t* a) noexcept { vst1q_s16 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s16 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s16 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s16 (a, b); }
@@ -239,7 +249,9 @@ struct SIMDNativeOps<uint16_t>
     DECLARE_NEON_SIMD_CONST (uint16_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); }
+    static forcedinline vSIMDType expand (uint16_t s) noexcept { return vdupq_n_u16 (s); }
+    static forcedinline vSIMDType load (const uint16_t* a) noexcept { return vld1q_u16 (a); }
+    static forcedinline void store (vSIMDType value, uint16_t* a) noexcept { vst1q_u16 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u16 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u16 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u16 (a, b); }
@@ -272,6 +284,8 @@ struct SIMDNativeOps<int32_t>
     //==============================================================================
     static forcedinline vSIMDType expand (int32_t s) noexcept { return vdupq_n_s32 (s); }
+    static forcedinline vSIMDType load (const int32_t* a) noexcept { return vld1q_s32 (a); }
+    static forcedinline void store (vSIMDType value, int32_t* a) noexcept { vst1q_s32 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s32 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s32 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_s32 (a, b); }
@@ -304,7 +318,9 @@ struct SIMDNativeOps<uint32_t>
     DECLARE_NEON_SIMD_CONST (uint32_t, kAllBitsSet);

     //==============================================================================
-    static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); }
+    static forcedinline vSIMDType expand (uint32_t s) noexcept { return vdupq_n_u32 (s); }
+    static forcedinline vSIMDType load (const uint32_t* a) noexcept { return vld1q_u32 (a); }
+    static forcedinline void store (vSIMDType value, uint32_t* a) noexcept { vst1q_u32 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u32 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u32 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return vmulq_u32 (a, b); }
@@ -337,6 +353,8 @@ struct SIMDNativeOps<int64_t>
     //==============================================================================
     static forcedinline vSIMDType expand (int64_t s) noexcept { return vdupq_n_s64 (s); }
+    static forcedinline vSIMDType load (const int64_t* a) noexcept { return vld1q_s64 (a); }
+    static forcedinline void store (vSIMDType value, int64_t* a) noexcept { vst1q_s64 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_s64 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_s64 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
@@ -370,6 +388,8 @@ struct SIMDNativeOps<uint64_t>
     //==============================================================================
     static forcedinline vSIMDType expand (uint64_t s) noexcept { return vdupq_n_u64 (s); }
+    static forcedinline vSIMDType load (const uint64_t* a) noexcept { return vld1q_u64 (a); }
+    static forcedinline void store (vSIMDType value, uint64_t* a) noexcept { vst1q_u64 (a, value); }
     static forcedinline vSIMDType add (vSIMDType a, vSIMDType b) noexcept { return vaddq_u64 (a, b); }
     static forcedinline vSIMDType sub (vSIMDType a, vSIMDType b) noexcept { return vsubq_u64 (a, b); }
     static forcedinline vSIMDType mul (vSIMDType a, vSIMDType b) noexcept { return fb::mul (a, b); }
@@ -65,6 +65,8 @@ struct SIMDNativeOps<float>
     //==============================================================================
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
+    static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); }
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
     static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
@@ -124,6 +126,8 @@ struct SIMDNativeOps<double>
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
+    static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
     static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
@@ -199,6 +203,18 @@ struct SIMDNativeOps<int8_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const char*> (a);
+        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
+                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int8_t* dest) noexcept
+    {
+        SIMDFallbackOps<int8_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -268,6 +284,18 @@ struct SIMDNativeOps<uint8_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const char*> (a);
+        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
+                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint8_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint8_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -337,6 +365,16 @@ struct SIMDNativeOps<int16_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
+    {
+        return _mm_set_epi16 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int16_t* dest) noexcept
+    {
+        SIMDFallbackOps<int16_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -395,6 +433,17 @@ struct SIMDNativeOps<uint16_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int16_t*> (a);
+        return _mm_set_epi16 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint16_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint16_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -428,6 +477,7 @@ struct SIMDNativeOps<int32_t>
     //==============================================================================
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_set_epi32 (a[3], a[2], a[1], a[0]); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
@@ -442,6 +492,11 @@ struct SIMDNativeOps<int32_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept
+    {
+        SIMDFallbackOps<int32_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -522,6 +577,17 @@ struct SIMDNativeOps<uint32_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int32_t*> (a);
+        return _mm_set_epi32 (b[3], b[2], b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint32_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint32_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
        #ifdef __SSSE3__
@@ -591,6 +657,7 @@ struct SIMDNativeOps<int64_t>
         return retval;
     }

+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_set_epi64x (a[1], a[0]); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
@@ -606,6 +673,11 @@ struct SIMDNativeOps<int64_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept
+    {
+        SIMDFallbackOps<int64_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
         const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
@@ -692,6 +764,17 @@ struct SIMDNativeOps<uint64_t>
     static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

     //==============================================================================
+    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
+    {
+        const auto* b = reinterpret_cast<const int64_t*> (a);
+        return _mm_set_epi64x (b[1], b[0]);
+    }
+
+    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint64_t* dest) noexcept
+    {
+        SIMDFallbackOps<uint64_t, __m128i>::store (value, dest);
+    }
+
     static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
     {
         const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);