The JUCE cross-platform C++ framework, with DISTRHO/KXStudio specific changes

/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2017 - ROLI Ltd.

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
   27th April 2017).

   End User License Agreement: www.juce.com/juce-5-licence
   Privacy Policy: www.juce.com/juce-5-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/
namespace juce
{
namespace dsp
{

#ifndef DOXYGEN

#ifdef _MSC_VER
 #define DECLARE_AVX_SIMD_CONST(type, name) \
    static __declspec(align(32)) const type name[32 / sizeof (type)]

 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
    __declspec(align(32)) const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)]
#else
 #define DECLARE_AVX_SIMD_CONST(type, name) \
    static const type name[32 / sizeof (type)] __attribute__((aligned(32)))

 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
    const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)] __attribute__((aligned(32)))
#endif
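
// The DECLARE/DEFINE pair above keeps every vector constant 32-byte aligned, so
// that vconst() below can safely reinterpret the array as a full AVX register.
// A rough sketch of one expansion (GCC/Clang branch, illustrative only):
//
//     DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
//     // becomes:
//     static const int32_t kAllBitsSet[32 / sizeof (int32_t)] __attribute__((aligned(32)));
//
// i.e. an 8-element int32_t array filling exactly one 256-bit lane.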
template <typename type>
struct SIMDNativeOps;

//==============================================================================
/** Single-precision floating point AVX intrinsics. */
template <>
struct SIMDNativeOps<float>
{
    typedef __m256 vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_AVX_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_and (__m256 a, __m256 b) noexcept { return _mm256_and_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_or (__m256 a, __m256 b) noexcept { return _mm256_or_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_xor (__m256 a, __m256 b) noexcept { return _mm256_xor_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_notand (__m256 a, __m256 b) noexcept { return _mm256_andnot_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_not (__m256 a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE min (__m256 a, __m256 b) noexcept { return _mm256_min_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE max (__m256 a, __m256 b) noexcept { return _mm256_max_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE equal (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_EQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE notEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThan (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GT_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GE_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE multiplyAdd (__m256 a, __m256 b, __m256 c) noexcept { return _mm256_fmadd_ps (b, c, a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE swapevenodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }

    static forcedinline __m256 JUCE_VECTOR_CALLTYPE oddevensum (__m256 a) noexcept
    {
        a = _mm256_add_ps (_mm256_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a);
        return add (_mm256_permute2f128_ps (a, a, 1), a);
    }

    //==============================================================================
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE cmplxmul (__m256 a, __m256 b) noexcept
    {
        __m256 rr_ir = mul (a, dupeven (b));
        __m256 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }
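
    // cmplxmul() assumes interleaved complex data, a = [re0, im0, re1, im1, ...].
    // kEvenHighBit (defined in the matching .cpp) presumably holds the sign bit
    // in the even (real) slots only, so the xor negates just the im*im products.
    // Worked through for a single complex pair:
    //
    //     rr_ir = [re_a*re_b, im_a*re_b]
    //     ii_ri = [im_a*im_b, re_a*im_b]
    //     sum   = [re_a*re_b - im_a*im_b, im_a*re_b + re_a*im_b]
    //
    // which matches (re_a + i*im_a) * (re_b + i*im_b).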
    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m256 a) noexcept
    {
        __m256 retval = _mm256_dp_ps (a, vconst (kOne), 0xff);
        __m256 tmp = _mm256_permute2f128_ps (retval, retval, 1);
        retval = _mm256_add_ps (retval, tmp);
        return ((float*) &retval)[0];
    }
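
    // sum() sketch: _mm256_dp_ps with mask 0xff computes the 4-element dot
    // product against kOne (presumably an all-ones vector, defined in the
    // matching .cpp) within each 128-bit lane and broadcasts it across that
    // lane; permute2f128 then swaps the two lanes, so the final add leaves the
    // total of all 8 elements in every slot, and element 0 is returned.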
};

//==============================================================================
/** Double-precision floating point AVX intrinsics. */
template <>
struct SIMDNativeOps<double>
{
    typedef __m256d vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_AVX_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }

    //==============================================================================
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
    {
        __m256d rr_ir = mul (a, dupeven (b));
        __m256d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }

    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m256d a) noexcept
    {
        __m256d retval = _mm256_hadd_pd (a, a);
        __m256d tmp = _mm256_permute2f128_pd (retval, retval, 1);
        retval = _mm256_add_pd (retval, tmp);
        return ((double*) &retval)[0];
    }
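
    // sum() sketch for doubles: hadd_pd yields [d0+d1, d0+d1, d2+d3, d2+d3];
    // swapping the 128-bit halves with permute2f128 and adding leaves the
    // total of all four lanes in element 0.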
};

//==============================================================================
/** Signed 8-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<int8_t>
{
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);

        return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
                                b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
                                b[15], b[14], b[13], b[12], b[11], b[10], b[9],  b[8],
                                b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1],  b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept
    {
        SIMDFallbackOps<int8_t, __m256i>::store (value, dest);
    }

    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
        __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm256_hadd_epi16 (lo, lo);
            hi = _mm256_hadd_epi16 (hi, hi);
        }

        const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
        const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);

        return (int8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]);
    }
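
    // sum() sketch: the bytes are widened to 16-bit lanes first (zero-extension
    // is adequate here even for signed input, because the result is truncated
    // back to int8 at the end, i.e. everything is effectively computed modulo
    // 256). Three hadd_epi16 rounds collapse the 8 lanes per 128-bit half into
    // one, and the four partial sums (low/high unpack of each half) are added
    // at the end.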
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
    {
        // unpack and multiply
        __m256i even = _mm256_mullo_epi16 (a, b);
        __m256i odd  = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));

        return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
                                _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
    }
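
    // mul() sketch: AVX2 has no byte-wise multiply, so bytes are multiplied in
    // pairs as 16-bit lanes. 'even' leaves the products of the even-indexed
    // bytes in the low half of each 16-bit lane; 'odd' shifts the odd-indexed
    // bytes down first so their products land there too. The final shift/or
    // re-interleaves the two sets of low bytes, giving per-byte multiplication
    // modulo 256.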
};

//==============================================================================
/** Unsigned 8-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_AVX_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm256_set1_epi8 ((int8_t) s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);

        return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
                                b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
                                b[15], b[14], b[13], b[12], b[11], b[10], b[9],  b[8],
                                b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1],  b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept
    {
        SIMDFallbackOps<uint8_t, __m256i>::store (value, dest);
    }

    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
        __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm256_hadd_epi16 (lo, lo);
            hi = _mm256_hadd_epi16 (hi, hi);
        }

        const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
        const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);

        return (uint8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]);
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
    {
        // unpack and multiply
        __m256i even = _mm256_mullo_epi16 (a, b);
        __m256i odd  = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));

        return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
                                _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
    }
};

//==============================================================================
/** Signed 16-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm256_set1_epi16 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
    {
        return _mm256_set_epi16 (a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8],
                                 a[7],  a[6],  a[5],  a[4],  a[3],  a[2],  a[1], a[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept
    {
        SIMDFallbackOps<int16_t, __m256i>::store (value, dest);
    }

    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi16 (a, a);
        tmp = _mm256_hadd_epi16 (tmp, tmp);
        tmp = _mm256_hadd_epi16 (tmp, tmp);

        int16_t* ptr = reinterpret_cast<int16_t*> (&tmp);
        return (int16_t) (ptr[0] + ptr[8]);
    }
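
    // sum() sketch for 16-bit lanes: each hadd_epi16 halves the number of
    // distinct partial sums per 128-bit lane (8 -> 4 -> 2 -> 1), so after the
    // three rounds element 0 of each lane holds that lane's total; ptr[0] +
    // ptr[8] then combines the two lanes. The same pattern appears in the
    // wider integer specialisations below, with fewer rounds per lane.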
};

//==============================================================================
/** Unsigned 16-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_AVX_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int16_t*> (a);

        return _mm256_set_epi16 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                                 b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept
    {
        SIMDFallbackOps<uint16_t, __m256i>::store (value, dest);
    }

    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi16 (a, a);
        tmp = _mm256_hadd_epi16 (tmp, tmp);
        tmp = _mm256_hadd_epi16 (tmp, tmp);

        uint16_t* ptr = reinterpret_cast<uint16_t*> (&tmp);
        return (uint16_t) (ptr[0] + ptr[8]);
    }
};

//==============================================================================
/** Signed 32-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm256_set1_epi32 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept
    {
        return _mm256_set_epi32 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept
    {
        SIMDFallbackOps<int32_t, __m256i>::store (value, dest);
    }

    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi32 (a, a);
        tmp = _mm256_hadd_epi32 (tmp, tmp);

        int32_t* ptr = reinterpret_cast<int32_t*> (&tmp);
        return ptr[0] + ptr[4];
    }
};

//==============================================================================
/** Unsigned 32-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int32_t*> (a);
        return _mm256_set_epi32 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept
    {
        SIMDFallbackOps<uint32_t, __m256i>::store (value, dest);
    }

    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi32 (a, a);
        tmp = _mm256_hadd_epi32 (tmp, tmp);

        uint32_t* ptr = reinterpret_cast<uint32_t*> (&tmp);
        return ptr[0] + ptr[4];
    }
};

//==============================================================================
/** Signed 64-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept
    {
        return _mm256_set_epi64x (a[3], a[2], a[1], a[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept
    {
        SIMDFallbackOps<int64_t, __m256i>::store (value, dest);
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
    {
       #ifdef _MSC_VER
        __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
        return *reinterpret_cast<const __m256i*> (&tmp);
       #else
        return _mm256_set1_epi64x ((int64_t) s);
       #endif
    }
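
    // expand() note: the _MSC_VER branch broadcasts the 64-bit pattern via a
    // double-precision broadcast and reinterprets the register, presumably to
    // work around MSVC versions (notably 32-bit builds) that lacked a usable
    // _mm256_set1_epi64x; the bit pattern is preserved either way.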
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
        return ptr[0] + ptr[1] + ptr[2] + ptr[3];
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
    {
        __m256i retval;

        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        for (int i = 0; i < 4; ++i)
            dst[i] = aptr[i] * bptr[i];

        return retval;
    }
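
    // mul() falls back to scalar arithmetic: plain AVX2 offers no packed
    // 64-bit multiply (_mm256_mullo_epi64 only arrives with AVX-512DQ), so the
    // operands are spilled to memory, multiplied element by element, and the
    // result reloaded.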
};

//==============================================================================
/** Unsigned 64-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int64_t*> (a);
        return _mm256_set_epi64x (b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept
    {
        SIMDFallbackOps<uint64_t, __m256i>::store (value, dest);
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
    {
       #ifdef _MSC_VER
        __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
        return *reinterpret_cast<const __m256i*> (&tmp);
       #else
        return _mm256_set1_epi64x ((int64_t) s);
       #endif
    }

    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
        return ptr[0] + ptr[1] + ptr[2] + ptr[3];
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
    {
        __m256i retval;

        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        for (int i = 0; i < 4; ++i)
            dst[i] = aptr[i] * bptr[i];

        return retval;
    }
};

#endif

} // namespace dsp
} // namespace juce
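
/*  A minimal usage sketch (not part of this header, illustrative only). It
    assumes an AVX/AVX2-capable build and 32-byte aligned buffers, since the
    float load()/store() map to the aligned _mm256_load_ps / _mm256_store_ps
    intrinsics:

        using Ops = juce::dsp::SIMDNativeOps<float>;

        alignas (32) float a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
        alignas (32) float b[8] = { 8, 7, 6, 5, 4, 3, 2, 1 };
        alignas (32) float out[8];

        Ops::store (Ops::mul (Ops::load (a), Ops::load (b)), out); // element-wise product
        float total = Ops::sum (Ops::load (a));                    // == 36.0f
*/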