The JUCE cross-platform C++ framework, with DISTRHO/KXStudio specific changes
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

675 lines
46KB

  1. /*
  2. ==============================================================================
  3. This file is part of the JUCE library.
  4. Copyright (c) 2017 - ROLI Ltd.
  5. JUCE is an open source library subject to commercial or open-source
  6. licensing.
  7. By using JUCE, you agree to the terms of both the JUCE 5 End-User License
  8. Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
  9. 27th April 2017).
  10. End User License Agreement: www.juce.com/juce-5-licence
  11. Privacy Policy: www.juce.com/juce-5-privacy-policy
  12. Or: You may also use this code under the terms of the GPL v3 (see
  13. www.gnu.org/licenses).
  14. JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
  15. EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
  16. DISCLAIMED.
  17. ==============================================================================
  18. */
  19. namespace juce
  20. {
  21. namespace dsp
  22. {
  23. #ifndef DOXYGEN
  24. #ifdef _MSC_VER
  25. #define DECLARE_AVX_SIMD_CONST(type, name) \
  26. static __declspec(align(32)) const type name[32 / sizeof (type)]
  27. #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
  28. __declspec(align(32)) const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)]
  29. #else
  30. #define DECLARE_AVX_SIMD_CONST(type, name) \
  31. static const type name[32 / sizeof (type)] __attribute__((aligned(32)))
  32. #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
  33. const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)] __attribute__((aligned(32)))
  34. #endif
  35. template <typename type>
  36. struct SIMDNativeOps;
  37. //==============================================================================
  38. /** Single-precision floating point AVX intrinsics. */
  39. template <>
  40. struct SIMDNativeOps<float>
  41. {
  42. typedef __m256 vSIMDType;
  43. //==============================================================================
  44. DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
  45. DECLARE_AVX_SIMD_CONST (int32_t, kEvenHighBit);
  46. DECLARE_AVX_SIMD_CONST (float, kOne);
  47. //==============================================================================
  48. static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
  49. static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
  50. static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); }
  51. static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); }
  52. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); }
  53. static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); }
  54. static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); }
  55. static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); }
  56. static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_and (__m256 a, __m256 b) noexcept { return _mm256_and_ps (a, b); }
  57. static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_or (__m256 a, __m256 b) noexcept { return _mm256_or_ps (a, b); }
  58. static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_xor (__m256 a, __m256 b) noexcept { return _mm256_xor_ps (a, b); }
  59. static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_notand (__m256 a, __m256 b) noexcept { return _mm256_andnot_ps (a, b); }
  60. static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_not (__m256 a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
  61. static forcedinline __m256 JUCE_VECTOR_CALLTYPE min (__m256 a, __m256 b) noexcept { return _mm256_min_ps (a, b); }
  62. static forcedinline __m256 JUCE_VECTOR_CALLTYPE max (__m256 a, __m256 b) noexcept { return _mm256_max_ps (a, b); }
  63. static forcedinline __m256 JUCE_VECTOR_CALLTYPE equal (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_EQ_OQ); }
  64. static forcedinline __m256 JUCE_VECTOR_CALLTYPE notEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ); }
  65. static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThan (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GT_OQ); }
  66. static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GE_OQ); }
  67. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256 a, __m256 b) noexcept { return (_mm256_movemask_ps (equal (a, b)) == 0xff); }
  68. static forcedinline __m256 JUCE_VECTOR_CALLTYPE multiplyAdd (__m256 a, __m256 b, __m256 c) noexcept { return _mm256_fmadd_ps (b, c, a); }
  69. static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
  70. static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
  71. static forcedinline __m256 JUCE_VECTOR_CALLTYPE swapevenodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
  72. static forcedinline __m256 JUCE_VECTOR_CALLTYPE oddevensum (__m256 a) noexcept
  73. {
  74. a = _mm256_add_ps (_mm256_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a);
  75. return add (_mm256_permute2f128_ps (a, a, 1), a);
  76. }
  77. //==============================================================================
  78. static forcedinline __m256 JUCE_VECTOR_CALLTYPE cmplxmul (__m256 a, __m256 b) noexcept
  79. {
  80. __m256 rr_ir = mul (a, dupeven (b));
  81. __m256 ii_ri = mul (swapevenodd (a), dupodd (b));
  82. return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
  83. }
  84. static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m256 a) noexcept
  85. {
  86. __m256 retval = _mm256_dp_ps (a, vconst (kOne), 0xff);
  87. __m256 tmp = _mm256_permute2f128_ps (retval, retval, 1);
  88. retval = _mm256_add_ps (retval, tmp);
  89. return ((float*) &retval)[0];
  90. }
  91. };
  92. //==============================================================================
  93. /** Double-precision floating point AVX intrinsics. */
  94. template <>
  95. struct SIMDNativeOps<double>
  96. {
  97. typedef __m256d vSIMDType;
  98. //==============================================================================
  99. DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
  100. DECLARE_AVX_SIMD_CONST (int64_t, kEvenHighBit);
  101. DECLARE_AVX_SIMD_CONST (double, kOne);
  102. //==============================================================================
  103. static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
  104. static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
  105. static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
  106. static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); }
  107. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); }
  108. static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
  109. static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
  110. static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
  111. static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
  112. static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
  113. static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
  114. static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
  115. static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
  116. static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
  117. static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
  118. static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
  119. static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
  120. static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
  121. static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
  122. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept { return (_mm256_movemask_pd (equal (a, b)) == 0xf); }
  123. static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
  124. static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
  125. static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
  126. static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
  127. static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }
  128. //==============================================================================
  129. static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
  130. {
  131. __m256d rr_ir = mul (a, dupeven (b));
  132. __m256d ii_ri = mul (swapevenodd (a), dupodd (b));
  133. return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
  134. }
  135. static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m256d a) noexcept
  136. {
  137. __m256d retval = _mm256_hadd_pd (a, a);
  138. __m256d tmp = _mm256_permute2f128_pd (retval, retval, 1);
  139. retval = _mm256_add_pd (retval, tmp);
  140. return ((double*) &retval)[0];
  141. }
  142. };
  143. //==============================================================================
  144. /** Signed 8-bit integer AVX intrinsics */
  145. template <>
  146. struct SIMDNativeOps<int8_t>
  147. {
  148. typedef __m256i vSIMDType;
  149. //==============================================================================
  150. DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);
  151. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  152. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
  153. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
  154. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
  155. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  156. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  157. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  158. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  159. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  160. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
  161. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
  162. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
  163. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
  164. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  165. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return _mm256_movemask_epi8 (equal (a, b)) == -1; }
  166. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  167. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  168. //==============================================================================
  169. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
  170. {
  171. const auto* b = reinterpret_cast<const char*> (a);
  172. return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
  173. b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
  174. b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
  175. b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
  176. }
  177. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept
  178. {
  179. SIMDFallbackOps<int8_t, __m256i>::store (value, dest);
  180. }
  181. static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  182. {
  183. __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
  184. __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());
  185. for (int i = 0; i < 3; ++i)
  186. {
  187. lo = _mm256_hadd_epi16 (lo, lo);
  188. hi = _mm256_hadd_epi16 (hi, hi);
  189. }
  190. const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
  191. const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);
  192. return (int8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]);
  193. }
  194. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
  195. {
  196. // unpack and multiply
  197. __m256i even = _mm256_mullo_epi16 (a, b);
  198. __m256i odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));
  199. return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
  200. _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
  201. }
  202. };
  203. //==============================================================================
  204. /** Unsigned 8-bit integer AVX intrinsics. */
  205. template <>
  206. struct SIMDNativeOps<uint8_t>
  207. {
  208. //==============================================================================
  209. typedef __m256i vSIMDType;
  210. //==============================================================================
  211. DECLARE_AVX_SIMD_CONST (uint8_t, kHighBit);
  212. DECLARE_AVX_SIMD_CONST (uint8_t, kAllBitsSet);
  213. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  214. static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
  215. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm256_set1_epi8 ((int8_t) s); }
  216. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
  217. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
  218. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  219. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  220. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  221. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  222. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  223. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu8 (a, b); }
  224. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu8 (a, b); }
  225. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
  226. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (ssign (a), ssign (b)); }
  227. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  228. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  229. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  230. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  231. //==============================================================================
  232. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
  233. {
  234. const auto* b = reinterpret_cast<const char*> (a);
  235. return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
  236. b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
  237. b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
  238. b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
  239. }
  240. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept
  241. {
  242. SIMDFallbackOps<uint8_t, __m256i>::store (value, dest);
  243. }
  244. static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  245. {
  246. __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
  247. __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());
  248. for (int i = 0; i < 3; ++i)
  249. {
  250. lo = _mm256_hadd_epi16 (lo, lo);
  251. hi = _mm256_hadd_epi16 (hi, hi);
  252. }
  253. const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
  254. const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);
  255. return (uint8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]);
  256. }
  257. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
  258. {
  259. // unpack and multiply
  260. __m256i even = _mm256_mullo_epi16 (a, b);
  261. __m256i odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));
  262. return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
  263. _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
  264. }
  265. };
  266. //==============================================================================
  267. /** Signed 16-bit integer AVX intrinsics. */
  268. template <>
  269. struct SIMDNativeOps<int16_t>
  270. {
  271. //==============================================================================
  272. typedef __m256i vSIMDType;
  273. //==============================================================================
  274. DECLARE_AVX_SIMD_CONST (int16_t, kAllBitsSet);
  275. //==============================================================================
  276. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  277. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm256_set1_epi16 (s); }
  278. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
  279. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
  280. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
  281. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  282. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  283. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  284. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  285. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  286. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi16 (a, b); }
  287. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi16 (a, b); }
  288. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
  289. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (a, b); }
  290. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  291. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  292. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  293. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  294. //==============================================================================
  295. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
  296. {
  297. return _mm256_set_epi16 (a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8],
  298. a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
  299. }
  300. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept
  301. {
  302. SIMDFallbackOps<int16_t, __m256i>::store (value, dest);
  303. }
  304. static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  305. {
  306. __m256i tmp = _mm256_hadd_epi16 (a, a);
  307. tmp = _mm256_hadd_epi16 (tmp, tmp);
  308. tmp = _mm256_hadd_epi16 (tmp, tmp);
  309. int16_t* ptr = reinterpret_cast<int16_t*> (&tmp);
  310. return (int16_t) (ptr[0] + ptr[8]);
  311. }
  312. };
  313. //==============================================================================
  314. /** Unsigned 16-bit integer AVX intrinsics. */
  315. template <>
  316. struct SIMDNativeOps<uint16_t>
  317. {
  318. //==============================================================================
  319. typedef __m256i vSIMDType;
  320. //==============================================================================
  321. DECLARE_AVX_SIMD_CONST (uint16_t, kHighBit);
  322. DECLARE_AVX_SIMD_CONST (uint16_t, kAllBitsSet);
  323. //==============================================================================
  324. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  325. static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
  326. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); }
  327. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
  328. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
  329. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
  330. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  331. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  332. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  333. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  334. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  335. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); }
  336. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); }
  337. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
  338. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); }
  339. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  340. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  341. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  342. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  343. //==============================================================================
  344. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
  345. {
  346. const auto* b = reinterpret_cast<const int16_t*> (a);
  347. return _mm256_set_epi16 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
  348. b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
  349. }
  350. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept
  351. {
  352. SIMDFallbackOps<uint16_t, __m256i>::store (value, dest);
  353. }
  354. static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  355. {
  356. __m256i tmp = _mm256_hadd_epi16 (a, a);
  357. tmp = _mm256_hadd_epi16 (tmp, tmp);
  358. tmp = _mm256_hadd_epi16 (tmp, tmp);
  359. uint16_t* ptr = reinterpret_cast<uint16_t*> (&tmp);
  360. return (uint16_t) (ptr[0] + ptr[8]);
  361. }
  362. };
  363. //==============================================================================
  364. /** Signed 32-bit integer AVX intrinsics. */
  365. template <>
  366. struct SIMDNativeOps<int32_t>
  367. {
  368. //==============================================================================
  369. typedef __m256i vSIMDType;
  370. //==============================================================================
  371. DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
  372. //==============================================================================
  373. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  374. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm256_set1_epi32 (s); }
  375. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
  376. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
  377. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
  378. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  379. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  380. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  381. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  382. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  383. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi32 (a, b); }
  384. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi32 (a, b); }
  385. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
  386. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (a, b); }
  387. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  388. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  389. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  390. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  391. //==============================================================================
  392. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept
  393. {
  394. return _mm256_set_epi32 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
  395. }
  396. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept
  397. {
  398. SIMDFallbackOps<int32_t, __m256i>::store (value, dest);
  399. }
  400. static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  401. {
  402. __m256i tmp = _mm256_hadd_epi32 (a, a);
  403. tmp = _mm256_hadd_epi32 (tmp, tmp);
  404. int32_t* ptr = reinterpret_cast<int32_t*> (&tmp);
  405. return ptr[0] + ptr[4];
  406. }
  407. };
  408. //==============================================================================
  409. /** Unsigned 32-bit integer AVX intrinsics. */
  410. template <>
  411. struct SIMDNativeOps<uint32_t>
  412. {
  413. //==============================================================================
  414. typedef __m256i vSIMDType;
  415. //==============================================================================
  416. DECLARE_AVX_SIMD_CONST (uint32_t, kAllBitsSet);
  417. DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit);
  418. //==============================================================================
  419. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  420. static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
  421. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
  422. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
  423. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
  424. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
  425. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  426. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  427. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  428. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  429. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  430. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
  431. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
  432. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
  433. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
  434. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  435. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  436. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  437. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  438. //==============================================================================
  439. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
  440. {
  441. const auto* b = reinterpret_cast<const int32_t*> (a);
  442. return _mm256_set_epi32 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
  443. }
  444. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept
  445. {
  446. SIMDFallbackOps<uint32_t, __m256i>::store (value, dest);
  447. }
  448. static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  449. {
  450. __m256i tmp = _mm256_hadd_epi32 (a, a);
  451. tmp = _mm256_hadd_epi32 (tmp, tmp);
  452. uint32_t* ptr = reinterpret_cast<uint32_t*> (&tmp);
  453. return ptr[0] + ptr[4];
  454. }
  455. };
  456. //==============================================================================
  457. /** Signed 64-bit integer AVX intrinsics. */
  458. template <>
  459. struct SIMDNativeOps<int64_t>
  460. {
  461. //==============================================================================
  462. typedef __m256i vSIMDType;
  463. //==============================================================================
  464. DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
  465. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  466. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
  467. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
  468. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  469. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  470. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  471. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  472. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  473. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
  474. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
  475. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
  476. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (a, b); }
  477. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  478. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  479. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  480. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  481. //==============================================================================
  482. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept
  483. {
  484. return _mm256_set_epi64x (a[3], a[2], a[1], a[0]);
  485. }
  486. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept
  487. {
  488. SIMDFallbackOps<int64_t, __m256i>::store (value, dest);
  489. }
  490. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
  491. {
  492. #ifdef _MSC_VER
  493. __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
  494. return *reinterpret_cast<const __m256i*> (&tmp);
  495. #else
  496. return _mm256_set1_epi64x ((int64_t) s);
  497. #endif
  498. }
  499. static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  500. {
  501. const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
  502. return ptr[0] + ptr[1] + ptr[2] + ptr[3];
  503. }
  504. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
  505. {
  506. __m256i retval;
  507. const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
  508. const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
  509. int64_t* dst = reinterpret_cast<int64_t*> (&retval);
  510. for (int i = 0; i < 4; ++i)
  511. dst[i] = aptr[i] * bptr[i];
  512. return retval;
  513. }
  514. };
  515. //==============================================================================
  516. /** Unsigned 64-bit integer AVX intrinsics. */
  517. template <>
  518. struct SIMDNativeOps<uint64_t>
  519. {
  520. //==============================================================================
  521. typedef __m256i vSIMDType;
  522. //==============================================================================
  523. DECLARE_AVX_SIMD_CONST (uint64_t, kAllBitsSet);
  524. DECLARE_AVX_SIMD_CONST (uint64_t, kHighBit);
  525. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  526. static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
  527. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
  528. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
  529. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  530. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  531. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  532. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  533. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  534. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
  535. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
  536. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
  537. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
  538. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  539. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  540. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  541. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  542. //==============================================================================
  543. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
  544. {
  545. const auto* b = reinterpret_cast<const int64_t*> (a);
  546. return _mm256_set_epi64x (b[3], b[2], b[1], b[0]);
  547. }
  548. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept
  549. {
  550. SIMDFallbackOps<uint64_t, __m256i>::store (value, dest);
  551. }
  552. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
  553. {
  554. #ifdef _MSC_VER
  555. __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
  556. return *reinterpret_cast<const __m256i*> (&tmp);
  557. #else
  558. return _mm256_set1_epi64x ((int64_t) s);
  559. #endif
  560. }
  561. static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  562. {
  563. const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
  564. return ptr[0] + ptr[1] + ptr[2] + ptr[3];
  565. }
  566. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
  567. {
  568. __m256i retval;
  569. const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
  570. const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
  571. uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);
  572. for (int i = 0; i < 4; ++i)
  573. dst[i] = aptr[i] * bptr[i];
  574. return retval;
  575. }
  576. };
  577. #endif
  578. } // namespace dsp
  579. } // namespace juce