The JUCE cross-platform C++ framework, with DISTRHO/KXStudio specific changes

/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2017 - ROLI Ltd.

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
   27th April 2017).

   End User License Agreement: www.juce.com/juce-5-licence
   Privacy Policy: www.juce.com/juce-5-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/

namespace juce
{
namespace dsp
{

#ifndef DOXYGEN

#if JUCE_GCC && (__GNUC__ >= 6)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wignored-attributes"
#endif

#ifdef _MSC_VER
 #define DECLARE_AVX_SIMD_CONST(type, name) \
     static __declspec(align(32)) const type name[32 / sizeof (type)]

 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
     __declspec(align(32)) const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)]
#else
 #define DECLARE_AVX_SIMD_CONST(type, name) \
     static const type name[32 / sizeof (type)] __attribute__((aligned(32)))

 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
     const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)] __attribute__((aligned(32)))
#endif
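
// DECLARE_AVX_SIMD_CONST/DEFINE_AVX_SIMD_CONST produce 32-byte-aligned constant arrays that fill
// exactly one 256-bit register, so the vconst() helpers below can safely reinterpret them as
// __m256/__m256d/__m256i. A hypothetical use (illustrative only, not part of this header):
//     DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);                  // inside a specialisation
//     DEFINE_AVX_SIMD_CONST  (int32_t, float, kAllBitsSet) = { ... }; // in a separate translation unit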

template <typename type>
struct SIMDNativeOps;

//==============================================================================
/** Single-precision floating point AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<float>
{
    typedef __m256 vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_AVX_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_and (__m256 a, __m256 b) noexcept { return _mm256_and_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_or (__m256 a, __m256 b) noexcept { return _mm256_or_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_xor (__m256 a, __m256 b) noexcept { return _mm256_xor_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_notand (__m256 a, __m256 b) noexcept { return _mm256_andnot_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_not (__m256 a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE min (__m256 a, __m256 b) noexcept { return _mm256_min_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE max (__m256 a, __m256 b) noexcept { return _mm256_max_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE equal (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_EQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE notEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThan (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GT_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GE_OQ); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256 a, __m256 b) noexcept { return (_mm256_movemask_ps (equal (a, b)) == 0xff); }
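    // multiplyAdd maps onto a fused multiply-add; note that _mm256_fmadd_ps needs FMA support
    // in addition to AVX, whereas the surrounding float operations only require AVX.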
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE multiplyAdd (__m256 a, __m256 b, __m256 c) noexcept { return _mm256_fmadd_ps (b, c, a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE swapevenodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
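    // oddevensum leaves the sum of all even-indexed elements in every even slot and the sum of
    // all odd-indexed elements in every odd slot: it first folds the two 64-bit halves of each
    // 128-bit lane onto each other, then adds the two lanes together.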
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE oddevensum (__m256 a) noexcept
    {
        a = _mm256_add_ps (_mm256_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a);
        return add (_mm256_permute2f128_ps (a, a, 1), a);
    }

    //==============================================================================
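    // cmplxmul treats each (even, odd) pair of floats as one complex number (re, im):
    // rr_ir multiplies both parts of a by b's real part, ii_ri multiplies the swapped parts of a
    // by b's imaginary part, and xor-ing with kEvenHighBit flips the sign of the even slots so
    // the final add yields (a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re).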
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE cmplxmul (__m256 a, __m256 b) noexcept
    {
        __m256 rr_ir = mul (a, dupeven (b));
        __m256 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }
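    // sum reduces the register with a dot product against a vector of ones (one dot product per
    // 128-bit lane), adds the two lane results together and reads the scalar back through memory.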
    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m256 a) noexcept
    {
        __m256 retval = _mm256_dp_ps (a, vconst (kOne), 0xff);
        __m256 tmp = _mm256_permute2f128_ps (retval, retval, 1);
        retval = _mm256_add_ps (retval, tmp);
        return ((float*) &retval)[0];
    }
};

//==============================================================================
/** Double-precision floating point AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<double>
{
    typedef __m256d vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_AVX_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept { return (_mm256_movemask_pd (equal (a, b)) == 0xf); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
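    // For _mm256_shuffle_pd each immediate bit picks the upper (1) or lower (0) element of a
    // 64-bit pair for the corresponding destination slot, so 0b0000 duplicates the even
    // elements, 0b1111 the odd elements, and 0b0101 swaps each pair.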
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }

    //==============================================================================
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
    {
        __m256d rr_ir = mul (a, dupeven (b));
        __m256d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }

    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m256d a) noexcept
    {
        __m256d retval = _mm256_hadd_pd (a, a);
        __m256d tmp = _mm256_permute2f128_pd (retval, retval, 1);
        retval = _mm256_add_pd (retval, tmp);
        return ((double*) &retval)[0];
    }
};

//==============================================================================
/** Signed 8-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int8_t>
{
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return _mm256_movemask_epi8 (equal (a, b)) == -1; }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
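    // load builds the register element by element and store goes through SIMDFallbackOps,
    // presumably so that no 32-byte alignment is assumed for the integer pointers (unlike the
    // aligned _mm256_load_ps/_mm256_load_pd used by the float and double specialisations).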
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);
        return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
                                b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
                                b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                                b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept
    {
        SIMDFallbackOps<int8_t, __m256i>::store (value, dest);
    }
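    // sum widens the bytes to 16-bit lanes (interleaved with zero), runs three rounds of
    // horizontal adds on each half, then combines the low result of each 128-bit lane; the
    // cast back to int8_t wraps modulo 256, matching ordinary int8 addition.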
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
        __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm256_hadd_epi16 (lo, lo);
            hi = _mm256_hadd_epi16 (hi, hi);
        }

        const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
        const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);

        return (int8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]);
    }
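    // There is no 8-bit multiply intrinsic, so mul forms the products in 16-bit lanes: 'even'
    // keeps the low-byte products in the low byte of each lane, 'odd' multiplies the high bytes
    // after shifting them down, and the two halves are merged again with shifts and a bitwise or.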
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
    {
        // unpack and multiply
        __m256i even = _mm256_mullo_epi16 (a, b);
        __m256i odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));

        return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
                                _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
    }
};

//==============================================================================
/** Unsigned 8-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_AVX_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
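    // AVX2 only provides signed byte comparisons, so ssign flips the sign bit of every element
    // (xor with kHighBit, presumably 0x80 per byte); comparing the biased values with the signed
    // intrinsic in greaterThan below then yields the correct unsigned ordering.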
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm256_set1_epi8 ((int8_t) s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);
        return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
                                b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
                                b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                                b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept
    {
        SIMDFallbackOps<uint8_t, __m256i>::store (value, dest);
    }

    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
        __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm256_hadd_epi16 (lo, lo);
            hi = _mm256_hadd_epi16 (hi, hi);
        }

        const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
        const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);

        return (uint8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]);
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
    {
        // unpack and multiply
        __m256i even = _mm256_mullo_epi16 (a, b);
        __m256i odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));

        return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
                                _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
    }
};

//==============================================================================
/** Signed 16-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm256_set1_epi16 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
    {
        return _mm256_set_epi16 (a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8],
                                 a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept
    {
        SIMDFallbackOps<int16_t, __m256i>::store (value, dest);
    }

    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi16 (a, a);
        tmp = _mm256_hadd_epi16 (tmp, tmp);
        tmp = _mm256_hadd_epi16 (tmp, tmp);

        int16_t* ptr = reinterpret_cast<int16_t*> (&tmp);
        return (int16_t) (ptr[0] + ptr[8]);
    }
};

//==============================================================================
/** Unsigned 16-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_AVX_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int16_t*> (a);
        return _mm256_set_epi16 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                                 b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept
    {
        SIMDFallbackOps<uint16_t, __m256i>::store (value, dest);
    }

    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi16 (a, a);
        tmp = _mm256_hadd_epi16 (tmp, tmp);
        tmp = _mm256_hadd_epi16 (tmp, tmp);

        uint16_t* ptr = reinterpret_cast<uint16_t*> (&tmp);
        return (uint16_t) (ptr[0] + ptr[8]);
    }
};

//==============================================================================
/** Signed 32-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm256_set1_epi32 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept
    {
        return _mm256_set_epi32 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept
    {
        SIMDFallbackOps<int32_t, __m256i>::store (value, dest);
    }

    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi32 (a, a);
        tmp = _mm256_hadd_epi32 (tmp, tmp);

        int32_t* ptr = reinterpret_cast<int32_t*> (&tmp);
        return ptr[0] + ptr[4];
    }
};

//==============================================================================
/** Unsigned 32-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int32_t*> (a);
        return _mm256_set_epi32 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept
    {
        SIMDFallbackOps<uint32_t, __m256i>::store (value, dest);
    }

    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi32 (a, a);
        tmp = _mm256_hadd_epi32 (tmp, tmp);

        uint32_t* ptr = reinterpret_cast<uint32_t*> (&tmp);
        return ptr[0] + ptr[4];
    }
};

//==============================================================================
/** Signed 64-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
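    // AVX2 has no 64-bit vector min/max, so both are built from the compare mask:
    // (mask & a) | (~mask & b) selects a where the comparison is true and b elsewhere.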
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept
    {
        return _mm256_set_epi64x (a[3], a[2], a[1], a[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept
    {
        SIMDFallbackOps<int64_t, __m256i>::store (value, dest);
    }
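    // expand broadcasts a single 64-bit value; the MSVC branch goes through a double broadcast
    // and a reinterpret, presumably to work around MSVC configurations where _mm256_set1_epi64x
    // is unavailable (e.g. 32-bit targets).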
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
    {
       #ifdef _MSC_VER
        __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
        return *reinterpret_cast<const __m256i*> (&tmp);
       #else
        return _mm256_set1_epi64x ((int64_t) s);
       #endif
    }
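    // sum and mul fall back to scalar loops through memory: AVX2 offers neither a horizontal
    // 64-bit add nor a full 64-bit element-wise multiply (that only arrives with AVX-512DQ).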
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
        return ptr[0] + ptr[1] + ptr[2] + ptr[3];
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
    {
        __m256i retval;

        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        for (int i = 0; i < 4; ++i)
            dst[i] = aptr[i] * bptr[i];

        return retval;
    }
};

//==============================================================================
/** Unsigned 64-bit integer AVX intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int64_t*> (a);
        return _mm256_set_epi64x (b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept
    {
        SIMDFallbackOps<uint64_t, __m256i>::store (value, dest);
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
    {
       #ifdef _MSC_VER
        __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
        return *reinterpret_cast<const __m256i*> (&tmp);
       #else
        return _mm256_set1_epi64x ((int64_t) s);
       #endif
    }

    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
        return ptr[0] + ptr[1] + ptr[2] + ptr[3];
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
    {
        __m256i retval;

        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        for (int i = 0; i < 4; ++i)
            dst[i] = aptr[i] * bptr[i];

        return retval;
    }
};

#endif

#if JUCE_GCC && (__GNUC__ >= 6)
 #pragma GCC diagnostic pop
#endif

} // namespace dsp
} // namespace juce