The JUCE cross-platform C++ framework, with DISTRHO/KXStudio specific changes
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

705 lines
46KB

  1. /*
  2. ==============================================================================
  3. This file is part of the JUCE library.
  4. Copyright (c) 2017 - ROLI Ltd.
  5. JUCE is an open source library subject to commercial or open-source
  6. licensing.
  7. By using JUCE, you agree to the terms of both the JUCE 5 End-User License
  8. Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
  9. 27th April 2017).
  10. End User License Agreement: www.juce.com/juce-5-licence
  11. Privacy Policy: www.juce.com/juce-5-privacy-policy
  12. Or: You may also use this code under the terms of the GPL v3 (see
  13. www.gnu.org/licenses).
  14. JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
  15. EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
  16. DISCLAIMED.
  17. ==============================================================================
  18. */
  19. namespace juce
  20. {
  21. namespace dsp
  22. {
  23. #ifndef DOXYGEN
  // Helper macros for 32-byte aligned constant arrays that are exactly one
  // 256-bit register wide (32 / sizeof (type) elements).
  //
  //   DECLARE_AVX_SIMD_CONST  - in-class declaration of the static array.
  //   DEFINE_AVX_SIMD_CONST   - matching out-of-class definition for the
  //                             SIMDNativeOps<class_type> specialisation.
  //
  // The two branches differ only in how the compiler spells the alignment.
  #ifndef _MSC_VER
   // GCC / Clang style: trailing alignment attribute.
   #define DECLARE_AVX_SIMD_CONST(type, name) \
      static const type name[32 / sizeof (type)] __attribute__((aligned(32)))

   #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
      const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)] __attribute__((aligned(32)))
  #else
   // MSVC style: leading __declspec alignment specifier.
   #define DECLARE_AVX_SIMD_CONST(type, name) \
      static __declspec(align(32)) const type name[32 / sizeof (type)]

   #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
      __declspec(align(32)) const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)]
  #endif

  // Primary template: declared only, each supported element type is
  // provided as an explicit specialisation.
  template <typename type>
  struct SIMDNativeOps;
  37. //==============================================================================
  38. /** Single-precision floating point AVX intrinsics.
  39. @tags{DSP}
  40. */
  41. template <>
  42. struct SIMDNativeOps<float>
  43. {
  44. typedef __m256 vSIMDType;
  45. //==============================================================================
  46. DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
  47. DECLARE_AVX_SIMD_CONST (int32_t, kEvenHighBit);
  48. DECLARE_AVX_SIMD_CONST (float, kOne);
  49. //==============================================================================
  50. static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
  51. static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
  52. static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); }
  53. static forcedinline __m256 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm256_load_ps (a); }
  54. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256 value, float* dest) noexcept { _mm256_store_ps (dest, value); }
  55. static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); }
  56. static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); }
  57. static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); }
  58. static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_and (__m256 a, __m256 b) noexcept { return _mm256_and_ps (a, b); }
  59. static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_or (__m256 a, __m256 b) noexcept { return _mm256_or_ps (a, b); }
  60. static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_xor (__m256 a, __m256 b) noexcept { return _mm256_xor_ps (a, b); }
  61. static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_notand (__m256 a, __m256 b) noexcept { return _mm256_andnot_ps (a, b); }
  62. static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_not (__m256 a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
  63. static forcedinline __m256 JUCE_VECTOR_CALLTYPE min (__m256 a, __m256 b) noexcept { return _mm256_min_ps (a, b); }
  64. static forcedinline __m256 JUCE_VECTOR_CALLTYPE max (__m256 a, __m256 b) noexcept { return _mm256_max_ps (a, b); }
  65. static forcedinline __m256 JUCE_VECTOR_CALLTYPE equal (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_EQ_OQ); }
  66. static forcedinline __m256 JUCE_VECTOR_CALLTYPE notEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ); }
  67. static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThan (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GT_OQ); }
  68. static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GE_OQ); }
  69. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256 a, __m256 b) noexcept { return (_mm256_movemask_ps (equal (a, b)) == 0xff); }
  70. static forcedinline __m256 JUCE_VECTOR_CALLTYPE multiplyAdd (__m256 a, __m256 b, __m256 c) noexcept { return _mm256_fmadd_ps (b, c, a); }
  71. static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
  72. static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
  73. static forcedinline __m256 JUCE_VECTOR_CALLTYPE swapevenodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
  74. static forcedinline __m256 JUCE_VECTOR_CALLTYPE oddevensum (__m256 a) noexcept
  75. {
  76. a = _mm256_add_ps (_mm256_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a);
  77. return add (_mm256_permute2f128_ps (a, a, 1), a);
  78. }
  79. //==============================================================================
  80. static forcedinline __m256 JUCE_VECTOR_CALLTYPE cmplxmul (__m256 a, __m256 b) noexcept
  81. {
  82. __m256 rr_ir = mul (a, dupeven (b));
  83. __m256 ii_ri = mul (swapevenodd (a), dupodd (b));
  84. return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
  85. }
  86. static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m256 a) noexcept
  87. {
  88. __m256 retval = _mm256_dp_ps (a, vconst (kOne), 0xff);
  89. __m256 tmp = _mm256_permute2f128_ps (retval, retval, 1);
  90. retval = _mm256_add_ps (retval, tmp);
  91. return ((float*) &retval)[0];
  92. }
  93. };
  94. //==============================================================================
  95. /** Double-precision floating point AVX intrinsics.
  96. @tags{DSP}
  97. */
  98. template <>
  99. struct SIMDNativeOps<double>
  100. {
  101. typedef __m256d vSIMDType;
  102. //==============================================================================
  103. DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
  104. DECLARE_AVX_SIMD_CONST (int64_t, kEvenHighBit);
  105. DECLARE_AVX_SIMD_CONST (double, kOne);
  106. //==============================================================================
  107. static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
  108. static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
  109. static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
  110. static forcedinline __m256d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm256_load_pd (a); }
  111. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256d value, double* dest) noexcept { _mm256_store_pd (dest, value); }
  112. static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
  113. static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
  114. static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
  115. static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
  116. static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
  117. static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
  118. static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
  119. static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
  120. static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
  121. static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
  122. static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
  123. static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
  124. static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
  125. static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
  126. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256d a, __m256d b) noexcept { return (_mm256_movemask_pd (equal (a, b)) == 0xf); }
  127. static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
  128. static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
  129. static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
  130. static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
  131. static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }
  132. //==============================================================================
  133. static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
  134. {
  135. __m256d rr_ir = mul (a, dupeven (b));
  136. __m256d ii_ri = mul (swapevenodd (a), dupodd (b));
  137. return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
  138. }
  139. static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m256d a) noexcept
  140. {
  141. __m256d retval = _mm256_hadd_pd (a, a);
  142. __m256d tmp = _mm256_permute2f128_pd (retval, retval, 1);
  143. retval = _mm256_add_pd (retval, tmp);
  144. return ((double*) &retval)[0];
  145. }
  146. };
  147. //==============================================================================
  148. /** Signed 8-bit integer AVX intrinsics
  149. @tags{DSP}
  150. */
  151. template <>
  152. struct SIMDNativeOps<int8_t>
  153. {
  154. typedef __m256i vSIMDType;
  155. //==============================================================================
  156. DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);
  157. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  158. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
  159. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
  160. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
  161. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  162. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  163. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  164. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  165. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  166. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
  167. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
  168. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
  169. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
  170. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  171. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return _mm256_movemask_epi8 (equal (a, b)) == -1; }
  172. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  173. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  174. //==============================================================================
  175. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
  176. {
  177. const auto* b = reinterpret_cast<const char*> (a);
  178. return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
  179. b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
  180. b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
  181. b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
  182. }
  183. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int8_t* dest) noexcept
  184. {
  185. SIMDFallbackOps<int8_t, __m256i>::store (value, dest);
  186. }
  187. static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  188. {
  189. __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
  190. __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());
  191. for (int i = 0; i < 3; ++i)
  192. {
  193. lo = _mm256_hadd_epi16 (lo, lo);
  194. hi = _mm256_hadd_epi16 (hi, hi);
  195. }
  196. const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
  197. const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);
  198. return (int8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]);
  199. }
  200. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
  201. {
  202. // unpack and multiply
  203. __m256i even = _mm256_mullo_epi16 (a, b);
  204. __m256i odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));
  205. return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
  206. _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
  207. }
  208. };
  209. //==============================================================================
  210. /** Unsigned 8-bit integer AVX intrinsics.
  211. @tags{DSP}
  212. */
  213. template <>
  214. struct SIMDNativeOps<uint8_t>
  215. {
  216. //==============================================================================
  217. typedef __m256i vSIMDType;
  218. //==============================================================================
  219. DECLARE_AVX_SIMD_CONST (uint8_t, kHighBit);
  220. DECLARE_AVX_SIMD_CONST (uint8_t, kAllBitsSet);
  221. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  222. static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
  223. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm256_set1_epi8 ((int8_t) s); }
  224. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
  225. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
  226. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  227. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  228. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  229. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  230. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  231. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu8 (a, b); }
  232. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu8 (a, b); }
  233. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
  234. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (ssign (a), ssign (b)); }
  235. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  236. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  237. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  238. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  239. //==============================================================================
  240. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
  241. {
  242. const auto* b = reinterpret_cast<const char*> (a);
  243. return _mm256_set_epi8 (b[31], b[30], b[29], b[28], b[27], b[26], b[25], b[24],
  244. b[23], b[22], b[21], b[20], b[19], b[18], b[17], b[16],
  245. b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
  246. b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
  247. }
  248. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint8_t* dest) noexcept
  249. {
  250. SIMDFallbackOps<uint8_t, __m256i>::store (value, dest);
  251. }
  252. static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  253. {
  254. __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256());
  255. __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());
  256. for (int i = 0; i < 3; ++i)
  257. {
  258. lo = _mm256_hadd_epi16 (lo, lo);
  259. hi = _mm256_hadd_epi16 (hi, hi);
  260. }
  261. const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
  262. const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);
  263. return (uint8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]);
  264. }
  265. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b)
  266. {
  267. // unpack and multiply
  268. __m256i even = _mm256_mullo_epi16 (a, b);
  269. __m256i odd = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));
  270. return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
  271. _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
  272. }
  273. };
  274. //==============================================================================
  275. /** Signed 16-bit integer AVX intrinsics.
  276. @tags{DSP}
  277. */
  278. template <>
  279. struct SIMDNativeOps<int16_t>
  280. {
  281. //==============================================================================
  282. typedef __m256i vSIMDType;
  283. //==============================================================================
  284. DECLARE_AVX_SIMD_CONST (int16_t, kAllBitsSet);
  285. //==============================================================================
  286. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  287. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm256_set1_epi16 (s); }
  288. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
  289. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
  290. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
  291. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  292. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  293. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  294. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  295. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  296. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi16 (a, b); }
  297. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi16 (a, b); }
  298. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
  299. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (a, b); }
  300. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  301. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  302. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  303. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  304. //==============================================================================
  305. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
  306. {
  307. return _mm256_set_epi16 (a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8],
  308. a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
  309. }
  310. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int16_t* dest) noexcept
  311. {
  312. SIMDFallbackOps<int16_t, __m256i>::store (value, dest);
  313. }
  314. static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  315. {
  316. __m256i tmp = _mm256_hadd_epi16 (a, a);
  317. tmp = _mm256_hadd_epi16 (tmp, tmp);
  318. tmp = _mm256_hadd_epi16 (tmp, tmp);
  319. int16_t* ptr = reinterpret_cast<int16_t*> (&tmp);
  320. return (int16_t) (ptr[0] + ptr[8]);
  321. }
  322. };
  323. //==============================================================================
  324. /** Unsigned 16-bit integer AVX intrinsics.
  325. @tags{DSP}
  326. */
  327. template <>
  328. struct SIMDNativeOps<uint16_t>
  329. {
  330. //==============================================================================
  331. typedef __m256i vSIMDType;
  332. //==============================================================================
  333. DECLARE_AVX_SIMD_CONST (uint16_t, kHighBit);
  334. DECLARE_AVX_SIMD_CONST (uint16_t, kAllBitsSet);
  335. //==============================================================================
  336. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  337. static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
  338. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); }
  339. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
  340. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
  341. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
  342. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  343. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  344. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  345. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  346. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  347. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); }
  348. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); }
  349. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
  350. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); }
  351. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  352. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  353. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  354. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  355. //==============================================================================
  356. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
  357. {
  358. const auto* b = reinterpret_cast<const int16_t*> (a);
  359. return _mm256_set_epi16 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
  360. b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
  361. }
  362. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint16_t* dest) noexcept
  363. {
  364. SIMDFallbackOps<uint16_t, __m256i>::store (value, dest);
  365. }
  366. static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  367. {
  368. __m256i tmp = _mm256_hadd_epi16 (a, a);
  369. tmp = _mm256_hadd_epi16 (tmp, tmp);
  370. tmp = _mm256_hadd_epi16 (tmp, tmp);
  371. uint16_t* ptr = reinterpret_cast<uint16_t*> (&tmp);
  372. return (uint16_t) (ptr[0] + ptr[8]);
  373. }
  374. };
  375. //==============================================================================
  376. /** Signed 32-bit integer AVX intrinsics.
  377. @tags{DSP}
  378. */
  379. template <>
  380. struct SIMDNativeOps<int32_t>
  381. {
  382. //==============================================================================
  383. typedef __m256i vSIMDType;
  384. //==============================================================================
  385. DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
  386. //==============================================================================
  387. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  388. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm256_set1_epi32 (s); }
  389. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
  390. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
  391. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
  392. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  393. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  394. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  395. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  396. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  397. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi32 (a, b); }
  398. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi32 (a, b); }
  399. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
  400. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (a, b); }
  401. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  402. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  403. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  404. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  405. //==============================================================================
  406. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept
  407. {
  408. return _mm256_set_epi32 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
  409. }
  410. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int32_t* dest) noexcept
  411. {
  412. SIMDFallbackOps<int32_t, __m256i>::store (value, dest);
  413. }
  414. static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  415. {
  416. __m256i tmp = _mm256_hadd_epi32 (a, a);
  417. tmp = _mm256_hadd_epi32 (tmp, tmp);
  418. int32_t* ptr = reinterpret_cast<int32_t*> (&tmp);
  419. return ptr[0] + ptr[4];
  420. }
  421. };
  422. //==============================================================================
  423. /** Unsigned 32-bit integer AVX intrinsics.
  424. @tags{DSP}
  425. */
  426. template <>
  427. struct SIMDNativeOps<uint32_t>
  428. {
  429. //==============================================================================
  430. typedef __m256i vSIMDType;
  431. //==============================================================================
  432. DECLARE_AVX_SIMD_CONST (uint32_t, kAllBitsSet);
  433. DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit);
  434. //==============================================================================
  435. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  436. static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
  437. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
  438. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
  439. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
  440. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
  441. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  442. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  443. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  444. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  445. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  446. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
  447. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
  448. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
  449. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
  450. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  451. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  452. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  453. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  454. //==============================================================================
  455. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
  456. {
  457. const auto* b = reinterpret_cast<const int32_t*> (a);
  458. return _mm256_set_epi32 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
  459. }
  460. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint32_t* dest) noexcept
  461. {
  462. SIMDFallbackOps<uint32_t, __m256i>::store (value, dest);
  463. }
  464. static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  465. {
  466. __m256i tmp = _mm256_hadd_epi32 (a, a);
  467. tmp = _mm256_hadd_epi32 (tmp, tmp);
  468. uint32_t* ptr = reinterpret_cast<uint32_t*> (&tmp);
  469. return ptr[0] + ptr[4];
  470. }
  471. };
  472. //==============================================================================
  473. /** Signed 64-bit integer AVX intrinsics.
  474. @tags{DSP}
  475. */
  476. template <>
  477. struct SIMDNativeOps<int64_t>
  478. {
  479. //==============================================================================
  480. typedef __m256i vSIMDType;
  481. //==============================================================================
  482. DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
  483. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  484. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
  485. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
  486. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  487. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  488. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  489. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  490. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  491. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
  492. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
  493. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
  494. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (a, b); }
  495. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  496. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  497. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  498. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  499. //==============================================================================
  500. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept
  501. {
  502. return _mm256_set_epi64x (a[3], a[2], a[1], a[0]);
  503. }
  504. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, int64_t* dest) noexcept
  505. {
  506. SIMDFallbackOps<int64_t, __m256i>::store (value, dest);
  507. }
  508. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
  509. {
  510. #ifdef _MSC_VER
  511. __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
  512. return *reinterpret_cast<const __m256i*> (&tmp);
  513. #else
  514. return _mm256_set1_epi64x ((int64_t) s);
  515. #endif
  516. }
  517. static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  518. {
  519. const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
  520. return ptr[0] + ptr[1] + ptr[2] + ptr[3];
  521. }
  522. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
  523. {
  524. __m256i retval;
  525. const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
  526. const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
  527. int64_t* dst = reinterpret_cast<int64_t*> (&retval);
  528. for (int i = 0; i < 4; ++i)
  529. dst[i] = aptr[i] * bptr[i];
  530. return retval;
  531. }
  532. };
  533. //==============================================================================
  534. /** Unsigned 64-bit integer AVX intrinsics.
  535. @tags{DSP}
  536. */
  537. template <>
  538. struct SIMDNativeOps<uint64_t>
  539. {
  540. //==============================================================================
  541. typedef __m256i vSIMDType;
  542. //==============================================================================
  543. DECLARE_AVX_SIMD_CONST (uint64_t, kAllBitsSet);
  544. DECLARE_AVX_SIMD_CONST (uint64_t, kHighBit);
  545. static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
  546. static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
  547. static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
  548. static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
  549. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
  550. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
  551. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
  552. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
  553. static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
  554. static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
  555. static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
  556. static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
  557. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
  558. static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
  559. static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
  560. static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }
  561. static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m256i a, __m256i b) noexcept { return (_mm256_movemask_epi8 (equal (a, b)) == -1); }
  562. //==============================================================================
  563. static forcedinline __m256i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
  564. {
  565. const auto* b = reinterpret_cast<const int64_t*> (a);
  566. return _mm256_set_epi64x (b[3], b[2], b[1], b[0]);
  567. }
  568. static forcedinline void JUCE_VECTOR_CALLTYPE store (__m256i value, uint64_t* dest) noexcept
  569. {
  570. SIMDFallbackOps<uint64_t, __m256i>::store (value, dest);
  571. }
  572. static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
  573. {
  574. #ifdef _MSC_VER
  575. __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
  576. return *reinterpret_cast<const __m256i*> (&tmp);
  577. #else
  578. return _mm256_set1_epi64x ((int64_t) s);
  579. #endif
  580. }
  581. static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
  582. {
  583. const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
  584. return ptr[0] + ptr[1] + ptr[2] + ptr[3];
  585. }
  586. static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
  587. {
  588. __m256i retval;
  589. const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
  590. const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
  591. uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);
  592. for (int i = 0; i < 4; ++i)
  593. dst[i] = aptr[i] * bptr[i];
  594. return retval;
  595. }
  596. };
  597. #endif
  598. } // namespace dsp
  599. } // namespace juce