/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2017 - ROLI Ltd.

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
   27th April 2017).

   End User License Agreement: www.juce.com/juce-5-licence
   Privacy Policy: www.juce.com/juce-5-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/
namespace juce
{
namespace dsp
{

#ifndef DOXYGEN

#ifdef _MSC_VER
 #define DECLARE_AVX_SIMD_CONST(type, name) \
    static __declspec(align(32)) const type name[32 / sizeof (type)]

 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
    __declspec(align(32)) const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)]
#else
 #define DECLARE_AVX_SIMD_CONST(type, name) \
    static const type name[32 / sizeof (type)] __attribute__((aligned(32)))

 #define DEFINE_AVX_SIMD_CONST(type, class_type, name) \
    const type SIMDNativeOps<class_type>:: name[32 / sizeof (type)] __attribute__((aligned(32)))
#endif
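
// A minimal sketch (not from this file) of how the macro pair above is used:
// the header declares an aligned 256-bit constant inside a specialisation and
// a matching translation unit defines its value. `kExample` and its
// initialiser are hypothetical, for illustration only.
//
//     // header, inside SIMDNativeOps<float>:
//     DECLARE_AVX_SIMD_CONST (int32_t, kExample);
//
//     // source file:
//     DEFINE_AVX_SIMD_CONST (int32_t, float, kExample) = { 0, 1, 2, 3, 4, 5, 6, 7 };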

template <typename type>
struct SIMDNativeOps;

//==============================================================================
/** Single-precision floating point AVX intrinsics. */
template <>
struct SIMDNativeOps<float>
{
    typedef __m256 vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_AVX_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const float* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256*> (a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm256_broadcast_ss (&s); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE add (__m256 a, __m256 b) noexcept { return _mm256_add_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE sub (__m256 a, __m256 b) noexcept { return _mm256_sub_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE mul (__m256 a, __m256 b) noexcept { return _mm256_mul_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_and (__m256 a, __m256 b) noexcept { return _mm256_and_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_or (__m256 a, __m256 b) noexcept { return _mm256_or_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_xor (__m256 a, __m256 b) noexcept { return _mm256_xor_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_notand (__m256 a, __m256 b) noexcept { return _mm256_andnot_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE bit_not (__m256 a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE min (__m256 a, __m256 b) noexcept { return _mm256_min_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE max (__m256 a, __m256 b) noexcept { return _mm256_max_ps (a, b); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE equal (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_EQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE notEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThan (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GT_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256 a, __m256 b) noexcept { return _mm256_cmp_ps (a, b, _CMP_GE_OQ); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE multiplyAdd (__m256 a, __m256 b, __m256 c) noexcept { return _mm256_fmadd_ps (b, c, a); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupeven (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE dupodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE swapevenodd (__m256 a) noexcept { return _mm256_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }

    static forcedinline __m256 JUCE_VECTOR_CALLTYPE oddevensum (__m256 a) noexcept
    {
        a = _mm256_add_ps (_mm256_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a);
        return add (_mm256_permute2f128_ps (a, a, 1), a); // fold the two 128-bit lanes together
    }

    //==============================================================================
    static forcedinline __m256 JUCE_VECTOR_CALLTYPE cmplxmul (__m256 a, __m256 b) noexcept
    {
        // (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i (ar*bi + ai*br)
        __m256 rr_ir = mul (a, dupeven (b));
        __m256 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit))); // the xor negates the even (real) lanes
    }

    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m256 a) noexcept
    {
        __m256 retval = _mm256_dp_ps (a, vconst (kOne), 0xff); // per-lane dot product with 1.0f
        __m256 tmp = _mm256_permute2f128_ps (retval, retval, 1);
        retval = _mm256_add_ps (retval, tmp);
        return ((float*) &retval)[0];
    }
};
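
//==============================================================================
// A minimal usage sketch, not part of the library: multiplies four interleaved
// single-precision complex numbers (r0, i0, r1, i1, ...) with the
// specialisation above. `exampleComplexMultiply` is a hypothetical helper and
// assumes an AVX/FMA-capable build; illustration only.
static inline void exampleComplexMultiply (const float* a, const float* b, float* out) noexcept
{
    __m256 va = _mm256_loadu_ps (a);   // four complex numbers from a
    __m256 vb = _mm256_loadu_ps (b);   // four complex numbers from b
    _mm256_storeu_ps (out, SIMDNativeOps<float>::cmplxmul (va, vb));
}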
//==============================================================================
/** Double-precision floating point AVX intrinsics. */
template <>
struct SIMDNativeOps<double>
{
    typedef __m256d vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_AVX_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256d*> (a); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm256_broadcast_sd (&s); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE add (__m256d a, __m256d b) noexcept { return _mm256_add_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE sub (__m256d a, __m256d b) noexcept { return _mm256_sub_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE mul (__m256d a, __m256d b) noexcept { return _mm256_mul_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_and (__m256d a, __m256d b) noexcept { return _mm256_and_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_or (__m256d a, __m256d b) noexcept { return _mm256_or_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_xor (__m256d a, __m256d b) noexcept { return _mm256_xor_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_notand (__m256d a, __m256d b) noexcept { return _mm256_andnot_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE bit_not (__m256d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE min (__m256d a, __m256d b) noexcept { return _mm256_min_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE max (__m256d a, __m256d b) noexcept { return _mm256_max_pd (a, b); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE equal (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_EQ_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE notEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_NEQ_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThan (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GT_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256d a, __m256d b) noexcept { return _mm256_cmp_pd (a, b, _CMP_GE_OQ); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE multiplyAdd (__m256d a, __m256d b, __m256d c) noexcept { return _mm256_add_pd (a, _mm256_mul_pd (b, c)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupeven (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, 0); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE dupodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE swapevenodd (__m256d a) noexcept { return _mm256_shuffle_pd (a, a, (1 << 0) | (0 << 1) | (1 << 2) | (0 << 3)); }
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE oddevensum (__m256d a) noexcept { return _mm256_add_pd (_mm256_permute2f128_pd (a, a, 1), a); }

    //==============================================================================
    static forcedinline __m256d JUCE_VECTOR_CALLTYPE cmplxmul (__m256d a, __m256d b) noexcept
    {
        // (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i (ar*bi + ai*br)
        __m256d rr_ir = mul (a, dupeven (b));
        __m256d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }

    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m256d a) noexcept
    {
        __m256d retval = _mm256_hadd_pd (a, a);                   // pairwise sums within each 128-bit lane
        __m256d tmp = _mm256_permute2f128_pd (retval, retval, 1); // swap the lanes
        retval = _mm256_add_pd (retval, tmp);
        return ((double*) &retval)[0];
    }
};
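
//==============================================================================
// A minimal sketch, not part of the library: accumulating a dot product four
// doubles at a time with the specialisation above. `exampleDotProduct` is a
// hypothetical helper; `len` is assumed to be a multiple of 4 and the build is
// assumed to target AVX. Illustration only.
static inline double exampleDotProduct (const double* a, const double* b, size_t len) noexcept
{
    __m256d acc = _mm256_setzero_pd();

    for (size_t i = 0; i < len; i += 4)
        acc = SIMDNativeOps<double>::multiplyAdd (acc,
                                                  _mm256_loadu_pd (a + i),
                                                  _mm256_loadu_pd (b + i));

    return SIMDNativeOps<double>::sum (acc); // fold the four lanes into a scalar
}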
//==============================================================================
/** Signed 8-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<int8_t>
{
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm256_set1_epi8 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256()); // widen to 16-bit lanes
        __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());

        for (int i = 0; i < 3; ++i) // three pairwise passes leave one partial sum per 128-bit lane
        {
            lo = _mm256_hadd_epi16 (lo, lo);
            hi = _mm256_hadd_epi16 (hi, hi);
        }

        const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
        const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);
        return (int8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]);
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
    {
        // AVX2 has no 8-bit multiply: multiply the even and odd bytes as 16-bit
        // lanes, then re-pack the low byte of each product.
        __m256i even = _mm256_mullo_epi16 (a, b);
        __m256i odd  = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));

        return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
                                _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
    }
};
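
//==============================================================================
// A scalar reference sketch, not part of the library, for the 8-bit mul above:
// each byte product is kept modulo 256, which is exactly what the even/odd
// 16-bit multiply-and-repack trick produces. `exampleMul8Reference` is a
// hypothetical helper, useful only for illustration or testing.
static inline void exampleMul8Reference (const int8_t* a, const int8_t* b, int8_t* out) noexcept
{
    for (int i = 0; i < 32; ++i)             // 32 lanes in a 256-bit register
        out[i] = (int8_t) (a[i] * b[i]);     // keep the low 8 bits of each product
}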
//==============================================================================
/** Unsigned 8-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_AVX_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); } // flip the sign bit so signed compares give unsigned ordering
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm256_set1_epi8 ((int8_t) s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi8 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i lo = _mm256_unpacklo_epi8 (a, _mm256_setzero_si256()); // widen to 16-bit lanes
        __m256i hi = _mm256_unpackhi_epi8 (a, _mm256_setzero_si256());

        for (int i = 0; i < 3; ++i) // three pairwise passes leave one partial sum per 128-bit lane
        {
            lo = _mm256_hadd_epi16 (lo, lo);
            hi = _mm256_hadd_epi16 (hi, hi);
        }

        const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
        const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);
        return (uint8_t) (lo_ptr[0] + hi_ptr[0] + lo_ptr[16] + hi_ptr[16]);
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
    {
        // AVX2 has no 8-bit multiply: multiply the even and odd bytes as 16-bit
        // lanes, then re-pack the low byte of each product.
        __m256i even = _mm256_mullo_epi16 (a, b);
        __m256i odd  = _mm256_mullo_epi16 (_mm256_srli_epi16 (a, 8), _mm256_srli_epi16 (b, 8));

        return _mm256_or_si256 (_mm256_slli_epi16 (odd, 8),
                                _mm256_srli_epi16 (_mm256_slli_epi16 (even, 8), 8));
    }
};
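
//==============================================================================
// A scalar sketch, not part of the library, of the `ssign` trick above: XORing
// the high bit maps unsigned order onto signed order, so the signed compare
// intrinsics can serve unsigned lanes. `exampleUnsignedGreaterThan` is a
// hypothetical helper for illustration only.
static inline bool exampleUnsignedGreaterThan (uint8_t a, uint8_t b) noexcept
{
    return (int8_t) (a ^ 0x80) > (int8_t) (b ^ 0x80); // same result as (a > b)
}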
//==============================================================================
/** Signed 16-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm256_set1_epi16 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi16 (a, a);
        tmp = _mm256_hadd_epi16 (tmp, tmp);
        tmp = _mm256_hadd_epi16 (tmp, tmp);

        int16_t* ptr = reinterpret_cast<int16_t*> (&tmp);
        return (int16_t) (ptr[0] + ptr[8]);
    }
};
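
//==============================================================================
// A scalar reference sketch, not part of the library, for the 16-bit sum
// above: _mm256_hadd_epi16 only adds adjacent pairs within each 128-bit lane,
// so three passes leave one partial total per lane (elements 0 and 8), which
// the final ptr[0] + ptr[8] combines. `exampleSum16Reference` is a
// hypothetical helper for illustration only.
static inline int16_t exampleSum16Reference (const int16_t* v) noexcept
{
    int sum = 0;

    for (int i = 0; i < 16; ++i)   // 16 lanes in a 256-bit register
        sum += v[i];

    return (int16_t) sum;          // wraps modulo 2^16, matching the vector path
}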
//==============================================================================
/** Unsigned 16-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_AVX_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm256_set1_epi16 ((int16_t) s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi16 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi16 (a, a);
        tmp = _mm256_hadd_epi16 (tmp, tmp);
        tmp = _mm256_hadd_epi16 (tmp, tmp);

        uint16_t* ptr = reinterpret_cast<uint16_t*> (&tmp);
        return (uint16_t) (ptr[0] + ptr[8]);
    }
};
//==============================================================================
/** Signed 32-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm256_set1_epi32 (s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi32 (a, a);
        tmp = _mm256_hadd_epi32 (tmp, tmp);

        int32_t* ptr = reinterpret_cast<int32_t*> (&tmp);
        return ptr[0] + ptr[4];
    }
};
//==============================================================================
/** Unsigned 32-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm256_set1_epi32 ((int32_t) s); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept { return _mm256_mullo_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { return _mm256_min_epu32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { return _mm256_max_epu32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi32 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        __m256i tmp = _mm256_hadd_epi32 (a, a);
        tmp = _mm256_hadd_epi32 (tmp, tmp);

        uint32_t* ptr = reinterpret_cast<uint32_t*> (&tmp);
        return ptr[0] + ptr[4];
    }
};
//==============================================================================
/** Signed 64-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); } // mask-select: no 64-bit min intrinsic in AVX2
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
    {
       #ifdef _MSC_VER
        // Workaround for MSVC versions where _mm256_set1_epi64x is unavailable:
        // broadcast the same eight bytes reinterpreted as a double.
        __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
        return *reinterpret_cast<const __m256i*> (&tmp);
       #else
        return _mm256_set1_epi64x ((int64_t) s);
       #endif
    }

    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
        return ptr[0] + ptr[1] + ptr[2] + ptr[3];
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
    {
        // AVX2 has no 64-bit lane multiply, so fall back to scalar arithmetic.
        __m256i retval;
        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        for (int i = 0; i < 4; ++i)
            dst[i] = aptr[i] * bptr[i];

        return retval;
    }
};
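
//==============================================================================
// A sketch, not part of the library, of the mask-select pattern used by the
// 64-bit min/max above: given an all-ones/all-zeros comparison mask,
// (mask & a) | (~mask & b) picks a where the mask is set and b elsewhere.
// `exampleSelect64` is a hypothetical helper for illustration only.
static inline __m256i exampleSelect64 (__m256i mask, __m256i a, __m256i b) noexcept
{
    return _mm256_or_si256 (_mm256_and_si256 (mask, a),
                            _mm256_andnot_si256 (mask, b));
}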
//==============================================================================
/** Unsigned 64-bit integer AVX intrinsics. */
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    typedef __m256i vSIMDType;

    //==============================================================================
    DECLARE_AVX_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_AVX_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m256i*> (a); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE ssign (__m256i a) noexcept { return _mm256_xor_si256 (a, vconst (kHighBit)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE add (__m256i a, __m256i b) noexcept { return _mm256_add_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE sub (__m256i a, __m256i b) noexcept { return _mm256_sub_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_and (__m256i a, __m256i b) noexcept { return _mm256_and_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_or (__m256i a, __m256i b) noexcept { return _mm256_or_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_xor (__m256i a, __m256i b) noexcept { return _mm256_xor_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_andnot (__m256i a, __m256i b) noexcept { return _mm256_andnot_si256 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE bit_not (__m256i a) noexcept { return _mm256_andnot_si256 (a, vconst (kAllBitsSet)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE min (__m256i a, __m256i b) noexcept { __m256i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); } // mask-select: no 64-bit min intrinsic in AVX2
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE max (__m256i a, __m256i b) noexcept { __m256i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE equal (__m256i a, __m256i b) noexcept { return _mm256_cmpeq_epi64 (a, b); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThan (__m256i a, __m256i b) noexcept { return _mm256_cmpgt_epi64 (ssign (a), ssign (b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m256i a, __m256i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE multiplyAdd (__m256i a, __m256i b, __m256i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE notEqual (__m256i a, __m256i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m256i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
    {
       #ifdef _MSC_VER
        // Workaround for MSVC versions where _mm256_set1_epi64x is unavailable:
        // broadcast the same eight bytes reinterpreted as a double.
        __m256d tmp = _mm256_broadcast_sd (reinterpret_cast<const double*> (&s));
        return *reinterpret_cast<const __m256i*> (&tmp);
       #else
        return _mm256_set1_epi64x ((int64_t) s);
       #endif
    }

    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m256i a) noexcept
    {
        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
        return ptr[0] + ptr[1] + ptr[2] + ptr[3];
    }

    static forcedinline __m256i JUCE_VECTOR_CALLTYPE mul (__m256i a, __m256i b) noexcept
    {
        // AVX2 has no 64-bit lane multiply, so fall back to scalar arithmetic.
        __m256i retval;
        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        for (int i = 0; i < 4; ++i)
            dst[i] = aptr[i] * bptr[i];

        return retval;
    }
};
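
//==============================================================================
// A generic sketch, not part of the library: every specialisation above
// exposes the same static interface, so element-type-agnostic loops can be
// written once. `exampleAddArrays` is a hypothetical helper; `len` is assumed
// to be a multiple of the lane count and both pointers 32-byte aligned.
// Illustration only.
template <typename T>
static inline void exampleAddArrays (const T* a, const T* b, T* out, size_t len) noexcept
{
    using Ops = SIMDNativeOps<T>;
    using V   = typename Ops::vSIMDType;

    constexpr size_t lanes = 32 / sizeof (T); // lanes per 256-bit register

    for (size_t i = 0; i < len; i += lanes)
        *reinterpret_cast<V*> (out + i) = Ops::add (*reinterpret_cast<const V*> (a + i),
                                                    *reinterpret_cast<const V*> (b + i));
}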

#endif

} // namespace dsp
} // namespace juce