The JUCE cross-platform C++ framework, with DISTRHO/KXStudio specific changes

/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2017 - ROLI Ltd.

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
   27th April 2017).

   End User License Agreement: www.juce.com/juce-5-licence
   Privacy Policy: www.juce.com/juce-5-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/
namespace juce
{
namespace dsp
{

#ifndef DOXYGEN

#ifdef _MSC_VER
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static __declspec(align(16)) const type name [16 / sizeof (type)]

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    __declspec(align(16)) const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)]

#else
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static const type name [16 / sizeof (type)] __attribute__((aligned(16)))

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)] __attribute__((aligned(16)))

#endif
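
/*  These macros declare (inside a specialisation) and define (at namespace
    scope) a 16-byte-aligned constant that fills one 128-bit SSE register.
    A sketch of what the pair expands to on non-MSVC compilers, shown for
    illustration only:

    @code
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);
    // -> static const int32_t kAllBitsSet [4] __attribute__((aligned(16)));

    DEFINE_SSE_SIMD_CONST (int32_t, float, kAllBitsSet) = { -1, -1, -1, -1 };
    // -> const int32_t SIMDNativeOps<float>::kAllBitsSet [4] ... = { -1, -1, -1, -1 };
    @endcode

    The matching definitions (with their initialisers) live alongside this
    header, not in it, so the values shown above are an assumption for the
    sake of the example.
*/
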
template <typename type>
struct SIMDNativeOps;
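
/*  The primary template is only declared; each supported scalar type gets an
    explicit specialisation below that maps the generic operation names onto
    the matching SSE intrinsics. Higher-level wrappers (normally
    juce::dsp::SIMDRegister) dispatch through these names. A hedged usage
    sketch, not library code:

    @code
    using Ops = SIMDNativeOps<float>;
    __m128 four = Ops::add (Ops::expand (1.0f), Ops::expand (3.0f));
    @endcode
*/
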
//==============================================================================
/** Single-precision floating point SSE intrinsics. */
template <>
struct SIMDNativeOps<float>
{
    //==============================================================================
    typedef __m128 vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_and (__m128 a, __m128 b) noexcept { return _mm_and_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_or (__m128 a, __m128 b) noexcept { return _mm_or_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_xor (__m128 a, __m128 b) noexcept { return _mm_xor_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_notand (__m128 a, __m128 b) noexcept { return _mm_andnot_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_not (__m128 a) noexcept { return bit_notand (a, _mm_loadu_ps ((float*) kAllBitsSet)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE min (__m128 a, __m128 b) noexcept { return _mm_min_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE max (__m128 a, __m128 b) noexcept { return _mm_max_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE equal (__m128 a, __m128 b) noexcept { return _mm_cmpeq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept { return _mm_cmpneq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept { return _mm_cmpgt_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept { return _mm_cmpge_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); }
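
    /*  The lane helpers above treat the register as interleaved pairs: even
        lanes (0, 2) and odd lanes (1, 3). With a = [a0, a1, a2, a3]:

          dupeven (a)      -> [a0, a0, a2, a2]
          dupodd (a)       -> [a1, a1, a3, a3]
          swapevenodd (a)  -> [a1, a0, a3, a2]
          oddevensum (a)   -> [a0+a2, a1+a3, a2+a0, a3+a1]

        These are the building blocks for the complex multiply below, where
        even lanes hold real parts and odd lanes hold imaginary parts.
    */
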
    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
    {
        __m128 rr_ir = mul (a, dupeven (b));
        __m128 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, _mm_loadu_ps ((float*) kEvenHighBit)));
    }
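
    /*  How cmplxmul works, with a = [ar0, ai0, ar1, ai1] and b likewise
        (two complex numbers per register, real parts in the even lanes):

          rr_ir = [ar0*br0, ai0*br0, ar1*br1, ai1*br1]
          ii_ri = [ai0*bi0, ar0*bi0, ai1*bi1, ar1*bi1]

        XORing ii_ri with kEvenHighBit flips the sign of the even lanes, so
        the final add yields [ar0*br0 - ai0*bi0, ai0*br0 + ar0*bi0, ...],
        i.e. the usual (ac - bd) + i (ad + bc).
    */
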
    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m128 a) noexcept
    {
       #if defined(__SSE4_1__)
        __m128 retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff);
       #elif defined(__SSE3__)
        __m128 retval = _mm_hadd_ps (_mm_hadd_ps (a, a), a);
       #else
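        // SSE2 fallback: shuffle 0x4e swaps the two 64-bit halves, so the
        // first add produces [a0+a2, a1+a3, a2+a0, a3+a1]; shuffle 0xb1 then
        // swaps adjacent lanes, leaving the full horizontal sum in every lane.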
        __m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
        retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
       #endif
        return ((float*) &retval) [0];
    }
};
//==============================================================================
/** Double-precision floating point SSE intrinsics. */
template <>
struct SIMDNativeOps<double>
{
    //==============================================================================
    typedef __m128d vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_and (__m128d a, __m128d b) noexcept { return _mm_and_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_or (__m128d a, __m128d b) noexcept { return _mm_or_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_xor (__m128d a, __m128d b) noexcept { return _mm_xor_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_notand (__m128d a, __m128d b) noexcept { return _mm_andnot_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_not (__m128d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE min (__m128d a, __m128d b) noexcept { return _mm_min_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE max (__m128d a, __m128d b) noexcept { return _mm_max_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE equal (__m128d a, __m128d b) noexcept { return _mm_cmpeq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept { return _mm_cmpneq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept { return _mm_cmpgt_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept { return _mm_cmpge_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE oddevensum (__m128d a) noexcept { return a; }

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
    {
        __m128d rr_ir = mul (a, dupeven (b));
        __m128d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }

    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m128d a) noexcept
    {
       #if defined(__SSE4_1__)
        __m128d retval = _mm_dp_pd (a, vconst (kOne), 0xff);
       #elif defined(__SSE3__)
        __m128d retval = _mm_hadd_pd (a, a);
       #else
        __m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);
       #endif
        return ((double*) &retval) [0];
    }
};
//==============================================================================
/** Signed 8-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

   #if defined(__SSE4_1__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi8 (a, b); }
   #else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
        const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
       #else
        int8_t sum = 0;
        const int8_t* src = reinterpret_cast<const int8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int8_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }
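
    /*  mul() below has no single-instruction SSE equivalent for 8-bit lanes,
        so it works in 16-bit lanes: the even bytes are multiplied in place
        (the low byte of each 16-bit product is the wrapped 8-bit result),
        the odd bytes are shifted down and multiplied, and the two interleaved
        results are recombined with shifts and a mask.
    */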
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};
//==============================================================================
/** Unsigned 8-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
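
    /*  ssign flips the top bit of each lane, which maps unsigned ordering
        onto signed ordering: SSE2 has no unsigned compares, but after the
        XOR a signed _mm_cmpgt gives the unsigned result. For bytes,
        0x00..0xFF maps to -128..127, so 0xFF (255) correctly compares
        greater than 0x00 (0). The 16-, 32- and 64-bit unsigned
        specialisations below use the same trick.
    */
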
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
        const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
       #else
        uint8_t sum = 0;
        const uint8_t* src = reinterpret_cast<const uint8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint8_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};
//==============================================================================
/** Signed 16-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        return *reinterpret_cast<int16_t*> (&tmp);
       #else
        int16_t sum = 0;
        const int16_t* src = reinterpret_cast<const int16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int16_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }
};

//==============================================================================
/** Unsigned 16-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

   #if defined(__SSE4_1__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
   #else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        return *reinterpret_cast<uint16_t*> (&tmp);
       #else
        uint16_t sum = 0;
        const uint16_t* src = reinterpret_cast<const uint16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint16_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }
};
//==============================================================================
/** Signed 32-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);
        return *reinterpret_cast<int32_t*> (&tmp);
       #else
        int32_t sum = 0;
        const int32_t* src = reinterpret_cast<const int32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int32_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }
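
    /*  Before SSE4.1 there is no 32-bit lane multiply, so the fallback in
        mul() below uses _mm_mul_epu32, which multiplies lanes 0 and 2 into
        two 64-bit products. Shifting both inputs right by 4 bytes does the
        same for lanes 1 and 3; the two shuffles then gather the low 32 bits
        of each product and re-interleave them, which is also the correct
        wrapped result for signed values.
    */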
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));

        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};
//==============================================================================
/** Unsigned 32-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);
        return *reinterpret_cast<uint32_t*> (&tmp);
       #else
        uint32_t sum = 0;
        const uint32_t* src = reinterpret_cast<const uint32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint32_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));

        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_min_epu32 (a, b);
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epu32 (a, b);
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};
//==============================================================================
/** Signed 64-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
    {
        __m128i retval;
        int64_t* ptr = reinterpret_cast<int64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;
        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }
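
    /*  equal() below has no 64-bit compare before SSE4.1, so the fallback
        compares the four 32-bit lanes, ANDs each lane with its 32-bit
        partner (both halves of a 64-bit value must match), then broadcasts
        the combined mask back to both halves of each 64-bit lane. The
        unsigned specialisation below reuses the same approach.
    */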
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_2__) && !defined(__clang__)
        return _mm_cmpgt_epi64 (a, b);
       #else
        __m128i retval;
        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? -1LL : 0;

        return retval;
       #endif
    }
};
//==============================================================================
/** Unsigned 64-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
    {
        __m128i retval;
        uint64_t* ptr = reinterpret_cast<uint64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;
        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_2__) && !defined(__clang__)
        return _mm_cmpgt_epi64 (ssign (a), ssign (b));
       #else
        __m128i retval;
        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? (uint64_t) -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? (uint64_t) -1LL : 0;

        return retval;
       #endif
    }
};
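
/*  A hedged usage sketch (not part of the library): these structs are
    normally reached through juce::dsp::SIMDRegister rather than used
    directly, but the mapping is mechanical, e.g.:

    @code
    using Ops = juce::dsp::SIMDNativeOps<float>;
    __m128 a = Ops::expand (2.0f);              // [2, 2, 2, 2]
    __m128 b = Ops::expand (3.0f);              // [3, 3, 3, 3]
    float total = Ops::sum (Ops::mul (a, b));   // four lanes of 6 -> 24
    @endcode
*/
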
#endif

} // namespace dsp
} // namespace juce