The JUCE cross-platform C++ framework, with DISTRHO/KXStudio specific changes

/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2017 - ROLI Ltd.

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
   27th April 2017).

   End User License Agreement: www.juce.com/juce-5-licence
   Privacy Policy: www.juce.com/juce-5-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/

#ifndef DOXYGEN

#ifdef _MSC_VER
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static __declspec(align(16)) const type name [16 / sizeof (type)]

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    __declspec(align(16)) const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)]
#else
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static const type name [16 / sizeof (type)] __attribute__((aligned(16)))

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)] __attribute__((aligned(16)))
#endif
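
// A hedged sketch (not part of this header) of how the matching
// DEFINE_SSE_SIMD_CONST definitions are expected to appear in a companion
// source file. The concrete initialiser values shown here are illustrative
// assumptions only:
//
//     DEFINE_SSE_SIMD_CONST (int32_t, float, kAllBitsSet) = { -1, -1, -1, -1 };
//     DEFINE_SSE_SIMD_CONST (float,   float, kOne)        = { 1.0f, 1.0f, 1.0f, 1.0f };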
template <typename type>
struct SIMDNativeOps;

//==============================================================================
/** Single-precision floating point SSE intrinsics. */
template <>
struct SIMDNativeOps<float>
{
    //==============================================================================
    typedef __m128 vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_and (__m128 a, __m128 b) noexcept { return _mm_and_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_or (__m128 a, __m128 b) noexcept { return _mm_or_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_xor (__m128 a, __m128 b) noexcept { return _mm_xor_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_notand (__m128 a, __m128 b) noexcept { return _mm_andnot_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_not (__m128 a) noexcept { return bit_notand (a, _mm_loadu_ps ((float*) kAllBitsSet)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE min (__m128 a, __m128 b) noexcept { return _mm_min_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE max (__m128 a, __m128 b) noexcept { return _mm_max_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE equal (__m128 a, __m128 b) noexcept { return _mm_cmpeq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept { return _mm_cmpneq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept { return _mm_cmpgt_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept { return _mm_cmpge_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); }

    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
    {
        __m128 rr_ir = mul (a, dupeven (b));
        __m128 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, _mm_loadu_ps ((float*) kEvenHighBit)));
    }

    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m128 a) noexcept
    {
       #if defined(__SSE4__)
        __m128 retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff);
       #elif defined(__SSE3__)
        __m128 retval = _mm_hadd_ps (_mm_hadd_ps (a, a), a);
       #else
        __m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
        retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
       #endif
        return ((float*) &retval) [0];
    }
};
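
// Minimal usage sketch (the helper name is hypothetical and not part of the
// library; it assumes the SSE intrinsic headers and the DEFINE_SSE_SIMD_CONST
// definitions are provided elsewhere in this module): broadcast, add and
// horizontally sum four floats.
static inline float exampleSumOfFourFloats() noexcept
{
    __m128 v = SIMDNativeOps<float>::expand (1.5f);                          // { 1.5, 1.5, 1.5, 1.5 }
    v = SIMDNativeOps<float>::add (v, SIMDNativeOps<float>::expand (0.5f));  // { 2.0, 2.0, 2.0, 2.0 }
    return SIMDNativeOps<float>::sum (v);                                    // 8.0
}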
//==============================================================================
/** Double-precision floating point SSE intrinsics. */
template <>
struct SIMDNativeOps<double>
{
    //==============================================================================
    typedef __m128d vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_and (__m128d a, __m128d b) noexcept { return _mm_and_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_or (__m128d a, __m128d b) noexcept { return _mm_or_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_xor (__m128d a, __m128d b) noexcept { return _mm_xor_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_notand (__m128d a, __m128d b) noexcept { return _mm_andnot_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_not (__m128d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE min (__m128d a, __m128d b) noexcept { return _mm_min_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE max (__m128d a, __m128d b) noexcept { return _mm_max_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE equal (__m128d a, __m128d b) noexcept { return _mm_cmpeq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept { return _mm_cmpneq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept { return _mm_cmpgt_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept { return _mm_cmpge_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); }
    static forcedinline __m128d oddevensum (__m128d a) noexcept { return a; }

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
    {
        __m128d rr_ir = mul (a, dupeven (b));
        __m128d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }

    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m128d a) noexcept
    {
       #if defined(__SSE4__)
        __m128d retval = _mm_dp_pd (a, vconst (kOne), 0xff);
       #elif defined(__SSE3__)
        __m128d retval = _mm_hadd_pd (a, a);
       #else
        __m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);
       #endif
        return ((double*) &retval) [0];
    }
};
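
// Minimal usage sketch (hypothetical helper, not part of the library): a
// __m128d is treated as one complex number laid out as { real, imag }, so
// cmplxmul() computes (a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re).
static inline void exampleComplexMulDouble() noexcept
{
    const double a[2] = { 1.0, 2.0 };   // 1 + 2i
    const double b[2] = { 3.0, 4.0 };   // 3 + 4i
    double c[2];
    _mm_storeu_pd (c, SIMDNativeOps<double>::cmplxmul (_mm_loadu_pd (a), _mm_loadu_pd (b)));
    // c now holds { -5.0, 10.0 }, i.e. -5 + 10i
}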
//==============================================================================
/** Signed 8-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

   #if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi8 (a, b); }
   #else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
        const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
       #else
        int8_t sum = 0;
        const int8_t* src = reinterpret_cast<const int8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int8_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};
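
// Minimal usage sketch (hypothetical helper): the 8-bit mul() above keeps only
// the low byte of each 16-bit product, so results wrap modulo 256.
static inline int8_t exampleInt8Mul() noexcept
{
    __m128i c = SIMDNativeOps<int8_t>::mul (SIMDNativeOps<int8_t>::expand (10),
                                            SIMDNativeOps<int8_t>::expand (13));
    return reinterpret_cast<const int8_t*> (&c)[0];   // 130 wraps to -126 in every lane
}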
//==============================================================================
/** Unsigned 8-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
        const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
       #else
        uint8_t sum = 0;
        const uint8_t* src = reinterpret_cast<const uint8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int8_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};

//==============================================================================
/** Signed 16-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return *reinterpret_cast<int16_t*> (&tmp);
       #else
        int16_t sum = 0;
        const int16_t* src = reinterpret_cast<const int16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int16_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }
};

//==============================================================================
/** Unsigned 16-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

   #if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
   #else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return *reinterpret_cast<uint16_t*> (&tmp);
       #else
        uint16_t sum = 0;
        const uint16_t* src = reinterpret_cast<const uint16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(uint16_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }
};

//==============================================================================
/** Signed 32-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);

        return *reinterpret_cast<int32_t*> (&tmp);
       #else
        int32_t sum = 0;
        const int32_t* src = reinterpret_cast<const int32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(int32_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
        __m128i even = _mm_mul_epu32 (a,b);
        __m128i odd = _mm_mul_epu32 (_mm_srli_si128 (a,4), _mm_srli_si128 (b,4));
        return _mm_unpacklo_epi32 (_mm_shuffle_epi32(even, _MM_SHUFFLE (0,0,2,0)),
                                   _mm_shuffle_epi32(odd, _MM_SHUFFLE (0,0,2,0)));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};
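
// Minimal usage sketch (hypothetical helper): with or without SSE4.1, mul()
// above returns the low 32 bits of each per-lane product, e.g.
// {1,2,3,4} * {5,6,7,8} -> {5,12,21,32}.
static inline int32_t exampleInt32MulSum() noexcept
{
    __m128i a = _mm_setr_epi32 (1, 2, 3, 4);
    __m128i b = _mm_setr_epi32 (5, 6, 7, 8);
    __m128i c = SIMDNativeOps<int32_t>::mul (a, b);
    return SIMDNativeOps<int32_t>::sum (c);   // 5 + 12 + 21 + 32 = 70
}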
//==============================================================================
/** Unsigned 32-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);

        return *reinterpret_cast<uint32_t*> (&tmp);
       #else
        uint32_t sum = 0;
        const uint32_t* src = reinterpret_cast<const uint32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof(uint32_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
        __m128i even = _mm_mul_epu32 (a,b);
        __m128i odd = _mm_mul_epu32 (_mm_srli_si128 (a,4), _mm_srli_si128 (b,4));
        return _mm_unpacklo_epi32 (_mm_shuffle_epi32(even, _MM_SHUFFLE (0,0,2,0)),
                                   _mm_shuffle_epi32(odd, _MM_SHUFFLE (0,0,2,0)));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};

//==============================================================================
/** Signed 64-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
    {
        __m128i retval;
        int64_t* ptr = reinterpret_cast<int64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;

        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);

        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__) && !defined(__clang__)
        return _mm_cmpgt_epi64 (a, b);
       #else
        __m128i retval;

        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);

        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? -1LL : 0;

        return retval;
       #endif
    }
};

//==============================================================================
/** Unsigned 64-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
    {
        __m128i retval;
        uint64_t* ptr = reinterpret_cast<uint64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a,b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;

        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);

        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__) && !defined(__clang__)
        return _mm_cmpgt_epi64 (a, b);
       #else
        __m128i retval;

        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);

        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? (uint64_t) -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? (uint64_t) -1LL : 0;

        return retval;
       #endif
    }
};

#endif