/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2022 - Raw Material Software Limited

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 7 End-User License
   Agreement and JUCE Privacy Policy.

   End User License Agreement: www.juce.com/juce-7-licence
   Privacy Policy: www.juce.com/juce-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/
namespace juce
{
namespace dsp
{

#ifndef DOXYGEN

JUCE_BEGIN_IGNORE_WARNINGS_GCC_LIKE ("-Wignored-attributes")

#ifdef _MSC_VER
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static __declspec(align(16)) const type name [16 / sizeof (type)]

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    __declspec(align(16)) const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)]
#else
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static const type name [16 / sizeof (type)] __attribute__((aligned(16)))

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)] __attribute__((aligned(16)))
#endif

template <typename type>
struct SIMDNativeOps;
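
// Each specialisation below maps this interface onto the SSE/SSE2 intrinsics for
// one scalar type. The DECLARE/DEFINE macros above give the per-type lookup
// constants (all-ones masks, sign bits, etc.) the 16-byte alignment that the
// aligned _mm_load_* intrinsics require.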
//==============================================================================
/** Single-precision floating point SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<float>
{
    //==============================================================================
    using vSIMDType = __m128;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_and (__m128 a, __m128 b) noexcept { return _mm_and_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_or (__m128 a, __m128 b) noexcept { return _mm_or_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_xor (__m128 a, __m128 b) noexcept { return _mm_xor_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_notand (__m128 a, __m128 b) noexcept { return _mm_andnot_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_not (__m128 a) noexcept { return bit_notand (a, _mm_loadu_ps ((float*) kAllBitsSet)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE min (__m128 a, __m128 b) noexcept { return _mm_min_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE max (__m128 a, __m128 b) noexcept { return _mm_max_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE equal (__m128 a, __m128 b) noexcept { return _mm_cmpeq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept { return _mm_cmpneq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept { return _mm_cmpgt_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept { return _mm_cmpge_ps (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128 a, __m128 b) noexcept { return (_mm_movemask_ps (equal (a, b)) == 0xf); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }    // (a0, a0, a2, a2)
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }     // (a1, a1, a3, a3)
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); } // (a1, a0, a3, a2)
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); } // adds the high pair onto the low pair
    static forcedinline float JUCE_VECTOR_CALLTYPE get (__m128 v, size_t i) noexcept { return SIMDFallbackOps<float, __m128>::get (v, i); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE set (__m128 v, size_t i, float s) noexcept { return SIMDFallbackOps<float, __m128>::set (v, i, s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE truncate (__m128 a) noexcept { return _mm_cvtepi32_ps (_mm_cvttps_epi32 (a)); }

    //==============================================================================
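    // cmplxmul treats the vector as interleaved complex numbers (re, im, re, im).
    // With b duplicated into (br, br) and (bi, bi) forms, the product
    // (ar*br - ai*bi, ar*bi + ai*br) falls out of two lane-wise multiplies plus a
    // sign flip of the even lanes (kEvenHighBit toggles their sign bits).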
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
    {
        __m128 rr_ir = mul (a, dupeven (b));
        __m128 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, _mm_loadu_ps ((float*) kEvenHighBit)));
    }
    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m128 a) noexcept
    {
       #if defined(__SSE4__)
        // A dot product against (1, 1, 1, 1) broadcasts the horizontal sum to every lane.
        const auto retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff);
       #elif defined(__SSE3__)
        // Duplicate the odd lanes, add, then fold the high half onto the low lane.
        const auto shuffled = _mm_movehdup_ps (a);
        const auto sums = _mm_add_ps (a, shuffled);
        const auto retval = _mm_add_ss (sums, _mm_movehl_ps (shuffled, sums));
       #else
        // Plain SSE: two shuffle-and-add rounds leave the total in every lane.
        auto retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
        retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
       #endif
        return _mm_cvtss_f32 (retval);
    }
};
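
// A minimal usage sketch (illustrative only: the buffer and values here are
// hypothetical, and in practice these ops are normally reached through
// dsp::SIMDRegister rather than called directly):
//
//     alignas (16) float data[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
//     auto v = SIMDNativeOps<float>::load (data);
//     v = SIMDNativeOps<float>::mul (v, SIMDNativeOps<float>::expand (2.0f));
//     SIMDNativeOps<float>::store (v, data);              // data == { 2, 4, 6, 8 }
//     auto total = SIMDNativeOps<float>::sum (v);         // 20.0f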
//==============================================================================
/** Double-precision floating point SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<double>
{
    //==============================================================================
    using vSIMDType = __m128d;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return load (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return _mm_castsi128_pd (_mm_load_si128 (reinterpret_cast<const __m128i*> (a))); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_and (__m128d a, __m128d b) noexcept { return _mm_and_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_or (__m128d a, __m128d b) noexcept { return _mm_or_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_xor (__m128d a, __m128d b) noexcept { return _mm_xor_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_notand (__m128d a, __m128d b) noexcept { return _mm_andnot_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_not (__m128d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE min (__m128d a, __m128d b) noexcept { return _mm_min_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE max (__m128d a, __m128d b) noexcept { return _mm_max_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE equal (__m128d a, __m128d b) noexcept { return _mm_cmpeq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept { return _mm_cmpneq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept { return _mm_cmpgt_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept { return _mm_cmpge_pd (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128d a, __m128d b) noexcept { return (_mm_movemask_pd (equal (a, b)) == 0x3); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }  // (a0, a0)
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }   // (a1, a1)
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); } // (a1, a0)
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE oddevensum (__m128d a) noexcept { return a; } // only one (even, odd) pair per register, so nothing to fold
    static forcedinline double JUCE_VECTOR_CALLTYPE get (__m128d v, size_t i) noexcept { return SIMDFallbackOps<double, __m128d>::get (v, i); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE set (__m128d v, size_t i, double s) noexcept { return SIMDFallbackOps<double, __m128d>::set (v, i, s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE truncate (__m128d a) noexcept { return _mm_cvtepi32_pd (_mm_cvttpd_epi32 (a)); }

    //==============================================================================
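    // Same interleaved complex-multiply trick as the float specialisation above,
    // with a single (re, im) pair per register.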
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
    {
        __m128d rr_ir = mul (a, dupeven (b));
        __m128d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }
    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m128d a) noexcept
    {
       #if defined(__SSE4__)
        __m128d retval = _mm_dp_pd (a, vconst (kOne), 0xff);            // dot product against (1, 1)
       #elif defined(__SSE3__)
        __m128d retval = _mm_hadd_pd (a, a);                            // horizontal add
       #else
        __m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);   // swap lanes, then add
       #endif
        return _mm_cvtsd_f64 (retval);
    }
};
//==============================================================================
/** Signed 8-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int8_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int8_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
   #if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi8 (a, b); }
   #else
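    // Pre-SSE4 fallback: build an all-ones/all-zeros compare mask and blend, since
    // (mask & a) | (~mask & b) picks a where the comparison held and b elsewhere.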
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int8_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int8_t s) noexcept { return SIMDFallbackOps<int8_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        // Widen the sixteen bytes into 16-bit lanes, then three rounds of horizontal
        // adds collapse each half to a single total in its lowest lane.
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        return static_cast<int8_t> ((_mm_cvtsi128_si32 (lo) & 0xff) + (_mm_cvtsi128_si32 (hi) & 0xff));
       #else
        return SIMDFallbackOps<int8_t, __m128i>::sum (a);
       #endif
    }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
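        // SSE2 has no 8-bit multiply, so this uses two 16-bit multiplies: "even"
        // leaves each even-indexed byte's product in the low byte of its word, while
        // "odd" shifts the odd bytes down first. The low bytes of both are then
        // re-interleaved into the result.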
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};
//==============================================================================
/** Unsigned 8-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return load (a); }
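    // SSE2 only has signed byte comparisons; flipping the top bit with ssign maps
    // unsigned ordering onto signed ordering, so greaterThan can compare the
    // flipped values instead.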
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint8_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint8_t s) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        // Same widen-and-horizontal-add reduction as the int8_t specialisation.
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        return static_cast<uint8_t> ((static_cast<uint32_t> (_mm_cvtsi128_si32 (lo)) & 0xffu)
                                   + (static_cast<uint32_t> (_mm_cvtsi128_si32 (hi)) & 0xffu));
       #else
        return SIMDFallbackOps<uint8_t, __m128i>::sum (a);
       #endif
    }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply (same even/odd 16-bit trick as the int8_t specialisation)
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};
//==============================================================================
/** Signed 16-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int16_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int16_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int16_t s) noexcept { return SIMDFallbackOps<int16_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        // Three horizontal adds collapse the eight 16-bit lanes to one total.
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return static_cast<int16_t> (_mm_cvtsi128_si32 (tmp) & 0xffff);
       #else
        return SIMDFallbackOps<int16_t, __m128i>::sum (a);
       #endif
    }
};
//==============================================================================
/** Unsigned 16-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint16_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

   #if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
   #else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint16_t s) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return static_cast<uint16_t> (static_cast<uint32_t> (_mm_cvtsi128_si32 (tmp)) & 0xffffu);
       #else
        return SIMDFallbackOps<uint16_t, __m128i>::sum (a);
       #endif
    }
};
//==============================================================================
/** Signed 32-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int32_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int32_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int32_t s) noexcept { return SIMDFallbackOps<int32_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        // Two horizontal adds collapse the four 32-bit lanes to one total.
        __m128i tmp = _mm_hadd_epi32 (a, a);
        return _mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp));
       #else
        return SIMDFallbackOps<int32_t, __m128i>::sum (a);
       #endif
    }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
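        // SSE2 has no 32-bit low multiply; _mm_mul_epu32 multiplies only the even
        // lanes (producing 64-bit results), so run it twice -- once shifted to reach
        // the odd lanes -- and interleave the low 32 bits of each product.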
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));

        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
       #endif
    }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};
//==============================================================================
/** Unsigned 32-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); } // map unsigned order onto signed order
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint32_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint32_t s) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        return static_cast<uint32_t> (_mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp)));
       #else
        return SIMDFallbackOps<uint32_t, __m128i>::sum (a);
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
        // Same even/odd _mm_mul_epu32 trick as the int32_t specialisation.
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));

        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
       #endif
    }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_min_epu32 (a, b);   // unsigned min: _mm_min_epi32 would mis-order values with the top bit set
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epu32 (a, b);   // unsigned max
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};
//==============================================================================
/** Signed 64-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { return _mm_set1_epi64x (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int64_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int64_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int64_t s) noexcept { return SIMDFallbackOps<int64_t, __m128i>::set (v, i, s); }
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<int64_t, __m128i>::sum (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<int64_t, __m128i>::mul (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
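        // SSE2 emulation: compare the 32-bit halves, AND each half's result with its
        // partner's so a 64-bit lane reads all-ones only when both halves matched.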
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_2__)
        return _mm_cmpgt_epi64 (a, b);
       #else
        return SIMDFallbackOps<int64_t, __m128i>::greaterThan (a, b);
       #endif
    }
};
//==============================================================================
/** Unsigned 64-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { return _mm_set1_epi64x ((int64_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); } // map unsigned order onto signed order
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint64_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint64_t s) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::set (v, i, s); }
    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::sum (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::mul (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
        // Same 32-bit pairwise emulation as the int64_t specialisation above.
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_2__)
        return _mm_cmpgt_epi64 (ssign (a), ssign (b));
       #else
        return SIMDFallbackOps<uint64_t, __m128i>::greaterThan (a, b);
       #endif
    }
};

#endif

JUCE_END_IGNORE_WARNINGS_GCC_LIKE

} // namespace dsp
} // namespace juce