The JUCE cross-platform C++ framework, with DISTRHO/KXStudio specific changes

/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2017 - ROLI Ltd.

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
   27th April 2017).

   End User License Agreement: www.juce.com/juce-5-licence
   Privacy Policy: www.juce.com/juce-5-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/
namespace juce
{
namespace dsp
{

#ifndef DOXYGEN

#if JUCE_GCC && (__GNUC__ >= 6)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wignored-attributes"
#endif

#ifdef _MSC_VER
#define DECLARE_SSE_SIMD_CONST(type, name) \
    static __declspec(align(16)) const type name [16 / sizeof (type)]

#define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    __declspec(align(16)) const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)]
#else
#define DECLARE_SSE_SIMD_CONST(type, name) \
    static const type name [16 / sizeof (type)] __attribute__((aligned(16)))

#define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)] __attribute__((aligned(16)))
#endif

template <typename type>
struct SIMDNativeOps;
//==============================================================================
/** Single-precision floating point SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<float>
{
    //==============================================================================
    typedef __m128 vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_and (__m128 a, __m128 b) noexcept { return _mm_and_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_or (__m128 a, __m128 b) noexcept { return _mm_or_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_xor (__m128 a, __m128 b) noexcept { return _mm_xor_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_notand (__m128 a, __m128 b) noexcept { return _mm_andnot_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_not (__m128 a) noexcept { return bit_notand (a, _mm_loadu_ps ((float*) kAllBitsSet)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE min (__m128 a, __m128 b) noexcept { return _mm_min_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE max (__m128 a, __m128 b) noexcept { return _mm_max_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE equal (__m128 a, __m128 b) noexcept { return _mm_cmpeq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept { return _mm_cmpneq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept { return _mm_cmpgt_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept { return _mm_cmpge_ps (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128 a, __m128 b) noexcept { return (_mm_movemask_ps (equal (a, b)) == 0xf); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); }

    //==============================================================================
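    /*  cmplxmul below treats the vector as interleaved complex numbers (re, im).
        In scalar terms, for one element pair a = (ar, ai) and b = (br, bi):

            rr_ir = a * dupeven (b)               ->  (ar*br, ai*br)
            ii_ri = swapevenodd (a) * dupodd (b)  ->  (ai*bi, ar*bi)

        kEvenHighBit (defined in the matching .cpp) presumably carries the sign bit
        in the even lanes only, so the xor flips the sign of ai*bi and the final add
        gives (ar*br - ai*bi, ai*br + ar*bi), i.e. the usual complex product.
    */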
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
    {
        __m128 rr_ir = mul (a, dupeven (b));
        __m128 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, _mm_loadu_ps ((float*) kEvenHighBit)));
    }
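    /*  sum needs a horizontal add. With SSE4 it can use a dot product against a
        vector of ones, with SSE3 two hadds; the plain-SSE fallback below folds the
        register onto itself: the first shuffle (0x4e) swaps the two 64-bit halves
        and the second (0xb1) swaps neighbouring lanes, so after the two adds every
        lane holds a[0] + a[1] + a[2] + a[3].
    */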
    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m128 a) noexcept
    {
#if defined(__SSE4__)
        __m128 retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff);
#elif defined(__SSE3__)
        __m128 retval = _mm_hadd_ps (_mm_hadd_ps (a, a), a);
#else
        __m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
        retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
#endif
        return ((float*) &retval) [0];
    }
};
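/*  A minimal usage sketch (assuming SSE is available and the buffers are 16-byte
    aligned, since load/store map to the aligned _mm_load_ps/_mm_store_ps):

        alignas (16) float in1[4], in2[4], out[4];
        using Ops = SIMDNativeOps<float>;
        auto v = Ops::multiplyAdd (Ops::expand (1.0f), Ops::load (in1), Ops::load (in2));
        Ops::store (v, out);   // out[i] = 1.0f + in1[i] * in2[i]

    In JUCE client code these specialisations are normally reached through the
    higher-level dsp::SIMDRegister wrapper rather than called directly.
*/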
//==============================================================================
/** Double-precision floating point SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<double>
{
    //==============================================================================
    typedef __m128d vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_and (__m128d a, __m128d b) noexcept { return _mm_and_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_or (__m128d a, __m128d b) noexcept { return _mm_or_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_xor (__m128d a, __m128d b) noexcept { return _mm_xor_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_notand (__m128d a, __m128d b) noexcept { return _mm_andnot_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_not (__m128d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE min (__m128d a, __m128d b) noexcept { return _mm_min_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE max (__m128d a, __m128d b) noexcept { return _mm_max_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE equal (__m128d a, __m128d b) noexcept { return _mm_cmpeq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept { return _mm_cmpneq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept { return _mm_cmpgt_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept { return _mm_cmpge_pd (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128d a, __m128d b) noexcept { return (_mm_movemask_pd (equal (a, b)) == 0x3); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); }
    static forcedinline __m128d oddevensum (__m128d a) noexcept { return a; }

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
    {
        __m128d rr_ir = mul (a, dupeven (b));
        __m128d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }

    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m128d a) noexcept
    {
#if defined(__SSE4__)
        __m128d retval = _mm_dp_pd (a, vconst (kOne), 0xff);
#elif defined(__SSE3__)
        __m128d retval = _mm_hadd_pd (a, a);
#else
        __m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);
#endif
        return ((double*) &retval) [0];
    }
};
//==============================================================================
/** Signed 8-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

#if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi8 (a, b); }
#else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
#endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);
        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int8_t* dest) noexcept
    {
        SIMDFallbackOps<int8_t, __m128i>::store (value, dest);
    }

    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
        const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
#else
        int8_t sum = 0;
        const int8_t* src = reinterpret_cast<const int8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int8_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }
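    /*  There is no 8-bit multiply in SSE, so mul below builds one out of 16-bit
        multiplies: _mm_mullo_epi16 on the raw registers leaves the correct low byte
        of every even-indexed element in place, the odd-indexed bytes are brought
        down with a right shift by 8 and multiplied the same way, and the two halves
        are then merged (odd products shifted back up, even products masked to their
        low byte). Only the low 8 bits of each product are kept.
    */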
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};
//==============================================================================
/** Unsigned 8-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
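    /*  SSE only offers signed byte comparisons, so ssign below xors each element
        with kHighBit (presumably 0x80 in every lane, defined in the matching .cpp).
        That remaps unsigned ordering onto signed ordering, which lets greaterThan
        reuse _mm_cmpgt_epi8 on the adjusted values.
    */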
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);
        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint8_t* dest) noexcept
    {
        SIMDFallbackOps<uint8_t, __m128i>::store (value, dest);
    }

    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
        const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
#else
        uint8_t sum = 0;
        const uint8_t* src = reinterpret_cast<const uint8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int8_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};
//==============================================================================
/** Signed 16-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
    {
        return _mm_set_epi16 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int16_t* dest) noexcept
    {
        SIMDFallbackOps<int16_t, __m128i>::store (value, dest);
    }

    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return *reinterpret_cast<int16_t*> (&tmp);
#else
        int16_t sum = 0;
        const int16_t* src = reinterpret_cast<const int16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int16_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }
};
//==============================================================================
/** Unsigned 16-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

#if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
#else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
#endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int16_t*> (a);
        return _mm_set_epi16 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint16_t* dest) noexcept
    {
        SIMDFallbackOps<uint16_t, __m128i>::store (value, dest);
    }

    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return *reinterpret_cast<uint16_t*> (&tmp);
#else
        uint16_t sum = 0;
        const uint16_t* src = reinterpret_cast<const uint16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint16_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }
};
//==============================================================================
/** Signed 32-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_set_epi32 (a[3], a[2], a[1], a[0]); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept
    {
        SIMDFallbackOps<int32_t, __m128i>::store (value, dest);
    }

    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);

        return *reinterpret_cast<int32_t*> (&tmp);
#else
        int32_t sum = 0;
        const int32_t* src = reinterpret_cast<const int32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int32_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }
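    /*  Without SSE4.1 there is no _mm_mullo_epi32, so the mul fallback below leans
        on _mm_mul_epu32, which only multiplies lanes 0 and 2 into 64-bit results:
        the even lanes are multiplied directly, the odd lanes after shifting the
        registers right by four bytes, and the low 32 bits of each product are then
        shuffled back into their original lane order. Keeping only the low 32 bits
        also makes the result valid for signed operands.
    */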
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
#else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));

        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
#else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
#else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
#endif
    }
};
//==============================================================================
/** Unsigned 32-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int32_t*> (a);
        return _mm_set_epi32 (b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint32_t* dest) noexcept
    {
        SIMDFallbackOps<uint32_t, __m128i>::store (value, dest);
    }

    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);

        return *reinterpret_cast<uint32_t*> (&tmp);
#else
        uint32_t sum = 0;
        const uint32_t* src = reinterpret_cast<const uint32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint32_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
#else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));

        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
#else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
#else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
#endif
    }
};
//==============================================================================
/** Signed 64-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
    {
        __m128i retval;
        int64_t* ptr = reinterpret_cast<int64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_set_epi64x (a[1], a[0]); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept
    {
        SIMDFallbackOps<int64_t, __m128i>::store (value, dest);
    }

    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;

        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }
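    /*  SSE2 has no 64-bit compares. Without SSE4.1, equal below is assembled from a
        32-bit compare: each 32-bit lane is tested, the result is ANDed with its
        neighbouring lane (the first shuffle swaps adjacent 32-bit lanes), so a
        64-bit element only reads as equal when both of its halves matched, and the
        final shuffle broadcasts that verdict across the whole 64-bit lane.
    */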
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
#else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__) && !defined(__clang__)
        return _mm_cmpgt_epi64 (a, b);
#else
        __m128i retval;

        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? -1LL : 0;

        return retval;
#endif
    }
};
//==============================================================================
/** Unsigned 64-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
    {
        __m128i retval;
        uint64_t* ptr = reinterpret_cast<uint64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int64_t*> (a);
        return _mm_set_epi64x (b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint64_t* dest) noexcept
    {
        SIMDFallbackOps<uint64_t, __m128i>::store (value, dest);
    }

    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;

        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
#else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__) && !defined(__clang__)
        return _mm_cmpgt_epi64 (a, b);
#else
        __m128i retval;

        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? (uint64_t) -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? (uint64_t) -1LL : 0;

        return retval;
#endif
    }
};

#endif

#if JUCE_GCC && (__GNUC__ >= 6)
#pragma GCC diagnostic pop
#endif

} // namespace dsp
} // namespace juce