/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2017 - ROLI Ltd.

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
   27th April 2017).

   End User License Agreement: www.juce.com/juce-5-licence
   Privacy Policy: www.juce.com/juce-5-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/

namespace juce
{
namespace dsp
{

#ifndef DOXYGEN

#ifdef _MSC_VER
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static __declspec(align(16)) const type name [16 / sizeof (type)]

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    __declspec(align(16)) const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)]
#else
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static const type name [16 / sizeof (type)] __attribute__((aligned(16)))

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)] __attribute__((aligned(16)))
#endif
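
// The DECLARE/DEFINE pair splits each 16-byte constant into an in-class declaration
// (used below) and a 16-byte-aligned out-of-line definition that lives in the matching
// .cpp file. As an illustration only -- the values shown here are assumptions, not
// copied from this repository -- such a definition might look like:
//
//     DEFINE_SSE_SIMD_CONST (int32_t, float, kAllBitsSet) = { -1, -1, -1, -1 };
//     DEFINE_SSE_SIMD_CONST (float,   float, kOne)        = { 1.0f, 1.0f, 1.0f, 1.0f };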
template <typename type>
struct SIMDNativeOps;

//==============================================================================
/** Single-precision floating point SSE intrinsics. */
template <>
struct SIMDNativeOps<float>
{
    //==============================================================================
    typedef __m128 vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_and (__m128 a, __m128 b) noexcept { return _mm_and_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_or (__m128 a, __m128 b) noexcept { return _mm_or_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_xor (__m128 a, __m128 b) noexcept { return _mm_xor_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_notand (__m128 a, __m128 b) noexcept { return _mm_andnot_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_not (__m128 a) noexcept { return bit_notand (a, _mm_loadu_ps ((float*) kAllBitsSet)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE min (__m128 a, __m128 b) noexcept { return _mm_min_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE max (__m128 a, __m128 b) noexcept { return _mm_max_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE equal (__m128 a, __m128 b) noexcept { return _mm_cmpeq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept { return _mm_cmpneq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept { return _mm_cmpgt_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept { return _mm_cmpge_ps (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128 a, __m128 b) noexcept { return (_mm_movemask_ps (equal (a, b)) == 0xf); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); }

    //==============================================================================
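    // Interleaved complex multiply: each vector holds two complex numbers laid out as
    // [re0, im0, re1, im1]. rr_ir multiplies every element by the real parts of b,
    // ii_ri multiplies the swapped (im/re) elements by the imaginary parts of b, and
    // XORing with kEvenHighBit flips the sign of the even lanes, so the final add yields
    // (a.re*b.re - a.im*b.im) in the even lanes and (a.im*b.re + a.re*b.im) in the odd lanes.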
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
    {
        __m128 rr_ir = mul (a, dupeven (b));
        __m128 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, _mm_loadu_ps ((float*) kEvenHighBit)));
    }
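
    // Horizontal sum of all four lanes. Depending on the available instruction set this
    // uses a dot product against kOne (SSE4), two horizontal adds (SSE3), or plain
    // shuffle-and-add steps (SSE2); all three variants leave the total in lane 0.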
    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m128 a) noexcept
    {
#if defined(__SSE4__)
        __m128 retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff);
#elif defined(__SSE3__)
        __m128 retval = _mm_hadd_ps (_mm_hadd_ps (a, a), a);
#else
        __m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
        retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
#endif
        return ((float*) &retval) [0];
    }
};
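
// Illustrative usage only (not part of the library): the wrappers above are meant to be
// composed, e.g. a dot product over two 16-byte-aligned buffers of four floats could be
// written roughly as
//
//     using Ops = SIMDNativeOps<float>;
//     float dot4 (const float* a, const float* b)   // a, b assumed 16-byte aligned
//     {
//         return Ops::sum (Ops::mul (Ops::load (a), Ops::load (b)));
//     }
//
// In practice these ops are normally consumed through the higher-level dsp::SIMDRegister
// wrapper rather than called directly.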
//==============================================================================
/** Double-precision floating point SSE intrinsics. */
template <>
struct SIMDNativeOps<double>
{
    //==============================================================================
    typedef __m128d vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_and (__m128d a, __m128d b) noexcept { return _mm_and_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_or (__m128d a, __m128d b) noexcept { return _mm_or_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_xor (__m128d a, __m128d b) noexcept { return _mm_xor_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_notand (__m128d a, __m128d b) noexcept { return _mm_andnot_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_not (__m128d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE min (__m128d a, __m128d b) noexcept { return _mm_min_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE max (__m128d a, __m128d b) noexcept { return _mm_max_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE equal (__m128d a, __m128d b) noexcept { return _mm_cmpeq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept { return _mm_cmpneq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept { return _mm_cmpgt_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept { return _mm_cmpge_pd (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128d a, __m128d b) noexcept { return (_mm_movemask_pd (equal (a, b)) == 0x3); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); }
    static forcedinline __m128d oddevensum (__m128d a) noexcept { return a; }

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
    {
        __m128d rr_ir = mul (a, dupeven (b));
        __m128d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }

    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m128d a) noexcept
    {
#if defined(__SSE4__)
        __m128d retval = _mm_dp_pd (a, vconst (kOne), 0xff);
#elif defined(__SSE3__)
        __m128d retval = _mm_hadd_pd (a, a);
#else
        __m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);
#endif
        return ((double*) &retval) [0];
    }
};

//==============================================================================
/** Signed 8-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

#if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi8 (a, b); }
#else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
#endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);
        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int8_t* dest) noexcept
    {
        SIMDFallbackOps<int8_t, __m128i>::store (value, dest);
    }

    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
        const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
#else
        int8_t sum = 0;
        const int8_t* src = reinterpret_cast<const int8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int8_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }
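
    // 8-bit multiply built from 16-bit multiplies: _mm_mullo_epi16 already produces the
    // correct low byte for the even-indexed bytes, the odd-indexed bytes are shifted
    // down, multiplied the same way and shifted back up, and the two halves are then
    // recombined with a mask-and-or.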
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};

//==============================================================================
/** Unsigned 8-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);
        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint8_t* dest) noexcept
    {
        SIMDFallbackOps<uint8_t, __m128i>::store (value, dest);
    }

    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
        const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
#else
        uint8_t sum = 0;
        const uint8_t* src = reinterpret_cast<const uint8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int8_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};

//==============================================================================
/** Signed 16-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
    {
        return _mm_set_epi16 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int16_t* dest) noexcept
    {
        SIMDFallbackOps<int16_t, __m128i>::store (value, dest);
    }

    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return *reinterpret_cast<int16_t*> (&tmp);
#else
        int16_t sum = 0;
        const int16_t* src = reinterpret_cast<const int16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int16_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }
};

//==============================================================================
/** Unsigned 16-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

#if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
#else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
#endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int16_t*> (a);
        return _mm_set_epi16 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint16_t* dest) noexcept
    {
        SIMDFallbackOps<uint16_t, __m128i>::store (value, dest);
    }

    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return *reinterpret_cast<uint16_t*> (&tmp);
#else
        uint16_t sum = 0;
        const uint16_t* src = reinterpret_cast<const uint16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint16_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }
};

//==============================================================================
/** Signed 32-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_set_epi32 (a[3], a[2], a[1], a[0]); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept
    {
        SIMDFallbackOps<int32_t, __m128i>::store (value, dest);
    }

    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);

        return *reinterpret_cast<int32_t*> (&tmp);
#else
        int32_t sum = 0;
        const int32_t* src = reinterpret_cast<const int32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int32_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }
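
    // 32-bit multiply: on SSE4.1 this is a single _mm_mullo_epi32; the SSE2 fallback
    // multiplies the even and odd lanes separately with _mm_mul_epu32 (which produces
    // 64-bit products of lanes 0 and 2), then shuffles the low 32 bits of each product
    // back into place. The low 32 bits are the same for signed and unsigned inputs, so
    // the uint32_t specialisation below reuses the identical trick.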
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
#else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));
        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
#else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
#else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
#endif
    }
};

//==============================================================================
/** Unsigned 32-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int32_t*> (a);
        return _mm_set_epi32 (b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint32_t* dest) noexcept
    {
        SIMDFallbackOps<uint32_t, __m128i>::store (value, dest);
    }

    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
#ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);

        return *reinterpret_cast<uint32_t*> (&tmp);
#else
        uint32_t sum = 0;
        const uint32_t* src = reinterpret_cast<const uint32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint32_t)); ++i)
            sum += src [i];

        return sum;
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
#else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));
        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
#else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
#else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
#endif
    }
};

//==============================================================================
/** Signed 64-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
    {
        __m128i retval;
        int64_t* ptr = reinterpret_cast<int64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_set_epi64x (a[1], a[0]); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept
    {
        SIMDFallbackOps<int64_t, __m128i>::store (value, dest);
    }

    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;

        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }
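
    // SSE2 has no 64-bit compares, so without SSE4.1 equality is built from a 32-bit
    // compare ANDed with its word-swapped self (both halves of each 64-bit lane must
    // match), and greaterThan falls back to scalar comparisons. The _mm_cmpgt_epi64
    // path is additionally disabled when compiling with clang.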
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
#else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__) && !defined(__clang__)
        return _mm_cmpgt_epi64 (a, b);
#else
        __m128i retval;

        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? -1LL : 0;

        return retval;
#endif
    }
};

//==============================================================================
/** Unsigned 64-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
    {
        __m128i retval;
        uint64_t* ptr = reinterpret_cast<uint64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int64_t*> (a);
        return _mm_set_epi64x (b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint64_t* dest) noexcept
    {
        SIMDFallbackOps<uint64_t, __m128i>::store (value, dest);
    }

    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;

        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
#else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
#endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
#if defined(__SSE4_1__) && !defined(__clang__)
        // _mm_cmpgt_epi64 is a signed comparison, so bias both operands with ssign()
        // to get unsigned ordering (the scalar fallback below is already unsigned).
        return _mm_cmpgt_epi64 (ssign (a), ssign (b));
#else
        __m128i retval;

        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? (uint64_t) -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? (uint64_t) -1LL : 0;

        return retval;
#endif
    }
};

#endif

} // namespace dsp
} // namespace juce
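
/*  Illustrative usage sketch (not part of this file): these specialisations are the SSE
    backend that juce::dsp::SIMDRegister dispatches to, so application code would normally
    go through that wrapper instead of touching SIMDNativeOps directly. The exact member
    names used below are assumptions based on the public JUCE 5 dsp API, not on this header:

        auto a = juce::dsp::SIMDRegister<float>::expand (2.0f);
        auto b = juce::dsp::SIMDRegister<float>::expand (3.0f);
        auto total = (a * b).sum();   // four lanes of 6.0f -> 24.0f
*/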