The JUCE cross-platform C++ framework, with DISTRHO/KXStudio specific changes

/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2017 - ROLI Ltd.

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
   27th April 2017).

   End User License Agreement: www.juce.com/juce-5-licence
   Privacy Policy: www.juce.com/juce-5-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/

namespace juce
{
namespace dsp
{

#ifndef DOXYGEN
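
// The two macros below declare and define 128-bit SIMD constants with 16-byte
// alignment (__declspec(align) on MSVC, __attribute__((aligned)) elsewhere),
// so that the constant tables can be read safely with aligned SSE loads.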
#ifdef _MSC_VER
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static __declspec(align(16)) const type name [16 / sizeof (type)]

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    __declspec(align(16)) const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)]
#else
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static const type name [16 / sizeof (type)] __attribute__((aligned(16)))

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)] __attribute__((aligned(16)))
#endif

template <typename type>
struct SIMDNativeOps;

//==============================================================================
/** Single-precision floating point SSE intrinsics. */
template <>
struct SIMDNativeOps<float>
{
    //==============================================================================
    typedef __m128 vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_and (__m128 a, __m128 b) noexcept { return _mm_and_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_or (__m128 a, __m128 b) noexcept { return _mm_or_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_xor (__m128 a, __m128 b) noexcept { return _mm_xor_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_notand (__m128 a, __m128 b) noexcept { return _mm_andnot_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_not (__m128 a) noexcept { return bit_notand (a, _mm_loadu_ps ((float*) kAllBitsSet)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE min (__m128 a, __m128 b) noexcept { return _mm_min_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE max (__m128 a, __m128 b) noexcept { return _mm_max_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE equal (__m128 a, __m128 b) noexcept { return _mm_cmpeq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept { return _mm_cmpneq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept { return _mm_cmpgt_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept { return _mm_cmpge_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); }

    //==============================================================================
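    // cmplxmul treats each register as two interleaved complex numbers (re, im).
    // rr_ir holds (ar*br, ai*br) and ii_ri holds (ai*bi, ar*bi); kEvenHighBit is
    // expected to carry the sign bit in the even-indexed lanes (its values are
    // defined elsewhere via DEFINE_SSE_SIMD_CONST), so the xor negates only the
    // ai*bi terms and the add yields (ar*br - ai*bi, ai*br + ar*bi).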
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
    {
        __m128 rr_ir = mul (a, dupeven (b));
        __m128 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, _mm_loadu_ps ((float*) kEvenHighBit)));
    }
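
    // sum() collapses the four lanes into a scalar: with SSE4.1 a dot product
    // against the kOne constant (expected to be all 1.0f) does it in one
    // instruction, with SSE3 two horizontal adds are used, and the plain SSE2
    // fallback shuffles and adds twice.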
    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m128 a) noexcept
    {
        #if defined(__SSE4_1__)
        __m128 retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff);
        #elif defined(__SSE3__)
        __m128 retval = _mm_hadd_ps (_mm_hadd_ps (a, a), a);
        #else
        __m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
        retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
        #endif
        return ((float*) &retval) [0];
    }
};

//==============================================================================
/** Double-precision floating point SSE intrinsics. */
template <>
struct SIMDNativeOps<double>
{
    //==============================================================================
    typedef __m128d vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_and (__m128d a, __m128d b) noexcept { return _mm_and_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_or (__m128d a, __m128d b) noexcept { return _mm_or_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_xor (__m128d a, __m128d b) noexcept { return _mm_xor_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_notand (__m128d a, __m128d b) noexcept { return _mm_andnot_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_not (__m128d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE min (__m128d a, __m128d b) noexcept { return _mm_min_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE max (__m128d a, __m128d b) noexcept { return _mm_max_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE equal (__m128d a, __m128d b) noexcept { return _mm_cmpeq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept { return _mm_cmpneq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept { return _mm_cmpgt_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept { return _mm_cmpge_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); }
    static forcedinline __m128d oddevensum (__m128d a) noexcept { return a; }

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
    {
        __m128d rr_ir = mul (a, dupeven (b));
        __m128d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }

    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m128d a) noexcept
    {
        #if defined(__SSE4_1__)
        __m128d retval = _mm_dp_pd (a, vconst (kOne), 0xff);
        #elif defined(__SSE3__)
        __m128d retval = _mm_hadd_pd (a, a);
        #else
        __m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);
        #endif
        return ((double*) &retval) [0];
    }
};

//==============================================================================
/** Signed 8-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

    #if defined(__SSE4_1__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi8 (a, b); }
    #else
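    // Pre-SSE4.1 fallback: build a lane mask with greaterThan() and blend the
    // two inputs with it, selecting a where the mask is set and b elsewhere.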
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    #endif
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);
        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int8_t* dest) noexcept
    {
        SIMDFallbackOps<int8_t, __m128i>::store (value, dest);
    }

    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
        const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
        #else
        int8_t sum = 0;
        const int8_t* src = reinterpret_cast<const int8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int8_t)); ++i)
            sum += src[i];

        return sum;
        #endif
    }
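
    // SSE2 has no 8-bit multiply, so mul() works on 16-bit lanes: 'even' holds
    // the products of the even-numbered bytes (in the low byte of each 16-bit
    // lane) and 'odd' the products of the odd-numbered bytes; the shifts and the
    // final or re-interleave the low bytes of both back into a single register.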
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};

//==============================================================================
/** Unsigned 8-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
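    // ssign() flips the top bit of every byte (xor with kHighBit, presumably
    // 0x80 in each lane), mapping the unsigned ordering onto the signed one so
    // the signed _mm_cmpgt_epi8 below can implement an unsigned comparison.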
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);
        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint8_t* dest) noexcept
    {
        SIMDFallbackOps<uint8_t, __m128i>::store (value, dest);
    }

    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
        const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
        #else
        uint8_t sum = 0;
        const uint8_t* src = reinterpret_cast<const uint8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint8_t)); ++i)
            sum += src[i];

        return sum;
        #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};

//==============================================================================
/** Signed 16-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
    {
        return _mm_set_epi16 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int16_t* dest) noexcept
    {
        SIMDFallbackOps<int16_t, __m128i>::store (value, dest);
    }
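
    // Each _mm_hadd_epi16 halves the number of partial sums, so three passes
    // reduce the eight 16-bit lanes to a single total in lane 0.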
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return *reinterpret_cast<int16_t*> (&tmp);
        #else
        int16_t sum = 0;
        const int16_t* src = reinterpret_cast<const int16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int16_t)); ++i)
            sum += src[i];

        return sum;
        #endif
    }
};

//==============================================================================
/** Unsigned 16-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

    #if defined(__SSE4_1__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
    #else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    #endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int16_t*> (a);
        return _mm_set_epi16 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint16_t* dest) noexcept
    {
        SIMDFallbackOps<uint16_t, __m128i>::store (value, dest);
    }

    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return *reinterpret_cast<uint16_t*> (&tmp);
        #else
        uint16_t sum = 0;
        const uint16_t* src = reinterpret_cast<const uint16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint16_t)); ++i)
            sum += src[i];

        return sum;
        #endif
    }
};

//==============================================================================
/** Signed 32-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_set_epi32 (a[3], a[2], a[1], a[0]); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept
    {
        SIMDFallbackOps<int32_t, __m128i>::store (value, dest);
    }

    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);

        return *reinterpret_cast<int32_t*> (&tmp);
        #else
        int32_t sum = 0;
        const int32_t* src = reinterpret_cast<const int32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int32_t)); ++i)
            sum += src[i];

        return sum;
        #endif
    }
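
    // Without SSE4.1's _mm_mullo_epi32, mul() falls back to _mm_mul_epu32, which
    // only multiplies lanes 0 and 2 (producing 64-bit results). Doing this once
    // on the original registers and once on copies shifted right by 4 bytes
    // yields all four products; the shuffles then pack their low 32 bits back
    // into a single vector.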
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
        #else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));

        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
        #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
        #if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
        #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
        #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
        #if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
        #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
        #endif
    }
};

//==============================================================================
/** Unsigned 32-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int32_t*> (a);
        return _mm_set_epi32 (b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint32_t* dest) noexcept
    {
        SIMDFallbackOps<uint32_t, __m128i>::store (value, dest);
    }

    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);

        return *reinterpret_cast<uint32_t*> (&tmp);
        #else
        uint32_t sum = 0;
        const uint32_t* src = reinterpret_cast<const uint32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint32_t)); ++i)
            sum += src[i];

        return sum;
        #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
        #else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));

        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
        #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
        #if defined(__SSE4_1__)
        return _mm_min_epu32 (a, b); // unsigned variant: _mm_min_epi32 would mis-order values with the top bit set
        #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
        #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
        #if defined(__SSE4_1__)
        return _mm_max_epu32 (a, b); // unsigned variant: _mm_max_epi32 would mis-order values with the top bit set
        #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
        #endif
    }
};

//==============================================================================
/** Signed 64-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
    {
        __m128i retval;
        int64_t* ptr = reinterpret_cast<int64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_set_epi64x (a[1], a[0]); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept
    {
        SIMDFallbackOps<int64_t, __m128i>::store (value, dest);
    }

    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;
        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }
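
    // SSE2 has no 64-bit compare, so the fallback equal() compares 32-bit halves,
    // ANDs each half with its neighbour so a lane is all-ones only when both
    // halves match, and then broadcasts that result across the full 64-bit lane.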
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
        #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
        #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
        #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
        // _mm_cmpgt_epi64 (PCMPGTQ) requires SSE4.2, not SSE4.1
        #if defined(__SSE4_2__) && !defined(__clang__)
        return _mm_cmpgt_epi64 (a, b);
        #else
        __m128i retval;
        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? -1LL : 0;

        return retval;
        #endif
    }
};

//==============================================================================
/** Unsigned 64-bit integer SSE intrinsics. */
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
    {
        __m128i retval;
        uint64_t* ptr = reinterpret_cast<uint64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int64_t*> (a);
        return _mm_set_epi64x (b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint64_t* dest) noexcept
    {
        SIMDFallbackOps<uint64_t, __m128i>::store (value, dest);
    }

    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;
        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
        #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
        #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
        #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
        // _mm_cmpgt_epi64 (PCMPGTQ) requires SSE4.2; bias both operands into the
        // signed range with ssign() so the signed compare gives an unsigned ordering
        #if defined(__SSE4_2__) && !defined(__clang__)
        return _mm_cmpgt_epi64 (ssign (a), ssign (b));
        #else
        __m128i retval;
        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? (uint64_t) -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? (uint64_t) -1LL : 0;

        return retval;
        #endif
    }
};

#endif

} // namespace dsp
} // namespace juce
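
//==============================================================================
// A minimal usage sketch (not part of the original header), assuming SSE2 is
// available and the buffer is 16-byte aligned. In practice these ops are
// normally reached through the higher-level juce::dsp::SIMDRegister<T> wrapper
// rather than being called directly:
//
//     alignas (16) float data[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
//
//     auto v = juce::dsp::SIMDNativeOps<float>::load (data);        // load 4 floats
//     v = juce::dsp::SIMDNativeOps<float>::mul (v, v);              // square each lane
//     float total = juce::dsp::SIMDNativeOps<float>::sum (v);       // 1 + 4 + 9 + 16 = 30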