/*
  ==============================================================================

   This file is part of the JUCE library.
   Copyright (c) 2017 - ROLI Ltd.

   JUCE is an open source library subject to commercial or open-source
   licensing.

   By using JUCE, you agree to the terms of both the JUCE 5 End-User License
   Agreement and JUCE 5 Privacy Policy (both updated and effective as of the
   27th April 2017).

   End User License Agreement: www.juce.com/juce-5-licence
   Privacy Policy: www.juce.com/juce-5-privacy-policy

   Or: You may also use this code under the terms of the GPL v3 (see
   www.gnu.org/licenses).

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/

namespace juce
{
namespace dsp
{

#ifndef DOXYGEN

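// The macros below declare and define 16-byte-aligned constant arrays sized to fill one
// 128-bit register (16 / sizeof (type) elements). They hold the per-specialisation constants
// (kAllBitsSet, kHighBit, kEvenHighBit, kOne, ...) that are loaded below as ready-made vectors.
// MSVC and GCC/Clang use different alignment syntax, hence the two variants.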
#ifdef _MSC_VER
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static __declspec(align(16)) const type name [16 / sizeof (type)]

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    __declspec(align(16)) const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)]
#else
 #define DECLARE_SSE_SIMD_CONST(type, name) \
    static const type name [16 / sizeof (type)] __attribute__((aligned(16)))

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
    const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)] __attribute__((aligned(16)))
#endif

template <typename type>
struct SIMDNativeOps;

//==============================================================================
/** Single-precision floating point SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<float>
{
    //==============================================================================
    typedef __m128 vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_and (__m128 a, __m128 b) noexcept { return _mm_and_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_or (__m128 a, __m128 b) noexcept { return _mm_or_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_xor (__m128 a, __m128 b) noexcept { return _mm_xor_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_notand (__m128 a, __m128 b) noexcept { return _mm_andnot_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_not (__m128 a) noexcept { return bit_notand (a, _mm_loadu_ps ((float*) kAllBitsSet)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE min (__m128 a, __m128 b) noexcept { return _mm_min_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE max (__m128 a, __m128 b) noexcept { return _mm_max_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE equal (__m128 a, __m128 b) noexcept { return _mm_cmpeq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept { return _mm_cmpneq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept { return _mm_cmpgt_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept { return _mm_cmpge_ps (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128 a, __m128 b) noexcept { return (_mm_movemask_ps (equal (a, b)) == 0xf); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); }

    //==============================================================================
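    // Interleaved complex multiply: each __m128 holds two complex numbers as (re0, im0, re1, im1).
    // rr_ir multiplies a by the real parts of b, ii_ri multiplies the swapped a by the imaginary
    // parts of b; flipping the sign of the even (real) lanes of ii_ri with kEvenHighBit and adding
    // the two yields (ar*br - ai*bi, ar*bi + ai*br) for each complex pair.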
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
    {
        __m128 rr_ir = mul (a, dupeven (b));
        __m128 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, _mm_loadu_ps ((float*) kEvenHighBit)));
    }
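
    // Horizontal sum of all four lanes: SSE4 can use a single dot product against kOne,
    // SSE3 two horizontal adds, and plain SSE2 two shuffle-and-add rounds.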
    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m128 a) noexcept
    {
       #if defined(__SSE4__)
        __m128 retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff);
       #elif defined(__SSE3__)
        __m128 retval = _mm_hadd_ps (_mm_hadd_ps (a, a), a);
       #else
        __m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
        retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
       #endif
        return ((float*) &retval) [0];
    }
};

//==============================================================================
/** Double-precision floating point SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<double>
{
    //==============================================================================
    typedef __m128d vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128d*> (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_and (__m128d a, __m128d b) noexcept { return _mm_and_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_or (__m128d a, __m128d b) noexcept { return _mm_or_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_xor (__m128d a, __m128d b) noexcept { return _mm_xor_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_notand (__m128d a, __m128d b) noexcept { return _mm_andnot_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_not (__m128d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE min (__m128d a, __m128d b) noexcept { return _mm_min_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE max (__m128d a, __m128d b) noexcept { return _mm_max_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE equal (__m128d a, __m128d b) noexcept { return _mm_cmpeq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept { return _mm_cmpneq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept { return _mm_cmpgt_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept { return _mm_cmpge_pd (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128d a, __m128d b) noexcept { return (_mm_movemask_pd (equal (a, b)) == 0x3); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); }
    static forcedinline __m128d oddevensum (__m128d a) noexcept { return a; }

    //==============================================================================
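    // Same complex-multiply scheme as the float specialisation, but a __m128d holds only one
    // complex number (re, im), which is also why oddevensum above is simply the identity.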
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
    {
        __m128d rr_ir = mul (a, dupeven (b));
        __m128d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }

    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m128d a) noexcept
    {
       #if defined(__SSE4__)
        __m128d retval = _mm_dp_pd (a, vconst (kOne), 0xff);
       #elif defined(__SSE3__)
        __m128d retval = _mm_hadd_pd (a, a);
       #else
        __m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);
       #endif
        return ((double*) &retval) [0];
    }
};

//==============================================================================
/** Signed 8-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
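
    // SSE2 has no byte-wide signed min/max, so the pre-SSE4 fallback builds the result from a
    // greaterThan mask, blending a and b with bit_and/bit_andnot and combining with bit_or.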
   #if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi8 (a, b); }
   #else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);
        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1],  b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int8_t* dest) noexcept
    {
        SIMDFallbackOps<int8_t, __m128i>::store (value, dest);
    }
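
    // Horizontal sum: with SSSE3 the bytes are widened to 16-bit lanes (low and high halves) and
    // reduced with three rounds of _mm_hadd_epi16; the portable fallback just loops over the lanes.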
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const int8_t* lo_ptr = reinterpret_cast<const int8_t*> (&lo);
        const int8_t* hi_ptr = reinterpret_cast<const int8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
       #else
        int8_t sum = 0;
        const int8_t* src = reinterpret_cast<const int8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int8_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }
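
    // SSE2 has no 8-bit multiply, so mul works in 16-bit lanes: the even bytes are multiplied in
    // place, the odd bytes after a right shift, and the two sets of low result bytes are merged back.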
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};

//==============================================================================
/** Unsigned 8-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
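    // ssign XORs each byte with the high bit (0x80), mapping unsigned values onto the signed range
    // while preserving their order, so the signed SSE compares can implement unsigned greaterThan.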
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const char*> (a);
        return _mm_set_epi8 (b[15], b[14], b[13], b[12], b[11], b[10], b[9], b[8],
                             b[7],  b[6],  b[5],  b[4],  b[3],  b[2],  b[1],  b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint8_t* dest) noexcept
    {
        SIMDFallbackOps<uint8_t, __m128i>::store (value, dest);
    }

    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        const uint8_t* lo_ptr = reinterpret_cast<const uint8_t*> (&lo);
        const uint8_t* hi_ptr = reinterpret_cast<const uint8_t*> (&hi);

        return lo_ptr[0] + hi_ptr[0];
       #else
        uint8_t sum = 0;
        const uint8_t* src = reinterpret_cast<const uint8_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int8_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};

//==============================================================================
/** Signed 16-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept
    {
        return _mm_set_epi16 (a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int16_t* dest) noexcept
    {
        SIMDFallbackOps<int16_t, __m128i>::store (value, dest);
    }

    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return *reinterpret_cast<int16_t*> (&tmp);
       #else
        int16_t sum = 0;
        const int16_t* src = reinterpret_cast<const int16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int16_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }
};

//==============================================================================
/** Unsigned 16-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

   #if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
   #else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int16_t*> (a);
        return _mm_set_epi16 (b[7], b[6], b[5], b[4], b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint16_t* dest) noexcept
    {
        SIMDFallbackOps<uint16_t, __m128i>::store (value, dest);
    }

    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return *reinterpret_cast<uint16_t*> (&tmp);
       #else
        uint16_t sum = 0;
        const uint16_t* src = reinterpret_cast<const uint16_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint16_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }
};

//==============================================================================
/** Signed 32-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_set_epi32 (a[3], a[2], a[1], a[0]); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int32_t* dest) noexcept
    {
        SIMDFallbackOps<int32_t, __m128i>::store (value, dest);
    }

    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);

        return *reinterpret_cast<int32_t*> (&tmp);
       #else
        int32_t sum = 0;
        const int32_t* src = reinterpret_cast<const int32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (int32_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }
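
    // Without SSE4.1's _mm_mullo_epi32, the 32-bit multiply is built from _mm_mul_epu32, which
    // multiplies the even lanes into 64-bit results; shifting by 4 bytes exposes the odd lanes,
    // and the low 32 bits of both sets are shuffled back into place.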
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));
        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};

//==============================================================================
/** Unsigned 32-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int32_t*> (a);
        return _mm_set_epi32 (b[3], b[2], b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint32_t* dest) noexcept
    {
        SIMDFallbackOps<uint32_t, __m128i>::store (value, dest);
    }

    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        tmp = _mm_hadd_epi32 (tmp, tmp);

        return *reinterpret_cast<uint32_t*> (&tmp);
       #else
        uint32_t sum = 0;
        const uint32_t* src = reinterpret_cast<const uint32_t*> (&a);

        for (std::size_t i = 0; i < (sizeof (vSIMDType) / sizeof (uint32_t)); ++i)
            sum += src [i];

        return sum;
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));
        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};

//==============================================================================
/** Signed 64-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept
    {
        __m128i retval;
        int64_t* ptr = reinterpret_cast<int64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_set_epi64x (a[1], a[0]); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, int64_t* dest) noexcept
    {
        SIMDFallbackOps<int64_t, __m128i>::store (value, dest);
    }

    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const int64_t* ptr = reinterpret_cast<const int64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;
        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }
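
    // Pre-SSE4.1 there is no 64-bit compare, so equality is emulated with a 32-bit compare whose
    // result is ANDed with a copy that has the two halves of each 64-bit lane swapped: a lane ends
    // up all-ones only if both of its 32-bit halves matched.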
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }
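
    // Where _mm_cmpgt_epi64 is not used, the 64-bit greaterThan falls back to scalar comparisons
    // that write an all-ones or all-zero mask into each lane.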
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__) && !defined(__clang__)
        return _mm_cmpgt_epi64 (a, b);
       #else
        __m128i retval;
        const int64_t* aptr = reinterpret_cast<const int64_t*> (&a);
        const int64_t* bptr = reinterpret_cast<const int64_t*> (&b);
        int64_t* dst = reinterpret_cast<int64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? -1LL : 0;

        return retval;
       #endif
    }
};

//==============================================================================
/** Unsigned 64-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    typedef __m128i vSIMDType;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept
    {
        __m128i retval;
        uint64_t* ptr = reinterpret_cast<uint64_t*> (&retval);
        ptr[0] = ptr[1] = s;
        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return *reinterpret_cast<const __m128i*> (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept
    {
        const auto* b = reinterpret_cast<const int64_t*> (a);
        return _mm_set_epi64x (b[1], b[0]);
    }

    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i value, uint64_t* dest) noexcept
    {
        SIMDFallbackOps<uint64_t, __m128i>::store (value, dest);
    }

    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
        const uint64_t* ptr = reinterpret_cast<const uint64_t*> (&a);
        return ptr[0] + ptr[1];
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
        __m128i retval;
        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] * bptr[0];
        dst[1] = aptr[1] * bptr[1];

        return retval;
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__) && !defined(__clang__)
        return _mm_cmpgt_epi64 (a, b);
       #else
        __m128i retval;
        const uint64_t* aptr = reinterpret_cast<const uint64_t*> (&a);
        const uint64_t* bptr = reinterpret_cast<const uint64_t*> (&b);
        uint64_t* dst = reinterpret_cast<uint64_t*> (&retval);

        dst[0] = aptr[0] > bptr[0] ? (uint64_t) -1LL : 0;
        dst[1] = aptr[1] > bptr[1] ? (uint64_t) -1LL : 0;

        return retval;
       #endif
    }
};

#endif

} // namespace dsp
} // namespace juce
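
/*  A minimal usage sketch (illustrative only, not part of the library): assuming SSE2 is
    available, both buffers are 16-byte aligned and numSamples is a multiple of four, a loop
    over float data four lanes at a time might look like

        using Ops = juce::dsp::SIMDNativeOps<float>;

        void addBuffers (float* dest, const float* src, int numSamples)
        {
            for (int i = 0; i < numSamples; i += 4)
                Ops::store (Ops::add (Ops::load (dest + i), Ops::load (src + i)), dest + i);
        }

    In practice, client code would normally go through JUCE's higher-level SIMDRegister wrapper
    rather than calling these specialisations directly.
*/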