The JUCE cross-platform C++ framework, with DISTRHO/KXStudio specific changes
/*
  ==============================================================================

   This file is part of the JUCE 6 technical preview.
   Copyright (c) 2017 - ROLI Ltd.

   You may use this code under the terms of the GPL v3
   (see www.gnu.org/licenses).

   For this technical preview, this file is not subject to commercial licensing.

   JUCE IS PROVIDED "AS IS" WITHOUT ANY WARRANTY, AND ALL WARRANTIES, WHETHER
   EXPRESSED OR IMPLIED, INCLUDING MERCHANTABILITY AND FITNESS FOR PURPOSE, ARE
   DISCLAIMED.

  ==============================================================================
*/
namespace juce
{
namespace dsp
{

#ifndef DOXYGEN

JUCE_BEGIN_IGNORE_WARNINGS_GCC_LIKE ("-Wignored-attributes")

#ifdef _MSC_VER
 #define DECLARE_SSE_SIMD_CONST(type, name) \
     static __declspec(align(16)) const type name [16 / sizeof (type)]

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
     __declspec(align(16)) const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)]
#else
 #define DECLARE_SSE_SIMD_CONST(type, name) \
     static const type name [16 / sizeof (type)] __attribute__((aligned(16)))

 #define DEFINE_SSE_SIMD_CONST(type, class_type, name) \
     const type SIMDNativeOps<class_type>:: name [16 / sizeof (type)] __attribute__((aligned(16)))
#endif
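// Each specialisation below declares its constant tables with
// DECLARE_SSE_SIMD_CONST; the matching DEFINE_SSE_SIMD_CONST definitions are
// expected to live in a separate translation unit (not shown here). As an
// illustrative sketch only (the exact values are an assumption, not part of
// this header), the float constants could be defined as:
//
//     DEFINE_SSE_SIMD_CONST (int32_t, float, kAllBitsSet) = { -1, -1, -1, -1 };
//     DEFINE_SSE_SIMD_CONST (float,   float, kOne)        = { 1.0f, 1.0f, 1.0f, 1.0f };
//
// The 16-byte alignment keeps these tables safe for the aligned
// _mm_load_* calls used throughout this file.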
template <typename type>
struct SIMDNativeOps;

//==============================================================================
/** Single-precision floating point SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<float>
{
    //==============================================================================
    using vSIMDType = __m128;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int32_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (float, kOne);

    //==============================================================================
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE expand (float s) noexcept { return _mm_load1_ps (&s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE load (const float* a) noexcept { return _mm_load_ps (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128 value, float* dest) noexcept { _mm_store_ps (dest, value); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE add (__m128 a, __m128 b) noexcept { return _mm_add_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE sub (__m128 a, __m128 b) noexcept { return _mm_sub_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE mul (__m128 a, __m128 b) noexcept { return _mm_mul_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_and (__m128 a, __m128 b) noexcept { return _mm_and_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_or (__m128 a, __m128 b) noexcept { return _mm_or_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_xor (__m128 a, __m128 b) noexcept { return _mm_xor_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_notand (__m128 a, __m128 b) noexcept { return _mm_andnot_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE bit_not (__m128 a) noexcept { return bit_notand (a, _mm_loadu_ps ((float*) kAllBitsSet)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE min (__m128 a, __m128 b) noexcept { return _mm_min_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE max (__m128 a, __m128 b) noexcept { return _mm_max_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE equal (__m128 a, __m128 b) noexcept { return _mm_cmpeq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE notEqual (__m128 a, __m128 b) noexcept { return _mm_cmpneq_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThan (__m128 a, __m128 b) noexcept { return _mm_cmpgt_ps (a, b); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128 a, __m128 b) noexcept { return _mm_cmpge_ps (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128 a, __m128 b) noexcept { return (_mm_movemask_ps (equal (a, b)) == 0xf); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE multiplyAdd (__m128 a, __m128 b, __m128 c) noexcept { return _mm_add_ps (a, _mm_mul_ps (b, c)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupeven (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 2, 0, 0)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE dupodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 3, 1, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE swapevenodd (__m128 a) noexcept { return _mm_shuffle_ps (a, a, _MM_SHUFFLE (2, 3, 0, 1)); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE oddevensum (__m128 a) noexcept { return _mm_add_ps (_mm_shuffle_ps (a, a, _MM_SHUFFLE (1, 0, 3, 2)), a); }
    static forcedinline float JUCE_VECTOR_CALLTYPE get (__m128 v, size_t i) noexcept { return SIMDFallbackOps<float, __m128>::get (v, i); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE set (__m128 v, size_t i, float s) noexcept { return SIMDFallbackOps<float, __m128>::set (v, i, s); }
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE truncate (__m128 a) noexcept { return _mm_cvtepi32_ps (_mm_cvttps_epi32 (a)); }

    //==============================================================================
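    // Each __m128 holds two interleaved complex numbers [r0, i0, r1, i1].
    // With (a + bi) * (c + di) = (ac - bd) + (ad + bc) i, dupeven/dupodd
    // broadcast c and d across each pair, swapevenodd swaps a and b, and
    // xor-ing with kEvenHighBit flips the sign of the even (real) lanes
    // to produce the -bd terms.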
    static forcedinline __m128 JUCE_VECTOR_CALLTYPE cmplxmul (__m128 a, __m128 b) noexcept
    {
        __m128 rr_ir = mul (a, dupeven (b));
        __m128 ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, _mm_loadu_ps ((float*) kEvenHighBit)));
    }

    static forcedinline float JUCE_VECTOR_CALLTYPE sum (__m128 a) noexcept
    {
       #if defined(__SSE4__)
        __m128 retval = _mm_dp_ps (a, _mm_loadu_ps (kOne), 0xff);
       #elif defined(__SSE3__)
        __m128 retval = _mm_hadd_ps (_mm_hadd_ps (a, a), a);
       #else
        __m128 retval = _mm_add_ps (_mm_shuffle_ps (a, a, 0x4e), a);
        retval = _mm_add_ps (retval, _mm_shuffle_ps (retval, retval, 0xb1));
       #endif
        return _mm_cvtss_f32 (retval);
    }
};
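// A minimal usage sketch (illustrative only, not part of this header;
// it assumes an SSE-capable target and 16-byte-aligned buffers):
//
//     alignas (16) float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
//     alignas (16) float b[4] = { 5.0f, 6.0f, 7.0f, 8.0f };
//     alignas (16) float r[4];
//
//     using Ops = SIMDNativeOps<float>;
//     Ops::store (Ops::mul (Ops::load (a), Ops::load (b)), r);  // r = { 5, 12, 21, 32 }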
//==============================================================================
/** Double-precision floating point SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<double>
{
    //==============================================================================
    using vSIMDType = __m128d;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (int64_t, kEvenHighBit);
    DECLARE_SSE_SIMD_CONST (double, kOne);

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const double* a) noexcept { return load (a); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return _mm_castsi128_pd (_mm_load_si128 (reinterpret_cast<const __m128i*> (a))); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE expand (double s) noexcept { return _mm_load1_pd (&s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE load (const double* a) noexcept { return _mm_load_pd (a); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128d value, double* dest) noexcept { _mm_store_pd (dest, value); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE add (__m128d a, __m128d b) noexcept { return _mm_add_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE sub (__m128d a, __m128d b) noexcept { return _mm_sub_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE mul (__m128d a, __m128d b) noexcept { return _mm_mul_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_and (__m128d a, __m128d b) noexcept { return _mm_and_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_or (__m128d a, __m128d b) noexcept { return _mm_or_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_xor (__m128d a, __m128d b) noexcept { return _mm_xor_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_notand (__m128d a, __m128d b) noexcept { return _mm_andnot_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE bit_not (__m128d a) noexcept { return bit_notand (a, vconst (kAllBitsSet)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE min (__m128d a, __m128d b) noexcept { return _mm_min_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE max (__m128d a, __m128d b) noexcept { return _mm_max_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE equal (__m128d a, __m128d b) noexcept { return _mm_cmpeq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE notEqual (__m128d a, __m128d b) noexcept { return _mm_cmpneq_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThan (__m128d a, __m128d b) noexcept { return _mm_cmpgt_pd (a, b); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128d a, __m128d b) noexcept { return _mm_cmpge_pd (a, b); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128d a, __m128d b) noexcept { return (_mm_movemask_pd (equal (a, b)) == 0x3); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE multiplyAdd (__m128d a, __m128d b, __m128d c) noexcept { return _mm_add_pd (a, _mm_mul_pd (b, c)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupeven (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 0)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE dupodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (1, 1)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE swapevenodd (__m128d a) noexcept { return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1)); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE oddevensum (__m128d a) noexcept { return a; }
    static forcedinline double JUCE_VECTOR_CALLTYPE get (__m128d v, size_t i) noexcept { return SIMDFallbackOps<double, __m128d>::get (v, i); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE set (__m128d v, size_t i, double s) noexcept { return SIMDFallbackOps<double, __m128d>::set (v, i, s); }
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE truncate (__m128d a) noexcept { return _mm_cvtepi32_pd (_mm_cvttpd_epi32 (a)); }

    //==============================================================================
    static forcedinline __m128d JUCE_VECTOR_CALLTYPE cmplxmul (__m128d a, __m128d b) noexcept
    {
        __m128d rr_ir = mul (a, dupeven (b));
        __m128d ii_ri = mul (swapevenodd (a), dupodd (b));
        return add (rr_ir, bit_xor (ii_ri, vconst (kEvenHighBit)));
    }
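    // sum() is the horizontal add of the two lanes: the SSE4 path phrases it
    // as a dot product with a vector of ones, the SSE3 path as a single hadd,
    // and the SSE2 path as a lane swap plus add.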
    static forcedinline double JUCE_VECTOR_CALLTYPE sum (__m128d a) noexcept
    {
       #if defined(__SSE4__)
        __m128d retval = _mm_dp_pd (a, vconst (kOne), 0xff);
       #elif defined(__SSE3__)
        __m128d retval = _mm_hadd_pd (a, a);
       #else
        __m128d retval = _mm_add_pd (_mm_shuffle_pd (a, a, 0x01), a);
       #endif
        return _mm_cvtsd_f64 (retval);
    }
};
//==============================================================================
/** Signed 8-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int8_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int8_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int8_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int8_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int8_t s) noexcept { return _mm_set1_epi8 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

   #if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi8 (a, b); }
   #else
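    // Pre-SSE4.1 fallback: select lanewise with the comparison mask,
    // i.e. min (a, b) = (mask & a) | (~mask & b) where mask = (b > a).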
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int8_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int8_t s) noexcept { return SIMDFallbackOps<int8_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
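    // The SSSE3 path below widens the bytes to 16-bit lanes before the hadd
    // chain, so the intermediate sums cannot overflow; the result is then
    // truncated back to 8 bits, matching the wraparound semantics of int8_t.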
    static forcedinline int8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        return static_cast<int8_t> ((_mm_cvtsi128_si32 (lo) & 0xff) + (_mm_cvtsi128_si32 (hi) & 0xff));
       #else
        return SIMDFallbackOps<int8_t, __m128i>::sum (a);
       #endif
    }
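    // SSE2 has no 8-bit multiply, so mul() works in 16-bit lanes: the even
    // bytes are multiplied in place, the odd bytes after a right shift, and
    // the low byte of each 16-bit product is recombined into one vector.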
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};
//==============================================================================
/** Unsigned 8-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint8_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint8_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint8_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint8_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint8_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint8_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint8_t s) noexcept { return _mm_set1_epi8 ((int8_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi8 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi8 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint8_t s) noexcept { return SIMDFallbackOps<uint8_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline uint8_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i lo = _mm_unpacklo_epi8 (a, _mm_setzero_si128());
        __m128i hi = _mm_unpackhi_epi8 (a, _mm_setzero_si128());

        for (int i = 0; i < 3; ++i)
        {
            lo = _mm_hadd_epi16 (lo, lo);
            hi = _mm_hadd_epi16 (hi, hi);
        }

        return static_cast<uint8_t> ((static_cast<uint32_t> (_mm_cvtsi128_si32 (lo)) & 0xffu)
                                   + (static_cast<uint32_t> (_mm_cvtsi128_si32 (hi)) & 0xffu));
       #else
        return SIMDFallbackOps<uint8_t, __m128i>::sum (a);
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b)
    {
        // unpack and multiply
        __m128i even = _mm_mullo_epi16 (a, b);
        __m128i odd  = _mm_mullo_epi16 (_mm_srli_epi16 (a, 8), _mm_srli_epi16 (b, 8));

        return _mm_or_si128 (_mm_slli_epi16 (odd, 8),
                             _mm_srli_epi16 (_mm_slli_epi16 (even, 8), 8));
    }
};
//==============================================================================
/** Signed 16-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int16_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int16_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int16_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int16_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int16_t s) noexcept { return _mm_set1_epi16 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int16_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int16_t s) noexcept { return SIMDFallbackOps<int16_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
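    // Three hadd passes (SSSE3) fold the eight 16-bit lanes down into lane 0;
    // without SSSE3 this defers to the SIMDFallbackOps scalar sum.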
    static forcedinline int16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return static_cast<int16_t> (_mm_cvtsi128_si32 (tmp) & 0xffff);
       #else
        return SIMDFallbackOps<int16_t, __m128i>::sum (a);
       #endif
    }
};
//==============================================================================
/** Unsigned 16-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint16_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint16_t, kHighBit);
    DECLARE_SSE_SIMD_CONST (uint16_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint16_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint16_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint16_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint16_t s) noexcept { return _mm_set1_epi16 ((int16_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return _mm_mullo_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }

   #if defined(__SSE4__)
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { return _mm_min_epu16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { return _mm_max_epu16 (a, b); }
   #else
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
   #endif

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi16 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi16 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint16_t s) noexcept { return SIMDFallbackOps<uint16_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline uint16_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi16 (a, a);
        tmp = _mm_hadd_epi16 (tmp, tmp);
        tmp = _mm_hadd_epi16 (tmp, tmp);

        return static_cast<uint16_t> (static_cast<uint32_t> (_mm_cvtsi128_si32 (tmp)) & 0xffffu);
       #else
        return SIMDFallbackOps<uint16_t, __m128i>::sum (a);
       #endif
    }
};
//==============================================================================
/** Signed 32-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int32_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int32_t, kAllBitsSet);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int32_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int32_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int32_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int32_t s) noexcept { return _mm_set1_epi32 (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int32_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int32_t s) noexcept { return SIMDFallbackOps<int32_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline int32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        return _mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp));
       #else
        return SIMDFallbackOps<int32_t, __m128i>::sum (a);
       #endif
    }
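    // Without SSE4.1's _mm_mullo_epi32, the 32-bit lane multiply is built from
    // _mm_mul_epu32, which multiplies the even lanes into 64-bit products: the
    // even and odd products are computed separately, then their low 32 bits
    // are interleaved back into one vector. The low 32 bits of a product are
    // identical for signed and unsigned operands, so the unsigned multiply is
    // valid here.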
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));

        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_min_epi32 (a, b);
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epi32 (a, b);
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};
//==============================================================================
/** Unsigned 32-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint32_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint32_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint32_t, kHighBit);

    //==============================================================================
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint32_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint32_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint32_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint32_t s) noexcept { return _mm_set1_epi32 ((int32_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept { return _mm_cmpeq_epi32 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept { return _mm_cmpgt_epi32 (ssign (a), ssign (b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint32_t s) noexcept { return SIMDFallbackOps<uint32_t, __m128i>::set (v, i, s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    //==============================================================================
    static forcedinline uint32_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept
    {
       #ifdef __SSSE3__
        __m128i tmp = _mm_hadd_epi32 (a, a);
        return static_cast<uint32_t> (_mm_cvtsi128_si32 (_mm_hadd_epi32 (tmp, tmp)));
       #else
        return SIMDFallbackOps<uint32_t, __m128i>::sum (a);
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_mullo_epi32 (a, b);
       #else
        __m128i even = _mm_mul_epu32 (a, b);
        __m128i odd  = _mm_mul_epu32 (_mm_srli_si128 (a, 4), _mm_srli_si128 (b, 4));

        return _mm_unpacklo_epi32 (_mm_shuffle_epi32 (even, _MM_SHUFFLE (0, 0, 2, 0)),
                                   _mm_shuffle_epi32 (odd,  _MM_SHUFFLE (0, 0, 2, 0)));
       #endif
    }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        // use the unsigned comparison: _mm_min_epi32 would misorder
        // values >= 0x80000000
        return _mm_min_epu32 (a, b);
       #else
        __m128i lt = greaterThan (b, a);
        return bit_or (bit_and (lt, a), bit_andnot (lt, b));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_max_epu32 (a, b);
       #else
        __m128i gt = greaterThan (a, b);
        return bit_or (bit_and (gt, a), bit_andnot (gt, b));
       #endif
    }
};
//==============================================================================
/** Signed 64-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<int64_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (int64_t, kAllBitsSet);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const int64_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (int64_t s) noexcept { return _mm_set1_epi64x (s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const int64_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, int64_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<int64_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, int64_t s) noexcept { return SIMDFallbackOps<int64_t, __m128i>::set (v, i, s); }
    static forcedinline int64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<int64_t, __m128i>::sum (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<int64_t, __m128i>::mul (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }
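    // SSE2 has no 64-bit compare, so the fallback builds one from 32-bit
    // pieces: _mm_cmpeq_epi32 tests the two halves of each 64-bit lane, the
    // shuffle-and-and requires both halves to match, and the final shuffle
    // broadcasts that verdict across the whole lane.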
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_2__)
        return _mm_cmpgt_epi64 (a, b);
       #else
        return SIMDFallbackOps<int64_t, __m128i>::greaterThan (a, b);
       #endif
    }
};
//==============================================================================
/** Unsigned 64-bit integer SSE intrinsics.

    @tags{DSP}
*/
template <>
struct SIMDNativeOps<uint64_t>
{
    //==============================================================================
    using vSIMDType = __m128i;

    //==============================================================================
    DECLARE_SSE_SIMD_CONST (uint64_t, kAllBitsSet);
    DECLARE_SSE_SIMD_CONST (uint64_t, kHighBit);

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE vconst (const uint64_t* a) noexcept { return load (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE expand (uint64_t s) noexcept { return _mm_set1_epi64x ((int64_t) s); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE ssign (__m128i a) noexcept { return _mm_xor_si128 (a, vconst (kHighBit)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE load (const uint64_t* a) noexcept { return _mm_load_si128 (reinterpret_cast<const __m128i*> (a)); }
    static forcedinline void JUCE_VECTOR_CALLTYPE store (__m128i v, uint64_t* p) noexcept { _mm_store_si128 (reinterpret_cast<__m128i*> (p), v); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE add (__m128i a, __m128i b) noexcept { return _mm_add_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE sub (__m128i a, __m128i b) noexcept { return _mm_sub_epi64 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_and (__m128i a, __m128i b) noexcept { return _mm_and_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_or (__m128i a, __m128i b) noexcept { return _mm_or_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_xor (__m128i a, __m128i b) noexcept { return _mm_xor_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_andnot (__m128i a, __m128i b) noexcept { return _mm_andnot_si128 (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE bit_not (__m128i a) noexcept { return _mm_andnot_si128 (a, vconst (kAllBitsSet)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE min (__m128i a, __m128i b) noexcept { __m128i lt = greaterThan (b, a); return bit_or (bit_and (lt, a), bit_andnot (lt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE max (__m128i a, __m128i b) noexcept { __m128i gt = greaterThan (a, b); return bit_or (bit_and (gt, a), bit_andnot (gt, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThanOrEqual (__m128i a, __m128i b) noexcept { return bit_or (greaterThan (a, b), equal (a, b)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE multiplyAdd (__m128i a, __m128i b, __m128i c) noexcept { return add (a, mul (b, c)); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE notEqual (__m128i a, __m128i b) noexcept { return bit_not (equal (a, b)); }
    static forcedinline bool JUCE_VECTOR_CALLTYPE allEqual (__m128i a, __m128i b) noexcept { return (_mm_movemask_epi8 (equal (a, b)) == 0xffff); }
    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE get (__m128i v, size_t i) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::get (v, i); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE set (__m128i v, size_t i, uint64_t s) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::set (v, i, s); }
    static forcedinline uint64_t JUCE_VECTOR_CALLTYPE sum (__m128i a) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::sum (a); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE mul (__m128i a, __m128i b) noexcept { return SIMDFallbackOps<uint64_t, __m128i>::mul (a, b); }
    static forcedinline __m128i JUCE_VECTOR_CALLTYPE truncate (__m128i a) noexcept { return a; }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE equal (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_1__)
        return _mm_cmpeq_epi64 (a, b);
       #else
        __m128i bitmask = _mm_cmpeq_epi32 (a, b);
        bitmask = _mm_and_si128 (bitmask, _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 3, 0, 1)));
        return _mm_shuffle_epi32 (bitmask, _MM_SHUFFLE (2, 2, 0, 0));
       #endif
    }

    static forcedinline __m128i JUCE_VECTOR_CALLTYPE greaterThan (__m128i a, __m128i b) noexcept
    {
       #if defined(__SSE4_2__)
        return _mm_cmpgt_epi64 (ssign (a), ssign (b));
       #else
        return SIMDFallbackOps<uint64_t, __m128i>::greaterThan (a, b);
       #endif
    }
};
#endif

JUCE_END_IGNORE_WARNINGS_GCC_LIKE

} // namespace dsp
} // namespace juce