You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

380 lines
11KB

  1. /*
  2. * DISTRHO Cardinal Plugin
  3. * Copyright (C) 2021-2022 Filipe Coelho <falktx@falktx.com>
  4. *
  5. * This program is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU General Public License as
  7. * published by the Free Software Foundation; either version 3 of
  8. * the License, or any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * For a full copy of the GNU General Public License see the LICENSE file.
  16. */
  17. /**
  18. * This file is an edited version of VCVRack's simd/Vector.hpp
  19. * Copyright (C) 2016-2021 VCV.
  20. *
  21. * This program is free software: you can redistribute it and/or
  22. * modify it under the terms of the GNU General Public License as
  23. * published by the Free Software Foundation; either version 3 of
  24. * the License, or (at your option) any later version.
  25. */
  26. #pragma once
  27. #include <cstring>
  28. #include <pmmintrin.h>
  /** NOTE alignas is required on some systems in order to allow SSE usage. */
  #ifndef ARCH_MAC
  #define SIMD_ALIGN alignas(32)
  #else
  #define SIMD_ALIGN
  #endif

  namespace rack {

  /** Abstraction of aligned types for SIMD computation.
  */
  namespace simd {

  /** Generic class for vector types.

  This class is designed to be used just like you use scalars, with extra features for handling
  bitwise logic, conditions, loading, and storing.

  Example:

      float in[4], out[4];
      float_4 a = float_4::load(in);
      float_4 b = 2.f * a / (1 - a);
      b *= sin(2 * M_PI * a);
      b.store(out);

  Only the `<float, 4>` and `<int32_t, 4>` specializations are defined here.
  */
  template <typename TYPE, int SIZE>
  struct Vector;


  /** Wrapper for `__m128` representing an aligned vector of 4 single-precision float values.
  */
  template <>
  struct Vector<float, 4> {
      using type = float;
      constexpr static int size = 4;

      /** The native register and a scalar view of the same 16 bytes. */
      union SIMD_ALIGN {
          __m128 v;
          /** Accessing this array of scalars is slow and defeats the purpose of vectorizing.
          */
          float s[4];
      };

      /** Constructs an uninitialized vector. */
      Vector() = default;

      /** Constructs a vector from a native `__m128` type. */
      Vector(__m128 v) : v(v) {}

      /** Constructs a vector with all elements set to `x`. */
      Vector(float x) : v(_mm_set1_ps(x)) {}

      /** Constructs a vector from four scalars; `x1` becomes element 0 and `x4` element 3. */
      Vector(float x1, float x2, float x3, float x4) : v(_mm_setr_ps(x1, x2, x3, x4)) {}

      /** Returns a vector with all 0 bits. */
      static Vector zero() {
          return Vector(_mm_setzero_ps());
      }

      /** Returns a vector with all 1 bits. */
      static Vector mask() {
          // Comparing zero with itself for equality sets every bit of every lane.
          const __m128i zeros = _mm_setzero_si128();
          return Vector(_mm_castsi128_ps(_mm_cmpeq_epi32(zeros, zeros)));
      }

      /** Reads an array of 4 values.
      Element order is preserved: `x[i]` ends up in element `i` of the vector (i.e. `s[i]`).
      `x` does not need to be 16-byte aligned.
      */
      static Vector load(const float* x) {
          /*
          My benchmarks show that _mm_loadu_ps() performs equally as fast as _mm_load_ps() when data is actually aligned.
          This post seems to agree. https://stackoverflow.com/a/20265193/272642
          I therefore use _mm_loadu_ps() for generality, so you can load unaligned arrays using the same function (although load aligned arrays if you can for best performance).
          */
          return Vector(_mm_loadu_ps(x));
      }

      /** Writes an array of 4 values.
      Element order is preserved: element `i` of the vector (i.e. `s[i]`) is written to `x[i]`.
      `x` does not need to be 16-byte aligned. Does not modify the vector.
      */
      void store(float* x) const {
          _mm_storeu_ps(x, v);
      }

      /** Accessing vector elements individually is slow and defeats the purpose of vectorizing.
      However, this operator is convenient when writing simple serial code in a non-bottlenecked section.
      No bounds checking is performed on `i`.
      */
      float& operator[](int i) {
          return s[i];
      }
      const float& operator[](int i) const {
          return s[i];
      }

      // Conversions
      /** Converts each int32 element to the float with the same numeric value. */
      Vector(Vector<int32_t, 4> a);
      // Casts
      /** Reinterprets the 128 bits of `a` as four floats without changing any bit. */
      static Vector cast(Vector<int32_t, 4> a);
  };


  /** Wrapper for `__m128i` representing an aligned vector of 4 signed 32-bit integers.
  */
  template <>
  struct Vector<int32_t, 4> {
      using type = int32_t;
      constexpr static int size = 4;

      /** The native register and a scalar view of the same 16 bytes. */
      union SIMD_ALIGN {
          __m128i v;
          int32_t s[4];
      };

      /** Constructs an uninitialized vector. */
      Vector() = default;

      /** Constructs a vector from a native `__m128i` type. */
      Vector(__m128i v) : v(v) {}

      /** Constructs a vector with all elements set to `x`. */
      Vector(int32_t x) : v(_mm_set1_epi32(x)) {}

      /** Constructs a vector from four scalars; `x1` becomes element 0 and `x4` element 3. */
      Vector(int32_t x1, int32_t x2, int32_t x3, int32_t x4) : v(_mm_setr_epi32(x1, x2, x3, x4)) {}

      /** Returns a vector with all 0 bits. */
      static Vector zero() {
          return Vector(_mm_setzero_si128());
      }

      /** Returns a vector with all 1 bits (every element equals -1). */
      static Vector mask() {
          const __m128i zeros = _mm_setzero_si128();
          return Vector(_mm_cmpeq_epi32(zeros, zeros));
      }

      /** Reads an array of 4 values, preserving element order. `x` may be unaligned. */
      static Vector load(const int32_t* x) {
          // The intrinsic takes a `const __m128i*`; it performs an unaligned 128-bit load.
          return Vector(_mm_loadu_si128(reinterpret_cast<const __m128i*>(x)));
      }

      /** Writes an array of 4 values, preserving element order. `x` may be unaligned.
      Does not modify the vector.
      */
      void store(int32_t* x) const {
          // The intrinsic takes a `__m128i*`; it performs an unaligned 128-bit store.
          _mm_storeu_si128(reinterpret_cast<__m128i*>(x), v);
      }

      /** Scalar element access. Slow; no bounds checking is performed on `i`. */
      int32_t& operator[](int i) {
          return s[i];
      }
      const int32_t& operator[](int i) const {
          return s[i];
      }

      // Conversions
      /** Converts each float element to int32, truncating toward zero. */
      Vector(Vector<float, 4> a);
      // Casts
      /** Reinterprets the 128 bits of `a` as four int32 values without changing any bit. */
      static Vector cast(Vector<float, 4> a);
  };


  // Conversions and casts

  inline Vector<float, 4>::Vector(Vector<int32_t, 4> a) : v(_mm_cvtepi32_ps(a.v)) {}

  inline Vector<int32_t, 4>::Vector(Vector<float, 4> a) : v(_mm_cvttps_epi32(a.v)) {}

  inline Vector<float, 4> Vector<float, 4>::cast(Vector<int32_t, 4> a) {
      return Vector(_mm_castsi128_ps(a.v));
  }

  inline Vector<int32_t, 4> Vector<int32_t, 4>::cast(Vector<float, 4> a) {
      return Vector(_mm_castps_si128(a.v));
  }


  // Operator overloads

  /** `a @ b`
  Defines binary operator `op` on `Vector<t, n>` by applying intrinsic `func` to the native registers.
  */
  #define DECLARE_VECTOR_OPERATOR_INFIX(t, n, op, func) \
      inline Vector<t, n> op(const Vector<t, n>& a, const Vector<t, n>& b) { \
          return Vector<t, n>(func(a.v, b.v)); \
      }

  /** `a @= b`
  Defines compound assignment `op` on `Vector<t, n>` in terms of the already declared binary operator `opfunc`.
  */
  #define DECLARE_VECTOR_OPERATOR_INCREMENT(t, n, op, opfunc) \
      inline Vector<t, n>& op(Vector<t, n>& a, const Vector<t, n>& b) { \
          return a = opfunc(a, b); \
      }

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator+, _mm_add_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator+, _mm_add_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator-, _mm_sub_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator-, _mm_sub_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator*, _mm_mul_ps)
  // DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator*, NOT AVAILABLE IN SSE3)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator/, _mm_div_ps)
  // DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator/, NOT AVAILABLE IN SSE3)

  /* Use these to apply logic, bit masks, and conditions to elements.
  Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element.

  Examples:

  Subtract 1 from value if greater than or equal to 1.

      x -= (x >= 1.f) & 1.f;
  */
  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator^, _mm_xor_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator^, _mm_xor_si128)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator&, _mm_and_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator&, _mm_and_si128)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator|, _mm_or_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator|, _mm_or_si128)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator+=, operator+)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator+=, operator+)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator-=, operator-)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator-=, operator-)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator*=, operator*)
  // DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator*=, NOT AVAILABLE IN SSE3)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator/=, operator/)
  // DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator/=, NOT AVAILABLE IN SSE3)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator^=, operator^)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator^=, operator^)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator&=, operator&)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator&=, operator&)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator|=, operator|)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator|=, operator|)

  // Comparisons. Each element of the result is all-ones when the relation holds and all-zeros otherwise.
  // The integer instruction set used here only offers ==, > and <, so the remaining integer
  // relations are built by inverting the complementary comparison with an all-ones mask.

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator==, _mm_cmpeq_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator==, _mm_cmpeq_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator>=, _mm_cmpge_ps)
  /** `a >= b`, computed as NOT (a < b). */
  inline Vector<int32_t, 4> operator>=(const Vector<int32_t, 4>& a, const Vector<int32_t, 4>& b) {
      return Vector<int32_t, 4>(_mm_cmplt_epi32(a.v, b.v)) ^ Vector<int32_t, 4>::mask();
  }

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator>, _mm_cmpgt_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator>, _mm_cmpgt_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator<=, _mm_cmple_ps)
  /** `a <= b`, computed as NOT (a > b). */
  inline Vector<int32_t, 4> operator<=(const Vector<int32_t, 4>& a, const Vector<int32_t, 4>& b) {
      return Vector<int32_t, 4>(_mm_cmpgt_epi32(a.v, b.v)) ^ Vector<int32_t, 4>::mask();
  }

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator<, _mm_cmplt_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator<, _mm_cmplt_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator!=, _mm_cmpneq_ps)
  /** `a != b`, computed as NOT (a == b). */
  inline Vector<int32_t, 4> operator!=(const Vector<int32_t, 4>& a, const Vector<int32_t, 4>& b) {
      return Vector<int32_t, 4>(_mm_cmpeq_epi32(a.v, b.v)) ^ Vector<int32_t, 4>::mask();
  }

  /** `+a` */
  inline Vector<float, 4> operator+(const Vector<float, 4>& a) {
      return a;
  }
  inline Vector<int32_t, 4> operator+(const Vector<int32_t, 4>& a) {
      return a;
  }

  /** `-a`, computed as `0 - a`. */
  inline Vector<float, 4> operator-(const Vector<float, 4>& a) {
      return 0.f - a;
  }
  inline Vector<int32_t, 4> operator-(const Vector<int32_t, 4>& a) {
      return 0 - a;
  }

  /** `++a` */
  inline Vector<float, 4>& operator++(Vector<float, 4>& a) {
      return a += 1.f;
  }
  inline Vector<int32_t, 4>& operator++(Vector<int32_t, 4>& a) {
      return a += 1;
  }

  /** `--a` */
  inline Vector<float, 4>& operator--(Vector<float, 4>& a) {
      return a -= 1.f;
  }
  inline Vector<int32_t, 4>& operator--(Vector<int32_t, 4>& a) {
      return a -= 1;
  }

  /** `a++` (returns the value held before the increment) */
  inline Vector<float, 4> operator++(Vector<float, 4>& a, int) {
      const Vector<float, 4> previous = a;
      ++a;
      return previous;
  }
  inline Vector<int32_t, 4> operator++(Vector<int32_t, 4>& a, int) {
      const Vector<int32_t, 4> previous = a;
      ++a;
      return previous;
  }

  /** `a--` (returns the value held before the decrement) */
  inline Vector<float, 4> operator--(Vector<float, 4>& a, int) {
      const Vector<float, 4> previous = a;
      --a;
      return previous;
  }
  inline Vector<int32_t, 4> operator--(Vector<int32_t, 4>& a, int) {
      const Vector<int32_t, 4> previous = a;
      --a;
      return previous;
  }

  /** `~a` (flips every bit by XOR-ing with an all-ones mask) */
  inline Vector<float, 4> operator~(const Vector<float, 4>& a) {
      return a ^ Vector<float, 4>::mask();
  }
  inline Vector<int32_t, 4> operator~(const Vector<int32_t, 4>& a) {
      return a ^ Vector<int32_t, 4>::mask();
  }

  /** `a << b` (every element is shifted left by the same count `b`, shifting in zeros) */
  inline Vector<int32_t, 4> operator<<(const Vector<int32_t, 4>& a, const int& b) {
      return Vector<int32_t, 4>(_mm_slli_epi32(a.v, b));
  }

  /** `a >> b` (every element is shifted right by the same count `b`)
  This is a logical shift: zeros are shifted in, so the sign bit is NOT preserved for negative elements.
  */
  inline Vector<int32_t, 4> operator>>(const Vector<int32_t, 4>& a, const int& b) {
      return Vector<int32_t, 4>(_mm_srli_epi32(a.v, b));
  }


  // Typedefs

  using float_4 = Vector<float, 4>;
  using int32_4 = Vector<int32_t, 4>;


  } // namespace simd
  } // namespace rack