You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

322 lines
8.7KB

  1. #pragma once
  2. #include <cstring>
  3. #include <pmmintrin.h>
  namespace rack {

  /** Abstraction of byte-aligned values for SIMD CPU acceleration. */
  namespace simd {

  /** Generic class for vector types.

  This class is designed to be used just like you use scalars, with extra features for handling bitwise logic, conditions, loading, and storing.

  Usage example:

      float in[4], out[4];
      float_4 a = float_4::load(in);
      float_4 b = 2.f * a / (1 - a);
      b *= sin(2 * M_PI * a); // needs a float_4 overload of sin(), which this section does not define
      b.store(out);
  */
  template <typename T, int N>
  struct Vector;


  /** Wrapper for `__m128` representing an aligned vector of 4 single-precision float values.
  */
  template <>
  struct Vector<float, 4> {
      union {
          __m128 v;
          /** Accessing this array of scalars is slow and defeats the purpose of vectorizing.
          */
          float s[4];
      };

      /** Constructs an uninitialized vector. */
      Vector() {}

      /** Constructs a vector from a native `__m128` type. */
      Vector(__m128 v) : v(v) {}

      /** Constructs a vector with all elements set to `x`. */
      Vector(float x) {
          v = _mm_set1_ps(x);
      }

      /** Constructs a vector from four values.
      `x1` becomes element 0 and `x4` becomes element 3, matching the order used by `load()`, `store()` and `s[]`.
      */
      Vector(float x1, float x2, float x3, float x4) {
          // _mm_setr_ps() takes its arguments in memory order.
          // _mm_set_ps() would place x1 in the highest element, reversing the vector.
          v = _mm_setr_ps(x1, x2, x3, x4);
      }

      /** Returns a vector initialized to zero. */
      static Vector zero() {
          return Vector(_mm_setzero_ps());
      }

      /** Returns a vector with all 1 bits. */
      static Vector mask() {
          // Comparing zero with zero for equality sets every bit of every element.
          return Vector(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())));
      }

      /** Reads an array of 4 values.
      `x` does not need to be 16-byte aligned.
      */
      static Vector load(const float *x) {
          /*
          The original author's benchmarks showed _mm_loadu_ps() performing as fast as _mm_load_ps() when data is actually aligned.
          This post seems to agree. https://stackoverflow.com/a/20265193/272642
          So use _mm_loadu_ps() for generality, so unaligned arrays can be loaded with the same function (although it will be slower).
          */
          return Vector(_mm_loadu_ps(x));
      }

      /** Writes an array of 4 values.
      `x` does not need to be 16-byte aligned.
      */
      void store(float *x) const {
          _mm_storeu_ps(x, v);
      }

      // Conversions

      /** Converts each 32-bit integer element to its float value. */
      Vector(Vector<int32_t, 4> a);

      // Casts

      /** Reinterprets the bits of an integer vector as a float vector, without converting values. */
      static Vector cast(Vector<int32_t, 4> a);
  };


  /** Wrapper for `__m128i` representing a vector of 4 signed 32-bit integer values.
  */
  template <>
  struct Vector<int32_t, 4> {
      union {
          __m128i v;
          /** Scalar view of the elements. */
          int32_t s[4];
      };

      /** Constructs an uninitialized vector. */
      Vector() {}

      /** Constructs a vector from a native `__m128i` type. */
      Vector(__m128i v) : v(v) {}

      /** Constructs a vector with all elements set to `x`. */
      Vector(int32_t x) {
          v = _mm_set1_epi32(x);
      }

      /** Constructs a vector from four values.
      `x1` becomes element 0 and `x4` becomes element 3, matching the order used by `load()`, `store()` and `s[]`.
      */
      Vector(int32_t x1, int32_t x2, int32_t x3, int32_t x4) {
          // _mm_setr_epi32() takes its arguments in memory order, unlike _mm_set_epi32().
          v = _mm_setr_epi32(x1, x2, x3, x4);
      }

      /** Returns a vector initialized to zero. */
      static Vector zero() {
          return Vector(_mm_setzero_si128());
      }

      /** Returns a vector with all 1 bits. */
      static Vector mask() {
          return Vector(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
      }

      /** Reads an array of 4 values.
      `x` does not need to be 16-byte aligned.
      */
      static Vector load(const int32_t *x) {
          // _mm_loadu_si128() reads all 128 bits (four 32-bit elements) with no alignment requirement.
          return Vector(_mm_loadu_si128(reinterpret_cast<const __m128i*>(x)));
      }

      /** Writes an array of 4 values.
      `x` does not need to be 16-byte aligned.
      */
      void store(int32_t *x) const {
          // _mm_storeu_si128() writes all 128 bits (four 32-bit elements) with no alignment requirement.
          _mm_storeu_si128(reinterpret_cast<__m128i*>(x), v);
      }

      // Conversions

      /** Converts each float element to a 32-bit integer using `_mm_cvtps_epi32()`. */
      Vector(Vector<float, 4> a);

      // Casts

      /** Reinterprets the bits of a float vector as an integer vector, without converting values. */
      static Vector cast(Vector<float, 4> a);
  };


  // Conversions and casts

  inline Vector<float, 4>::Vector(Vector<int32_t, 4> a) {
      v = _mm_cvtepi32_ps(a.v);
  }

  inline Vector<int32_t, 4>::Vector(Vector<float, 4> a) {
      v = _mm_cvtps_epi32(a.v);
  }

  inline Vector<float, 4> Vector<float, 4>::cast(Vector<int32_t, 4> a) {
      return Vector(_mm_castsi128_ps(a.v));
  }

  inline Vector<int32_t, 4> Vector<int32_t, 4>::cast(Vector<float, 4> a) {
      return Vector(_mm_castps_si128(a.v));
  }


  // Instructions not available as operators

  /** `~a & b` */
  inline Vector<float, 4> andnot(const Vector<float, 4> &a, const Vector<float, 4> &b) {
      return Vector<float, 4>(_mm_andnot_ps(a.v, b.v));
  }


  // Operator overloads

  /** `a @ b`
  Defines the binary operator `op` for `Vector<t, s>` by applying the intrinsic `func` to the native vectors.
  */
  #define DECLARE_VECTOR_OPERATOR_INFIX(t, s, op, func) \
      inline Vector<t, s> op(const Vector<t, s> &a, const Vector<t, s> &b) { \
          return Vector<t, s>(func(a.v, b.v)); \
      }

  /** `a @= b`
  Defines the compound assignment `op` for `Vector<t, s>` in terms of the already-defined binary operator `opfunc`.
  */
  #define DECLARE_VECTOR_OPERATOR_INCREMENT(t, s, op, opfunc) \
      inline Vector<t, s> &op(Vector<t, s> &a, const Vector<t, s> &b) { \
          a = opfunc(a, b); \
          return a; \
      }

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator+, _mm_add_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator+, _mm_add_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator-, _mm_sub_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator-, _mm_sub_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator*, _mm_mul_ps)
  // DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator*, NOT AVAILABLE IN SSE3)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator/, _mm_div_ps)
  // DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator/, NOT AVAILABLE IN SSE3)

  /* Use these to apply logic, bit masks, and conditions to elements.
  Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element.

  Examples:

  Subtract 1 from value if greater than or equal to 1.

      x -= (x >= 1.f) & 1.f;
  */

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator^, _mm_xor_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator^, _mm_xor_si128)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator&, _mm_and_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator&, _mm_and_si128)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator|, _mm_or_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator|, _mm_or_si128)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator+=, operator+)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator+=, operator+)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator-=, operator-)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator-=, operator-)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator*=, operator*)
  // DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator*=, NOT AVAILABLE IN SSE3)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator/=, operator/)
  // DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator/=, NOT AVAILABLE IN SSE3)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator^=, operator^)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator^=, operator^)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator&=, operator&)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator&=, operator&)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator|=, operator|)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator|=, operator|)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator==, _mm_cmpeq_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator==, _mm_cmpeq_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator>=, _mm_cmpge_ps)
  /** `a >= b` for integers.
  There is no integer "greater or equal" intrinsic in use here, so this is computed as `~(a < b)`.
  */
  inline Vector<int32_t, 4> operator>=(const Vector<int32_t, 4> &a, const Vector<int32_t, 4> &b) {
      return Vector<int32_t, 4>(_mm_cmplt_epi32(a.v, b.v)) ^ Vector<int32_t, 4>::mask();
  }

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator>, _mm_cmpgt_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator>, _mm_cmpgt_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator<=, _mm_cmple_ps)
  /** `a <= b` for integers.
  There is no integer "less or equal" intrinsic in use here, so this is computed as `~(a > b)`.
  */
  inline Vector<int32_t, 4> operator<=(const Vector<int32_t, 4> &a, const Vector<int32_t, 4> &b) {
      return Vector<int32_t, 4>(_mm_cmpgt_epi32(a.v, b.v)) ^ Vector<int32_t, 4>::mask();
  }

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator<, _mm_cmplt_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator<, _mm_cmplt_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator!=, _mm_cmpneq_ps)
  /** `a != b` for integers, computed as `~(a == b)`. */
  inline Vector<int32_t, 4> operator!=(const Vector<int32_t, 4> &a, const Vector<int32_t, 4> &b) {
      return Vector<int32_t, 4>(_mm_cmpeq_epi32(a.v, b.v)) ^ Vector<int32_t, 4>::mask();
  }

  /** `+a` */
  inline Vector<float, 4> operator+(const Vector<float, 4> &a) {
      return a;
  }
  inline Vector<int32_t, 4> operator+(const Vector<int32_t, 4> &a) {
      return a;
  }

  /** `-a`, computed as `0 - a` */
  inline Vector<float, 4> operator-(const Vector<float, 4> &a) {
      return 0.f - a;
  }
  inline Vector<int32_t, 4> operator-(const Vector<int32_t, 4> &a) {
      return 0 - a;
  }

  /** `++a` */
  inline Vector<float, 4> &operator++(Vector<float, 4> &a) {
      a += 1.f;
      return a;
  }
  inline Vector<int32_t, 4> &operator++(Vector<int32_t, 4> &a) {
      a += 1;
      return a;
  }

  /** `--a` */
  inline Vector<float, 4> &operator--(Vector<float, 4> &a) {
      a -= 1.f;
      return a;
  }
  inline Vector<int32_t, 4> &operator--(Vector<int32_t, 4> &a) {
      a -= 1;
      return a;
  }

  /** `a++`, returning the value held before the increment */
  inline Vector<float, 4> operator++(Vector<float, 4> &a, int) {
      Vector<float, 4> b = a;
      ++a;
      return b;
  }
  inline Vector<int32_t, 4> operator++(Vector<int32_t, 4> &a, int) {
      Vector<int32_t, 4> b = a;
      ++a;
      return b;
  }

  /** `a--`, returning the value held before the decrement */
  inline Vector<float, 4> operator--(Vector<float, 4> &a, int) {
      Vector<float, 4> b = a;
      --a;
      return b;
  }
  inline Vector<int32_t, 4> operator--(Vector<int32_t, 4> &a, int) {
      Vector<int32_t, 4> b = a;
      --a;
      return b;
  }

  /** `~a`, computed by XOR with the all-ones mask */
  inline Vector<float, 4> operator~(const Vector<float, 4> &a) {
      return a ^ Vector<float, 4>::mask();
  }
  inline Vector<int32_t, 4> operator~(const Vector<int32_t, 4> &a) {
      return a ^ Vector<int32_t, 4>::mask();
  }


  // Typedefs

  typedef Vector<float, 4> float_4;
  typedef Vector<int32_t, 4> int32_4;


  } // namespace simd
  } // namespace rack