You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

369 lines
9.9KB

  1. #pragma once
#include <cstdint>
#include <cstring>
#include <pmmintrin.h>
  4. namespace rack {
  5. /** Abstraction of byte-aligned values for SIMD CPU acceleration. */
  6. namespace simd {
  7. /** Generic class for vector types.
  8. This class is designed to be used just like you use scalars, with extra features for handling bitwise logic, conditions, loading, and storing.
  9. Usage example:
  10. float a[4], b[4];
  11. float_4 a = float_4::load(in);
  12. float_4 b = 2.f * a / (1 - a);
  13. b *= sin(2 * M_PI * a);
  14. b.store(out);
  15. */
  16. template <typename TYPE, int SIZE>
  17. struct Vector;
  18. /** Wrapper for `__m128` representing an aligned vector of 4 single-precision float values.
  19. */
  20. template <>
  21. struct Vector<float, 4> {
  22. typedef float type;
  23. constexpr static int size = 4;
  24. union {
  25. __m128 v;
  26. /** Accessing this array of scalars is slow and defeats the purpose of vectorizing.
  27. */
  28. float s[4];
  29. };
  30. /** Constructs an uninitialized vector. */
  31. Vector() = default;
  32. /** Constructs a vector from a native `__m128` type. */
  33. Vector(__m128 v) : v(v) {}
  34. /** Constructs a vector with all elements set to `x`. */
  35. Vector(float x) {
  36. v = _mm_set1_ps(x);
  37. }
  38. /** Constructs a vector from four scalars. */
  39. Vector(float x1, float x2, float x3, float x4) {
  40. v = _mm_setr_ps(x1, x2, x3, x4);
  41. }
  42. /** Returns a vector initialized to zero. */
  43. static Vector zero() {
  44. return Vector();
  45. }
  46. /** Returns a vector with all 1 bits. */
  47. static Vector mask() {
  48. return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
  49. }
  50. /** Reads an array of 4 values.
  51. On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
  52. */
  53. static Vector load(const float* x) {
  54. /*
  55. My benchmarks show that _mm_loadu_ps() performs equally as fast as _mm_load_ps() when data is actually aligned.
  56. This post seems to agree. https://stackoverflow.com/a/20265193/272642
  57. So use _mm_loadu_ps() for generality, so you can load unaligned arrays using the same function (although it will be slower).
  58. */
  59. return Vector(_mm_loadu_ps(x));
  60. }
  61. /** Writes an array of 4 values.
  62. On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
  63. */
  64. void store(float* x) {
  65. _mm_storeu_ps(x, v);
  66. }
  67. float& operator[](int i) {
  68. return s[i];
  69. }
  70. const float& operator[](int i) const {
  71. return s[i];
  72. }
  73. // Conversions
  74. Vector(Vector<int32_t, 4> a);
  75. // Casts
  76. static Vector cast(Vector<int32_t, 4> a);
  77. };
  78. template <>
  79. struct Vector<int32_t, 4> {
  80. typedef int32_t type;
  81. constexpr static int size = 4;
  82. union {
  83. __m128i v;
  84. int32_t s[4];
  85. };
  86. Vector() = default;
  87. Vector(__m128i v) : v(v) {}
  88. Vector(int32_t x) {
  89. v = _mm_set1_epi32(x);
  90. }
  91. Vector(int32_t x1, int32_t x2, int32_t x3, int32_t x4) {
  92. v = _mm_setr_epi32(x1, x2, x3, x4);
  93. }
  94. static Vector zero() {
  95. return Vector();
  96. }
  97. static Vector mask() {
  98. return Vector(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
  99. }
  100. static Vector load(const int32_t* x) {
  101. // HACK
  102. // Use _mm_loadu_si128() because GCC doesn't support _mm_loadu_si32()
  103. return Vector(_mm_loadu_si128((__m128i*) x));
  104. }
  105. void store(int32_t* x) {
  106. // HACK
  107. // Use _mm_storeu_si128() because GCC doesn't support _mm_storeu_si32()
  108. _mm_storeu_si128((__m128i*) x, v);
  109. }
  110. int32_t& operator[](int i) {
  111. return s[i];
  112. }
  113. const int32_t& operator[](int i) const {
  114. return s[i];
  115. }
  116. Vector(Vector<float, 4> a);
  117. static Vector cast(Vector<float, 4> a);
  118. };
  119. // Conversions and casts
  120. inline Vector<float, 4>::Vector(Vector<int32_t, 4> a) {
  121. v = _mm_cvtepi32_ps(a.v);
  122. }
  123. inline Vector<int32_t, 4>::Vector(Vector<float, 4> a) {
  124. v = _mm_cvttps_epi32(a.v);
  125. }
  126. inline Vector<float, 4> Vector<float, 4>::cast(Vector<int32_t, 4> a) {
  127. return Vector(_mm_castsi128_ps(a.v));
  128. }
  129. inline Vector<int32_t, 4> Vector<int32_t, 4>::cast(Vector<float, 4> a) {
  130. return Vector(_mm_castps_si128(a.v));
  131. }
  132. // Instructions not available as operators
  133. /** `~a & b` */
  134. inline Vector<float, 4> andnot(const Vector<float, 4>& a, const Vector<float, 4>& b) {
  135. return Vector<float, 4>(_mm_andnot_ps(a.v, b.v));
  136. }
  137. /** Returns an integer with each bit corresponding to the most significant bit of each element.
  138. For example, `movemask(float_4::mask())` returns 0xf.
  139. */
  140. inline int movemask(const Vector<float, 4>& a) {
  141. return _mm_movemask_ps(a.v);
  142. }
  143. /** Returns an integer with each bit corresponding to the most significant bit of each byte.
  144. For example, `movemask(int32_4::mask())` returns 0xffff.
  145. */
  146. inline int movemask(const Vector<int32_t, 4>& a) {
  147. return _mm_movemask_epi8(a.v);
  148. }
  149. // Operator overloads
  150. /** `a @ b` */
  151. #define DECLARE_VECTOR_OPERATOR_INFIX(t, s, operator, func) \
  152. inline Vector<t, s> operator(const Vector<t, s> &a, const Vector<t, s> &b) { \
  153. return Vector<t, s>(func(a.v, b.v)); \
  154. }
  155. /** `a @= b` */
  156. #define DECLARE_VECTOR_OPERATOR_INCREMENT(t, s, operator, opfunc) \
  157. inline Vector<t, s> &operator(Vector<t, s> &a, const Vector<t, s> &b) { \
  158. a = opfunc(a, b); \
  159. return a; \
  160. }
  161. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator+, _mm_add_ps)
  162. DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator+, _mm_add_epi32)
  163. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator-, _mm_sub_ps)
  164. DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator-, _mm_sub_epi32)
  165. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator*, _mm_mul_ps)
  166. // DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator*, NOT AVAILABLE IN SSE3)
  167. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator/, _mm_div_ps)
  168. // DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator/, NOT AVAILABLE IN SSE3)
  169. /* Use these to apply logic, bit masks, and conditions to elements.
  170. Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element.
  171. Examples:
  172. Subtract 1 from value if greater than or equal to 1.
  173. x -= (x >= 1.f) & 1.f;
  174. */
  175. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator^, _mm_xor_ps)
  176. DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator^, _mm_xor_si128)
  177. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator&, _mm_and_ps)
  178. DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator&, _mm_and_si128)
  179. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator|, _mm_or_ps)
  180. DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator|, _mm_or_si128)
  181. DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator+=, operator+)
  182. DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator+=, operator+)
  183. DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator-=, operator-)
  184. DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator-=, operator-)
  185. DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator*=, operator*)
  186. // DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator*=, NOT AVAILABLE IN SSE3)
  187. DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator/=, operator/)
  188. // DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator/=, NOT AVAILABLE IN SSE3)
  189. DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator^=, operator^)
  190. DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator^=, operator^)
  191. DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator&=, operator&)
  192. DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator&=, operator&)
  193. DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator|=, operator|)
  194. DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator|=, operator|)
  195. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator==, _mm_cmpeq_ps)
  196. DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator==, _mm_cmpeq_epi32)
  197. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator>=, _mm_cmpge_ps)
  198. inline Vector<int32_t, 4> operator>=(const Vector<int32_t, 4>& a, const Vector<int32_t, 4>& b) {
  199. return Vector<int32_t, 4>(_mm_cmpgt_epi32(a.v, b.v)) ^ Vector<int32_t, 4>::mask();
  200. }
  201. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator>, _mm_cmpgt_ps)
  202. DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator>, _mm_cmpgt_epi32)
  203. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator<=, _mm_cmple_ps)
  204. inline Vector<int32_t, 4> operator<=(const Vector<int32_t, 4>& a, const Vector<int32_t, 4>& b) {
  205. return Vector<int32_t, 4>(_mm_cmplt_epi32(a.v, b.v)) ^ Vector<int32_t, 4>::mask();
  206. }
  207. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator<, _mm_cmplt_ps)
  208. DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator<, _mm_cmplt_epi32)
  209. DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator!=, _mm_cmpneq_ps)
  210. inline Vector<int32_t, 4> operator!=(const Vector<int32_t, 4>& a, const Vector<int32_t, 4>& b) {
  211. return Vector<int32_t, 4>(_mm_cmpeq_epi32(a.v, b.v)) ^ Vector<int32_t, 4>::mask();
  212. }
  213. /** `+a` */
  214. inline Vector<float, 4> operator+(const Vector<float, 4>& a) {
  215. return a;
  216. }
  217. inline Vector<int32_t, 4> operator+(const Vector<int32_t, 4>& a) {
  218. return a;
  219. }
  220. /** `-a` */
  221. inline Vector<float, 4> operator-(const Vector<float, 4>& a) {
  222. return 0.f - a;
  223. }
  224. inline Vector<int32_t, 4> operator-(const Vector<int32_t, 4>& a) {
  225. return 0 - a;
  226. }
  227. /** `++a` */
  228. inline Vector<float, 4>& operator++(Vector<float, 4>& a) {
  229. a += 1.f;
  230. return a;
  231. }
  232. inline Vector<int32_t, 4>& operator++(Vector<int32_t, 4>& a) {
  233. a += 1;
  234. return a;
  235. }
  236. /** `--a` */
  237. inline Vector<float, 4>& operator--(Vector<float, 4>& a) {
  238. a -= 1.f;
  239. return a;
  240. }
  241. inline Vector<int32_t, 4>& operator--(Vector<int32_t, 4>& a) {
  242. a -= 1;
  243. return a;
  244. }
  245. /** `a++` */
  246. inline Vector<float, 4> operator++(Vector<float, 4>& a, int) {
  247. Vector<float, 4> b = a;
  248. ++a;
  249. return b;
  250. }
  251. inline Vector<int32_t, 4> operator++(Vector<int32_t, 4>& a, int) {
  252. Vector<int32_t, 4> b = a;
  253. ++a;
  254. return b;
  255. }
  256. /** `a--` */
  257. inline Vector<float, 4> operator--(Vector<float, 4>& a, int) {
  258. Vector<float, 4> b = a;
  259. --a;
  260. return b;
  261. }
  262. inline Vector<int32_t, 4> operator--(Vector<int32_t, 4>& a, int) {
  263. Vector<int32_t, 4> b = a;
  264. --a;
  265. return b;
  266. }
  267. /** `~a` */
  268. inline Vector<float, 4> operator~(const Vector<float, 4>& a) {
  269. return a ^ Vector<float, 4>::mask();
  270. }
  271. inline Vector<int32_t, 4> operator~(const Vector<int32_t, 4>& a) {
  272. return a ^ Vector<int32_t, 4>::mask();
  273. }
  274. /** `a << b` */
  275. inline Vector<int32_t, 4> operator<<(const Vector<int32_t, 4>& a, const int& b) {
  276. return Vector<int32_t, 4>(_mm_slli_epi32(a.v, b));
  277. }
  278. /** `a >> b` */
  279. inline Vector<int32_t, 4> operator>>(const Vector<int32_t, 4>& a, const int& b) {
  280. return Vector<int32_t, 4>(_mm_srli_epi32(a.v, b));
  281. }
  282. // Typedefs
  283. typedef Vector<float, 4> float_4;
  284. typedef Vector<int32_t, 4> int32_4;
  285. } // namespace simd
  286. } // namespace rack