You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

Vector.hpp 9.5KB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  #pragma once
  #include <cstdint>
  #include <cstring>
  #include <pmmintrin.h>
  namespace rack {


  /** Abstraction of aligned types for SIMD computation
  */
  namespace simd {


  /** Generic class for vector types.

  This class is designed to be used just like you use scalars, with extra features for handling bitwise logic, conditions, loading, and storing.

  Example (assuming a vectorized `sin()` overload is available to the caller):

      float in[4], out[4];
      float_4 a = float_4::load(in);
      float_4 b = 2.f * a / (1.f - a);
      b *= sin(2 * M_PI * a);
      b.store(out);

  Only the specializations below are defined; the primary template is intentionally left incomplete.
  */
  template <typename TYPE, int SIZE>
  struct Vector;


  /** Wrapper for `__m128` representing an aligned vector of 4 single-precision float values.
  */
  template <>
  struct Vector<float, 4> {
      using type = float;
      constexpr static int size = 4;

      union {
          __m128 v;
          /** Accessing this array of scalars is slow and defeats the purpose of vectorizing.
          */
          float s[4];
      };

      /** Constructs an uninitialized vector. */
      Vector() = default;

      /** Constructs a vector from a native `__m128` type. */
      Vector(__m128 v) : v(v) {}

      /** Constructs a vector with all elements set to `x`.
      Deliberately implicit so that scalars can be mixed into vector expressions (e.g. `2.f * a`).
      */
      Vector(float x) : v(_mm_set1_ps(x)) {}

      /** Constructs a vector from four scalars, so that `x1` becomes element 0 and `x4` becomes element 3. */
      Vector(float x1, float x2, float x3, float x4) : v(_mm_setr_ps(x1, x2, x3, x4)) {}

      /** Returns a vector with all 0 bits. */
      static Vector zero() {
          return Vector(_mm_setzero_ps());
      }

      /** Returns a vector with all 1 bits. */
      static Vector mask() {
          // Comparing zero with zero is true in every lane, which yields 0xffffffff per element.
          const __m128i zeros = _mm_setzero_si128();
          return Vector(_mm_castsi128_ps(_mm_cmpeq_epi32(zeros, zeros)));
      }

      /** Reads an array of 4 values.
      Memory order is preserved: `x[0]` becomes element 0 (`s[0]`) and `x[3]` becomes element 3 (`s[3]`).
      `x` does not need to be 16-byte aligned.
      */
      static Vector load(const float* x) {
          /*
          The original author's benchmarks showed _mm_loadu_ps() performing as fast as _mm_load_ps() when the data is actually aligned.
          See also https://stackoverflow.com/a/20265193/272642
          The unaligned load is therefore used for generality, so unaligned arrays can be loaded with the same function.
          */
          return Vector(_mm_loadu_ps(x));
      }

      /** Writes an array of 4 values.
      Memory order is preserved: element 0 (`s[0]`) is written to `x[0]` and element 3 (`s[3]`) to `x[3]`.
      `x` does not need to be 16-byte aligned.
      */
      void store(float* x) const {
          _mm_storeu_ps(x, v);
      }

      /** Accessing vector elements individually is slow and defeats the purpose of vectorizing.
      However, this operator is convenient when writing simple serial code in a non-bottlenecked section.
      No bounds checking is performed on `i`.
      */
      float& operator[](int i) {
          return s[i];
      }
      const float& operator[](int i) const {
          return s[i];
      }

      // Conversions

      /** Converts each integer element to the corresponding float value. */
      Vector(Vector<int32_t, 4> a);

      // Casts

      /** Reinterprets the bits of `a` as floats without changing them. */
      static Vector cast(Vector<int32_t, 4> a);
  };


  /** Wrapper for `__m128i` representing an aligned vector of 4 signed 32-bit integer values.
  */
  template <>
  struct Vector<int32_t, 4> {
      using type = int32_t;
      constexpr static int size = 4;

      union {
          __m128i v;
          /** Accessing this array of scalars is slow and defeats the purpose of vectorizing.
          */
          int32_t s[4];
      };

      /** Constructs an uninitialized vector. */
      Vector() = default;

      /** Constructs a vector from a native `__m128i` type. */
      Vector(__m128i v) : v(v) {}

      /** Constructs a vector with all elements set to `x`.
      Deliberately implicit so that scalars can be mixed into vector expressions.
      */
      Vector(int32_t x) : v(_mm_set1_epi32(x)) {}

      /** Constructs a vector from four scalars, so that `x1` becomes element 0 and `x4` becomes element 3. */
      Vector(int32_t x1, int32_t x2, int32_t x3, int32_t x4) : v(_mm_setr_epi32(x1, x2, x3, x4)) {}

      /** Returns a vector with all 0 bits. */
      static Vector zero() {
          return Vector(_mm_setzero_si128());
      }

      /** Returns a vector with all 1 bits. */
      static Vector mask() {
          const __m128i zeros = _mm_setzero_si128();
          return Vector(_mm_cmpeq_epi32(zeros, zeros));
      }

      /** Reads an array of 4 values, preserving memory order (`x[0]` becomes element 0).
      `x` does not need to be 16-byte aligned.
      */
      static Vector load(const int32_t* x) {
          // _mm_loadu_si128() is the unaligned 128-bit integer load; it takes a `__m128i` pointer, hence the cast.
          return Vector(_mm_loadu_si128(reinterpret_cast<const __m128i*>(x)));
      }

      /** Writes an array of 4 values, preserving memory order (element 0 is written to `x[0]`).
      `x` does not need to be 16-byte aligned.
      */
      void store(int32_t* x) const {
          // _mm_storeu_si128() is the unaligned 128-bit integer store; it takes a `__m128i` pointer, hence the cast.
          _mm_storeu_si128(reinterpret_cast<__m128i*>(x), v);
      }

      /** Element access for simple serial code. No bounds checking is performed on `i`. */
      int32_t& operator[](int i) {
          return s[i];
      }
      const int32_t& operator[](int i) const {
          return s[i];
      }

      // Conversions

      /** Converts each float element to an integer, truncating toward zero. */
      Vector(Vector<float, 4> a);

      // Casts

      /** Reinterprets the bits of `a` as integers without changing them. */
      static Vector cast(Vector<float, 4> a);
  };


  // Conversions and casts

  inline Vector<float, 4>::Vector(Vector<int32_t, 4> a) : v(_mm_cvtepi32_ps(a.v)) {}

  inline Vector<int32_t, 4>::Vector(Vector<float, 4> a) : v(_mm_cvttps_epi32(a.v)) {}

  inline Vector<float, 4> Vector<float, 4>::cast(Vector<int32_t, 4> a) {
      return Vector(_mm_castsi128_ps(a.v));
  }

  inline Vector<int32_t, 4> Vector<int32_t, 4>::cast(Vector<float, 4> a) {
      return Vector(_mm_castps_si128(a.v));
  }


  // Operator overloads


  /** `a @ b`
  Defines the binary operator `op` for `Vector<T, N>` by applying the intrinsic `func` to the native vectors.
  */
  #define DECLARE_VECTOR_OPERATOR_INFIX(T, N, op, func) \
      inline Vector<T, N> op(const Vector<T, N>& a, const Vector<T, N>& b) { \
          return Vector<T, N>(func(a.v, b.v)); \
      }

  /** `a @= b`
  Defines the compound assignment `op` for `Vector<T, N>` in terms of the already-declared binary operator `opfunc`.
  */
  #define DECLARE_VECTOR_OPERATOR_INCREMENT(T, N, op, opfunc) \
      inline Vector<T, N>& op(Vector<T, N>& a, const Vector<T, N>& b) { \
          return a = opfunc(a, b); \
      }

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator+, _mm_add_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator+, _mm_add_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator-, _mm_sub_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator-, _mm_sub_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator*, _mm_mul_ps)
  // DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator*, NOT AVAILABLE IN SSE3)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator/, _mm_div_ps)
  // DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator/, NOT AVAILABLE IN SSE3)

  /* Use these to apply logic, bit masks, and conditions to elements.
  Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element.

  Examples:

  Subtract 1 from value if greater than or equal to 1.

      x -= (x >= 1.f) & 1.f;
  */

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator^, _mm_xor_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator^, _mm_xor_si128)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator&, _mm_and_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator&, _mm_and_si128)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator|, _mm_or_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator|, _mm_or_si128)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator+=, operator+)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator+=, operator+)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator-=, operator-)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator-=, operator-)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator*=, operator*)
  // DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator*=, NOT AVAILABLE IN SSE3)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator/=, operator/)
  // DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator/=, NOT AVAILABLE IN SSE3)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator^=, operator^)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator^=, operator^)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator&=, operator&)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator&=, operator&)

  DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator|=, operator|)
  DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator|=, operator|)

  // Comparisons. Integer comparisons only exist as ==, > and < intrinsics here,
  // so >=, <= and != are built by inverting the opposite comparison with an all-ones mask.

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator==, _mm_cmpeq_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator==, _mm_cmpeq_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator>=, _mm_cmpge_ps)
  /** `a >= b`, computed as NOT (a < b). */
  inline Vector<int32_t, 4> operator>=(const Vector<int32_t, 4>& a, const Vector<int32_t, 4>& b) {
      const Vector<int32_t, 4> less(_mm_cmplt_epi32(a.v, b.v));
      return less ^ Vector<int32_t, 4>::mask();
  }

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator>, _mm_cmpgt_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator>, _mm_cmpgt_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator<=, _mm_cmple_ps)
  /** `a <= b`, computed as NOT (a > b). */
  inline Vector<int32_t, 4> operator<=(const Vector<int32_t, 4>& a, const Vector<int32_t, 4>& b) {
      const Vector<int32_t, 4> greater(_mm_cmpgt_epi32(a.v, b.v));
      return greater ^ Vector<int32_t, 4>::mask();
  }

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator<, _mm_cmplt_ps)
  DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator<, _mm_cmplt_epi32)

  DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator!=, _mm_cmpneq_ps)
  /** `a != b`, computed as NOT (a == b). */
  inline Vector<int32_t, 4> operator!=(const Vector<int32_t, 4>& a, const Vector<int32_t, 4>& b) {
      const Vector<int32_t, 4> equal(_mm_cmpeq_epi32(a.v, b.v));
      return equal ^ Vector<int32_t, 4>::mask();
  }

  /** `+a` */
  inline Vector<float, 4> operator+(const Vector<float, 4>& a) {
      return a;
  }
  inline Vector<int32_t, 4> operator+(const Vector<int32_t, 4>& a) {
      return a;
  }

  /** `-a` */
  inline Vector<float, 4> operator-(const Vector<float, 4>& a) {
      // Flip the sign bit rather than computing `0.f - a`, so that +0 negates to -0 like scalar negation.
      return Vector<float, 4>(_mm_xor_ps(a.v, _mm_set1_ps(-0.f)));
  }
  inline Vector<int32_t, 4> operator-(const Vector<int32_t, 4>& a) {
      return Vector<int32_t, 4>(_mm_sub_epi32(_mm_setzero_si128(), a.v));
  }

  /** `++a` */
  inline Vector<float, 4>& operator++(Vector<float, 4>& a) {
      a += 1.f;
      return a;
  }
  inline Vector<int32_t, 4>& operator++(Vector<int32_t, 4>& a) {
      a += 1;
      return a;
  }

  /** `--a` */
  inline Vector<float, 4>& operator--(Vector<float, 4>& a) {
      a -= 1.f;
      return a;
  }
  inline Vector<int32_t, 4>& operator--(Vector<int32_t, 4>& a) {
      a -= 1;
      return a;
  }

  /** `a++`
  Returns the value held before the increment.
  */
  inline Vector<float, 4> operator++(Vector<float, 4>& a, int) {
      const Vector<float, 4> previous = a;
      ++a;
      return previous;
  }
  inline Vector<int32_t, 4> operator++(Vector<int32_t, 4>& a, int) {
      const Vector<int32_t, 4> previous = a;
      ++a;
      return previous;
  }

  /** `a--`
  Returns the value held before the decrement.
  */
  inline Vector<float, 4> operator--(Vector<float, 4>& a, int) {
      const Vector<float, 4> previous = a;
      --a;
      return previous;
  }
  inline Vector<int32_t, 4> operator--(Vector<int32_t, 4>& a, int) {
      const Vector<int32_t, 4> previous = a;
      --a;
      return previous;
  }

  /** `~a` */
  inline Vector<float, 4> operator~(const Vector<float, 4>& a) {
      return a ^ Vector<float, 4>::mask();
  }
  inline Vector<int32_t, 4> operator~(const Vector<int32_t, 4>& a) {
      return a ^ Vector<int32_t, 4>::mask();
  }

  /** `a << b`
  Shifts every element left by the same count `b`, filling with zeros.
  */
  inline Vector<int32_t, 4> operator<<(const Vector<int32_t, 4>& a, int b) {
      return Vector<int32_t, 4>(_mm_slli_epi32(a.v, b));
  }

  /** `a >> b`
  Shifts every element right by the same count `b`.
  NOTE: this is a logical shift (`_mm_srli_epi32`), so zeros are shifted in and the sign bit is not preserved,
  unlike `>>` on a negative scalar `int32_t` with common compilers.
  */
  inline Vector<int32_t, 4> operator>>(const Vector<int32_t, 4>& a, int b) {
      return Vector<int32_t, 4>(_mm_srli_epi32(a.v, b));
  }


  // Typedefs


  using float_4 = Vector<float, 4>;
  using int32_4 = Vector<int32_t, 4>;


  } // namespace simd
  } // namespace rack