You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

simd.hpp 5.7KB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. #include "sse_mathfun.h"
  2. #include <cstring>
  3. #include <cmath>
  4. #include <emmintrin.h>
  5. namespace rack {
  6. namespace dsp {
  /** Reinterprets the bit pattern of an int as a float.
  No numeric conversion takes place; the bytes are copied verbatim.
  */
  inline float cast_i32_f32(int i) {
      static_assert(sizeof(float) == sizeof(int), "int and float must be the same size");
      float bits_as_float;
      // memcpy is the well-defined way to type-pun between unrelated types.
      std::memcpy(&bits_as_float, &i, sizeof(bits_as_float));
      return bits_as_float;
  }
  /** Reinterprets the bit pattern of a float as an int.
  No numeric conversion takes place; the bytes are copied verbatim.
  This is the inverse of `cast_i32_f32()`.
  */
  inline int cast_f32_i32(float f) {
      static_assert(sizeof(int) == sizeof(float), "int and float must be the same size");
      // The destination must be an `int`. Copying into a `float` and returning it
      // would perform a value conversion (e.g. 1.0f -> 1) instead of a bit cast.
      int i;
      std::memcpy(&i, &f, sizeof(i));
      return i;
  }
  /** Generic class for vector float types.

  This class is designed to be used just like `float` scalars, with extra features for handling bitwise logic, conditions, loading, and storing.

  Usage example:

      float in[4], out[4];
      f32_4 a = f32_4::load(in);
      f32_4 b = 2.f * a / (1 - a);
      b *= sin(2 * M_PI * a);
      b.store(out);
  */
  template <int N>
  struct f32;

  /** Wrapper for `__m128` representing a vector of 4 single-precision float values.

  `load()` and `store()` use the unaligned SSE load/store intrinsics.
  */
  template <>
  struct f32<4> {
      __m128 v;

      /** Constructs an uninitialized vector. */
      f32() {}

      /** Constructs a vector from a native `__m128` type. */
      f32(__m128 v) : v(v) {}

      /** Constructs a vector with all elements set to `x`. */
      f32(float x) : v(_mm_set1_ps(x)) {}

      /** Constructs a vector from four values.
      `x1` is placed in the lowest element, so `store()` writes `x1, x2, x3, x4` in argument order,
      matching the element order used by `load()`.
      */
      f32(float x1, float x2, float x3, float x4) : v(_mm_setr_ps(x1, x2, x3, x4)) {}

      /** Reads an array of 4 values. */
      static f32 load(const float *x) {
          return f32(_mm_loadu_ps(x));
      }

      /** Returns a vector initialized to zero. */
      static f32 zero() {
          return f32(_mm_setzero_ps());
      }

      /** Writes an array of 4 values. Does not modify the vector. */
      void store(float *x) const {
          _mm_storeu_ps(x, v);
      }
  };

  typedef f32<4> f32_4;
  62. // Operator overloads
  /** `a @ b`
  Defines the binary operator `op` for two `f32_4` operands by applying `func` to their `v` members,
  plus two template overloads that convert a non-`f32_4` left or right operand to `f32_4` first.
  */
  #define DECLARE_F32_4_OPERATOR_INFIX(op, func) \
      inline f32_4 op(const f32_4 &a, const f32_4 &b) { \
          return f32_4(func(a.v, b.v)); \
      } \
      template <typename T> \
      f32_4 op(const T &a, const f32_4 &b) { \
          return op(f32_4(a), b); \
      } \
      template <typename T> \
      f32_4 op(const f32_4 &a, const T &b) { \
          return op(a, f32_4(b)); \
      }
  /** `a @= b`
  Defines the compound-assignment operator `op` for `f32_4` by assigning `opfunc(a, b)` to `a`,
  plus a template overload that converts a non-`f32_4` right operand to `f32_4` first.
  Both overloads return a reference to `a`.
  */
  #define DECLARE_F32_4_OPERATOR_INCREMENT(op, opfunc) \
      inline f32_4 &op(f32_4 &a, const f32_4 &b) { \
          return a = opfunc(a, b); \
      } \
      template <typename T> \
      f32_4 &op(f32_4 &a, const T &b) { \
          return op(a, f32_4(b)); \
      }
  86. DECLARE_F32_4_OPERATOR_INFIX(operator+, _mm_add_ps)
  87. DECLARE_F32_4_OPERATOR_INFIX(operator-, _mm_sub_ps)
  88. DECLARE_F32_4_OPERATOR_INFIX(operator*, _mm_mul_ps)
  89. DECLARE_F32_4_OPERATOR_INFIX(operator/, _mm_div_ps)
  90. /**
  91. Use these to apply logic, bit masks, and conditions to elements.
  92. Examples:
  93. Subtract 1 from value if greater than or equal to 1.
  94. x -= (x >= 1.f) & 1.f;
  95. */
  96. DECLARE_F32_4_OPERATOR_INFIX(operator^, _mm_xor_ps)
  97. DECLARE_F32_4_OPERATOR_INFIX(operator&, _mm_and_ps)
  98. DECLARE_F32_4_OPERATOR_INFIX(operator|, _mm_mul_ps)
  99. /** Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element.
  100. */
  101. DECLARE_F32_4_OPERATOR_INCREMENT(operator+=, operator+);
  102. DECLARE_F32_4_OPERATOR_INCREMENT(operator-=, operator-);
  103. DECLARE_F32_4_OPERATOR_INCREMENT(operator*=, operator*);
  104. DECLARE_F32_4_OPERATOR_INCREMENT(operator/=, operator/);
  105. DECLARE_F32_4_OPERATOR_INCREMENT(operator^=, operator^);
  106. DECLARE_F32_4_OPERATOR_INCREMENT(operator&=, operator&);
  107. DECLARE_F32_4_OPERATOR_INCREMENT(operator|=, operator|);
  108. /** `+a` */
  109. inline f32_4 operator+(const f32_4 &a) {
  110. return a;
  111. }
  112. /** `-a` */
  113. inline f32_4 operator-(const f32_4 &a) {
  114. return 0.f - a;
  115. }
  116. /** `++a` */
  117. inline f32_4 &operator++(f32_4 &a) {
  118. a += 1.f;
  119. return a;
  120. }
  121. /** `--a` */
  122. inline f32_4 &operator--(f32_4 &a) {
  123. a -= 1.f;
  124. return a;
  125. }
  126. /** `a++` */
  127. inline f32_4 operator++(f32_4 &a, int) {
  128. f32_4 b = a;
  129. ++a;
  130. return b;
  131. }
  132. /** `a--` */
  133. inline f32_4 operator--(f32_4 &a, int) {
  134. f32_4 b = a;
  135. --a;
  136. return b;
  137. }
  138. /** `~a` */
  139. inline f32_4 operator~(const f32_4 &a) {
  140. return f32_4(_mm_xor_ps(a.v, _mm_cmpeq_ps(a.v, a.v)));
  141. }
  142. DECLARE_F32_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps)
  143. DECLARE_F32_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps)
  144. DECLARE_F32_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps)
  145. DECLARE_F32_4_OPERATOR_INFIX(operator<=, _mm_cmple_ps)
  146. DECLARE_F32_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps)
  147. DECLARE_F32_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps)
  148. // Math functions
  149. inline f32_4 fmax(f32_4 x, f32_4 b) {
  150. return f32_4(_mm_max_ps(x.v, b.v));
  151. }
  152. inline f32_4 fmin(f32_4 x, f32_4 b) {
  153. return f32_4(_mm_min_ps(x.v, b.v));
  154. }
  155. inline f32_4 sqrt(f32_4 x) {
  156. return f32_4(_mm_sqrt_ps(x.v));
  157. }
  158. /** Returns the approximate reciprocal square root.
  159. Much faster than `1/sqrt(x)`.
  160. */
  161. inline f32_4 rsqrt(f32_4 x) {
  162. return f32_4(_mm_rsqrt_ps(x.v));
  163. }
  164. /** Returns the approximate reciprocal.
  165. Much faster than `1/x`.
  166. */
  167. inline f32_4 rcp(f32_4 x) {
  168. return f32_4(_mm_rcp_ps(x.v));
  169. }
  170. inline f32_4 log(f32_4 x) {
  171. return f32_4(sse_mathfun_log_ps(x.v));
  172. }
  173. inline f32_4 exp(f32_4 x) {
  174. return f32_4(sse_mathfun_exp_ps(x.v));
  175. }
  176. inline f32_4 sin(f32_4 x) {
  177. return f32_4(sse_mathfun_sin_ps(x.v));
  178. }
  179. inline f32_4 cos(f32_4 x) {
  180. return f32_4(sse_mathfun_cos_ps(x.v));
  181. }
  182. inline f32_4 floor(f32_4 a) {
  183. return f32_4(sse_mathfun_floor_ps(a.v));
  184. }
  185. inline f32_4 ceil(f32_4 a) {
  186. return f32_4(sse_mathfun_ceil_ps(a.v));
  187. }
  188. inline f32_4 round(f32_4 a) {
  189. return f32_4(sse_mathfun_round_ps(a.v));
  190. }
  191. inline f32_4 fmod(f32_4 a, f32_4 b) {
  192. return f32_4(sse_mathfun_fmod_ps(a.v, b.v));
  193. }
  194. inline f32_4 fabs(f32_4 a) {
  195. return f32_4(sse_mathfun_fabs_ps(a.v));
  196. }
  197. inline f32_4 trunc(f32_4 a) {
  198. return f32_4(sse_mathfun_trunc_ps(a.v));
  199. }
  200. inline f32_4 pow(f32_4 a, f32_4 b) {
  201. return exp(b * log(a));
  202. }
  203. inline f32_4 pow(float a, f32_4 b) {
  204. return exp(b * std::log(a));
  205. }
  206. } // namespace dsp
  207. } // namespace rack