You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

259 lines
5.5KB

  1. #include "sse_mathfun.h"
  2. #include <cstring>
  3. #include <emmintrin.h>
  4. namespace rack {
  5. namespace dsp {
  /** Reinterprets the bit pattern of an `int` as a `float`.
  No numeric conversion takes place: the bytes of `i` are copied verbatim.
  */
  inline float cast_i32_f32(int i) {
      static_assert(sizeof(float) == sizeof(int), "int and float must be the same size");
      float reinterpreted;
      // memcpy is the well-defined way to type-pun; compilers are expected to
      // reduce it to plain register moves.
      std::memcpy(&reinterpreted, &i, sizeof(reinterpreted));
      return reinterpreted;
  }
  /** Casts a float to an int, bitwise without conversion.
  Returns the raw bit pattern of `f`, e.g. `1.f` yields `0x3f800000`.
  */
  inline int cast_f32_i32(float f) {
      static_assert(sizeof(int) == sizeof(float), "int and float must be the same size");
      // The destination must be an `int`: copying into a `float` and returning it
      // would perform a numeric float-to-int conversion instead of a bit cast.
      int i;
      std::memcpy(&i, &f, sizeof(i));
      return i;
  }
  /** Generic class for vector float types.

  This class is designed to be used just like `float` scalars, with extra features for handling bitwise logic, conditions, loading, and storing.

  Usage example:

      float in[4], out[4];
      f32_4 a = f32_4::load(in);
      f32_4 b = 2.f * a / (1 - a);
      b *= sin(2 * M_PI * a);
      b.store(out);
  */
  template <int N>
  struct f32;


  /** Wrapper for `__m128` representing an aligned vector of 4 single-precision float values.
  */
  template <>
  struct f32<4> {
      /** The underlying native SSE register value. */
      __m128 v;

      /** Constructs an uninitialized vector. */
      f32() {}

      /** Constructs a vector from a native `__m128` type. */
      f32(__m128 v) : v(v) {}

      /** Constructs a vector with all elements set to `x`. */
      f32(float x) : v(_mm_set1_ps(x)) {}

      /** Constructs a vector from four values.
      `x1` becomes element 0 and `x4` element 3, i.e. the same order that
      `load()` reads and `store()` writes. `_mm_setr_ps` is required for this;
      `_mm_set_ps` takes its arguments from the highest element down.
      */
      f32(float x1, float x2, float x3, float x4) : v(_mm_setr_ps(x1, x2, x3, x4)) {}

      /** Reads an array of 4 values.
      Uses an unaligned load, so `x` does not need 16-byte alignment.
      */
      static f32 load(const float *x) {
          return f32(_mm_loadu_ps(x));
      }

      /** Returns a vector initialized to zero. */
      static f32 zero() {
          return f32(_mm_setzero_ps());
      }

      /** Writes an array of 4 values.
      Uses an unaligned store, so `x` does not need 16-byte alignment.
      */
      void store(float *x) const {
          _mm_storeu_ps(x, v);
      }
  };


  /** Convenience alias for the 4-element float vector. */
  using f32_4 = f32<4>;
  // Operator overloads


  /** `a @ b`
  Defines three overloads of the binary operator function `op_name`, each
  applying the two-operand intrinsic `intrinsic` to the wrapped `__m128` values:
  vector @ vector, vector @ T, and T @ vector. A non-vector operand is first
  converted by constructing an `f32_4` from it.
  */
  #define DECLARE_F32_4_OPERATOR_INFIX(op_name, intrinsic) \
      inline f32_4 op_name(const f32_4 &a, const f32_4 &b) { \
          return f32_4(intrinsic(a.v, b.v)); \
      } \
      template <typename T> \
      f32_4 op_name(const f32_4 &a, const T &b) { \
          return f32_4(intrinsic(a.v, f32_4(b).v)); \
      } \
      template <typename T> \
      f32_4 op_name(const T &a, const f32_4 &b) { \
          return f32_4(intrinsic(f32_4(a).v, b.v)); \
      }
  /** `a @= b`
  Defines the compound-assignment function `assign_op` in terms of the
  already-declared binary operator function `binary_op`: the result of
  `binary_op(a, b)` is assigned back to `a`, and `a` is returned by reference.
  A second, templated overload accepts any right-hand side from which an
  `f32_4` can be constructed.
  */
  #define DECLARE_F32_4_OPERATOR_INCREMENT(assign_op, binary_op) \
      inline f32_4 &assign_op(f32_4 &a, const f32_4 &b) { \
          return a = binary_op(a, b); \
      } \
      template <typename T> \
      f32_4 &assign_op(f32_4 &a, const T &b) { \
          return assign_op(a, f32_4(b)); \
      }
  85. DECLARE_F32_4_OPERATOR_INFIX(operator+, _mm_add_ps)
  86. DECLARE_F32_4_OPERATOR_INFIX(operator-, _mm_sub_ps)
  87. DECLARE_F32_4_OPERATOR_INFIX(operator*, _mm_mul_ps)
  88. DECLARE_F32_4_OPERATOR_INFIX(operator/, _mm_div_ps)
  89. /**
  90. Use these to apply logic, bit masks, and conditions to elements.
  91. Examples:
  92. Subtract 1 from value if greater than or equal to 1.
  93. x -= (x >= 1.f) & 1.f;
  94. */
  95. DECLARE_F32_4_OPERATOR_INFIX(operator^, _mm_xor_ps)
  96. DECLARE_F32_4_OPERATOR_INFIX(operator&, _mm_and_ps)
  97. DECLARE_F32_4_OPERATOR_INFIX(operator|, _mm_mul_ps)
  98. /** Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element.
  99. */
  100. DECLARE_F32_4_OPERATOR_INCREMENT(operator+=, operator+);
  101. DECLARE_F32_4_OPERATOR_INCREMENT(operator-=, operator-);
  102. DECLARE_F32_4_OPERATOR_INCREMENT(operator*=, operator*);
  103. DECLARE_F32_4_OPERATOR_INCREMENT(operator/=, operator/);
  104. DECLARE_F32_4_OPERATOR_INCREMENT(operator^=, operator^);
  105. DECLARE_F32_4_OPERATOR_INCREMENT(operator&=, operator&);
  106. DECLARE_F32_4_OPERATOR_INCREMENT(operator|=, operator|);
  107. /** `+a` */
  108. inline f32_4 operator+(const f32_4 &a) {
  109. return a;
  110. }
  111. /** `-a` */
  112. inline f32_4 operator-(const f32_4 &a) {
  113. return 0.f - a;
  114. }
  115. /** `++a` */
  116. inline f32_4 &operator++(f32_4 &a) {
  117. a += 1.f;
  118. return a;
  119. }
  120. /** `--a` */
  121. inline f32_4 &operator--(f32_4 &a) {
  122. a -= 1.f;
  123. return a;
  124. }
  125. /** `a++` */
  126. inline f32_4 operator++(f32_4 &a, int) {
  127. f32_4 b = a;
  128. ++a;
  129. return b;
  130. }
  131. /** `a--` */
  132. inline f32_4 operator--(f32_4 &a, int) {
  133. f32_4 b = a;
  134. --a;
  135. return b;
  136. }
  137. /** `~a` */
  138. inline f32_4 operator~(const f32_4 &a) {
  139. return f32_4(_mm_xor_ps(a.v, _mm_cmpeq_ps(a.v, a.v)));
  140. }
  141. DECLARE_F32_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps)
  142. DECLARE_F32_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps)
  143. DECLARE_F32_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps)
  144. DECLARE_F32_4_OPERATOR_INFIX(operator<=, _mm_cmple_ps)
  145. DECLARE_F32_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps)
  146. DECLARE_F32_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps)
  147. // Math functions
  148. inline f32_4 fmax(f32_4 x, f32_4 b) {
  149. return f32_4(_mm_max_ps(x.v, b.v));
  150. }
  151. inline f32_4 fmin(f32_4 x, f32_4 b) {
  152. return f32_4(_mm_min_ps(x.v, b.v));
  153. }
  154. inline f32_4 sqrt(f32_4 x) {
  155. return f32_4(_mm_sqrt_ps(x.v));
  156. }
  157. /** Returns the approximate reciprocal square root.
  158. Much faster than `1/sqrt(x)`.
  159. */
  160. inline f32_4 rsqrt(f32_4 x) {
  161. return f32_4(_mm_rsqrt_ps(x.v));
  162. }
  163. /** Returns the approximate reciprocal.
  164. Much faster than `1/x`.
  165. */
  166. inline f32_4 rcp(f32_4 x) {
  167. return f32_4(_mm_rcp_ps(x.v));
  168. }
  169. inline f32_4 log(f32_4 x) {
  170. return f32_4(sse_mathfun_log_ps(x.v));
  171. }
  172. inline f32_4 exp(f32_4 x) {
  173. return f32_4(sse_mathfun_exp_ps(x.v));
  174. }
  175. inline f32_4 sin(f32_4 x) {
  176. return f32_4(sse_mathfun_sin_ps(x.v));
  177. }
  178. inline f32_4 cos(f32_4 x) {
  179. return f32_4(sse_mathfun_cos_ps(x.v));
  180. }
  181. inline f32_4 floor(f32_4 a) {
  182. return f32_4(sse_mathfun_floor_ps(a.v));
  183. }
  184. inline f32_4 ceil(f32_4 a) {
  185. return f32_4(sse_mathfun_ceil_ps(a.v));
  186. }
  187. inline f32_4 round(f32_4 a) {
  188. return f32_4(sse_mathfun_round_ps(a.v));
  189. }
  190. inline f32_4 fmod(f32_4 a, f32_4 b) {
  191. return f32_4(sse_mathfun_fmod_ps(a.v, b.v));
  192. }
  193. inline f32_4 fabs(f32_4 a) {
  194. return f32_4(sse_mathfun_fabs_ps(a.v));
  195. }
  196. inline f32_4 trunc(f32_4 a) {
  197. return f32_4(sse_mathfun_trunc_ps(a.v));
  198. }
  199. } // namespace dsp
  200. } // namespace rack