You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

254 lines
5.4KB

  1. #include "sse_mathfun.h"
  2. #include <cstring>
  3. #include <emmintrin.h>
  4. namespace rack {
  5. namespace dsp {
  /** Reinterprets the bit pattern of an int as a float. No numeric conversion takes place. */
  inline float cast_i32_f32(int i) {
      static_assert(sizeof(int) == sizeof(float), "int and float must be the same size");
      // Copying the bytes is the well-defined way to type-pun;
      // this is expected to be optimized down to plain `mov` instructions.
      float reinterpreted;
      std::memcpy(&reinterpreted, &i, sizeof(float));
      return reinterpreted;
  }
  /** Casts a float to int, bitwise without conversion.
  The returned int holds exactly the bytes of `f`; e.g. `1.f` yields `0x3f800000`.
  */
  inline int cast_f32_i32(float f) {
      static_assert(sizeof(int) == sizeof(float), "int and float must be the same size");
      // The destination must be an `int`: copying into a `float` and returning it
      // would perform a numeric float-to-int conversion instead of a bitwise cast.
      int i;
      std::memcpy(&i, &f, sizeof(i));
      return i;
  }
  /** Generic class for vector float types.

  This class is designed to be used just like `float` scalars, with extra features for handling bitwise logic, conditions, loading, and storing.

  Usage example:

      float in[4], out[4];
      f32_4 a = f32_4::load(in);
      f32_4 b = 2.f * a / (1 - a);
      b *= sin(2 * M_PI * a);
      b.store(out);
  */
  template <int N>
  struct f32;

  /** Wrapper for `__m128` representing a vector of 4 single-precision float values.

  `load()` and `store()` use the unaligned load/store intrinsics, so the float arrays passed to them do not need 16-byte alignment.
  */
  template <>
  struct f32<4> {
      __m128 v;

      /** Constructs an uninitialized vector. */
      f32() {}

      /** Constructs a vector from a native `__m128` type. */
      f32(__m128 v) : v(v) {}

      /** Constructs a vector with all elements set to `x`. */
      f32(float x) : v(_mm_set1_ps(x)) {}

      /** Reads an array of 4 values. */
      static f32 load(const float *x) {
          return f32(_mm_loadu_ps(x));
      }

      /** Returns a vector initialized to zero. */
      static f32 zero() {
          return f32(_mm_setzero_ps());
      }

      /** Writes an array of 4 values.
      Declared `const` because it only reads the vector, so const vectors can be stored too.
      */
      void store(float *x) const {
          _mm_storeu_ps(x, v);
      }
  };

  typedef f32<4> f32_4;
  57. // Operator overloads
  /** Defines the binary operator `a @ b`.
  `op` is the full operator name (e.g. `operator+`) and `intrinsic` is the function applied to the two `__m128` members.
  Two templated overloads are also generated; they convert a non-vector operand on either side to `f32_4` and forward to the vector/vector overload.
  */
  #define DECLARE_F32_4_OPERATOR_INFIX(op, intrinsic) \
      inline f32_4 op(const f32_4 &lhs, const f32_4 &rhs) { \
          return f32_4(intrinsic(lhs.v, rhs.v)); \
      } \
      template <typename T> \
      f32_4 op(const T &lhs, const f32_4 &rhs) { \
          return op(f32_4(lhs), rhs); \
      } \
      template <typename T> \
      f32_4 op(const f32_4 &lhs, const T &rhs) { \
          return op(lhs, f32_4(rhs)); \
      }
  /** Defines the compound assignment operator `a @= b`.
  `op` is the full operator name (e.g. `operator+=`) and `binary_op` is the matching binary operator (e.g. `operator+`) whose result is assigned back to the left operand.
  A templated overload converts a non-vector right operand to `f32_4` first.
  */
  #define DECLARE_F32_4_OPERATOR_INCREMENT(op, binary_op) \
      inline f32_4 &op(f32_4 &target, const f32_4 &rhs) { \
          return target = binary_op(target, rhs); \
      } \
      template <typename T> \
      f32_4 &op(f32_4 &target, const T &rhs) { \
          return op(target, f32_4(rhs)); \
      }
  81. DECLARE_F32_4_OPERATOR_INFIX(operator+, _mm_add_ps)
  82. DECLARE_F32_4_OPERATOR_INFIX(operator-, _mm_sub_ps)
  83. DECLARE_F32_4_OPERATOR_INFIX(operator*, _mm_mul_ps)
  84. DECLARE_F32_4_OPERATOR_INFIX(operator/, _mm_div_ps)
  85. /**
  86. Use these to apply logic, bit masks, and conditions to elements.
  87. Examples:
  88. Subtract 1 from value if greater than or equal to 1.
  89. x -= (x >= 1.f) & 1.f;
  90. */
  91. DECLARE_F32_4_OPERATOR_INFIX(operator^, _mm_xor_ps)
  92. DECLARE_F32_4_OPERATOR_INFIX(operator&, _mm_and_ps)
  93. DECLARE_F32_4_OPERATOR_INFIX(operator|, _mm_mul_ps)
  94. /** Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element.
  95. */
  96. DECLARE_F32_4_OPERATOR_INCREMENT(operator+=, operator+);
  97. DECLARE_F32_4_OPERATOR_INCREMENT(operator-=, operator-);
  98. DECLARE_F32_4_OPERATOR_INCREMENT(operator*=, operator*);
  99. DECLARE_F32_4_OPERATOR_INCREMENT(operator/=, operator/);
  100. DECLARE_F32_4_OPERATOR_INCREMENT(operator^=, operator^);
  101. DECLARE_F32_4_OPERATOR_INCREMENT(operator&=, operator&);
  102. DECLARE_F32_4_OPERATOR_INCREMENT(operator|=, operator|);
  103. /** `+a` */
  104. inline f32_4 operator+(const f32_4 &a) {
  105. return a;
  106. }
  107. /** `-a` */
  108. inline f32_4 operator-(const f32_4 &a) {
  109. return 0.f - a;
  110. }
  111. /** `++a` */
  112. inline f32_4 &operator++(f32_4 &a) {
  113. a += 1.f;
  114. return a;
  115. }
  116. /** `--a` */
  117. inline f32_4 &operator--(f32_4 &a) {
  118. a -= 1.f;
  119. return a;
  120. }
  121. /** `a++` */
  122. inline f32_4 operator++(f32_4 &a, int) {
  123. f32_4 b = a;
  124. ++a;
  125. return b;
  126. }
  127. /** `a--` */
  128. inline f32_4 operator--(f32_4 &a, int) {
  129. f32_4 b = a;
  130. --a;
  131. return b;
  132. }
  133. /** `~a` */
  134. inline f32_4 operator~(const f32_4 &a) {
  135. return f32_4(_mm_xor_ps(a.v, _mm_cmpeq_ps(a.v, a.v)));
  136. }
  137. DECLARE_F32_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps)
  138. DECLARE_F32_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps)
  139. DECLARE_F32_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps)
  140. DECLARE_F32_4_OPERATOR_INFIX(operator<=, _mm_cmple_ps)
  141. DECLARE_F32_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps)
  142. DECLARE_F32_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps)
  143. // Math functions
  144. inline f32_4 fmax(f32_4 x, f32_4 b) {
  145. return f32_4(_mm_max_ps(x.v, b.v));
  146. }
  147. inline f32_4 fmin(f32_4 x, f32_4 b) {
  148. return f32_4(_mm_min_ps(x.v, b.v));
  149. }
  150. inline f32_4 sqrt(f32_4 x) {
  151. return f32_4(_mm_sqrt_ps(x.v));
  152. }
  153. /** Returns the approximate reciprocal square root.
  154. Much faster than `1/sqrt(x)`.
  155. */
  156. inline f32_4 rsqrt(f32_4 x) {
  157. return f32_4(_mm_rsqrt_ps(x.v));
  158. }
  159. /** Returns the approximate reciprocal.
  160. Much faster than `1/x`.
  161. */
  162. inline f32_4 rcp(f32_4 x) {
  163. return f32_4(_mm_rcp_ps(x.v));
  164. }
  165. inline f32_4 log(f32_4 x) {
  166. return f32_4(sse_mathfun_log_ps(x.v));
  167. }
  168. inline f32_4 exp(f32_4 x) {
  169. return f32_4(sse_mathfun_exp_ps(x.v));
  170. }
  171. inline f32_4 sin(f32_4 x) {
  172. return f32_4(sse_mathfun_sin_ps(x.v));
  173. }
  174. inline f32_4 cos(f32_4 x) {
  175. return f32_4(sse_mathfun_cos_ps(x.v));
  176. }
  177. inline f32_4 floor(f32_4 a) {
  178. return f32_4(sse_mathfun_floor_ps(a.v));
  179. }
  180. inline f32_4 ceil(f32_4 a) {
  181. return f32_4(sse_mathfun_ceil_ps(a.v));
  182. }
  183. inline f32_4 round(f32_4 a) {
  184. return f32_4(sse_mathfun_round_ps(a.v));
  185. }
  186. inline f32_4 fmod(f32_4 a, f32_4 b) {
  187. return f32_4(sse_mathfun_fmod_ps(a.v, b.v));
  188. }
  189. inline f32_4 fabs(f32_4 a) {
  190. return f32_4(sse_mathfun_fabs_ps(a.v));
  191. }
  192. inline f32_4 trunc(f32_4 a) {
  193. return f32_4(sse_mathfun_trunc_ps(a.v));
  194. }
  195. } // namespace dsp
  196. } // namespace rack