You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

259 lines
5.4KB

  1. #include "sse_mathfun.h"
  2. #include <cstring>
  3. #include <cmath>
  4. #include <x86intrin.h>
  5. namespace rack {
  6. namespace dsp {
  /** Reinterprets the object representation of a FROM value as a TO value, with no numeric conversion.
  Both types must have the same size; this is enforced at compile time.
  The API mirrors `std::bit_cast` from C++20.

  Usage example:

      printf("%08x\n", bit_cast<int>(1.f)); // Prints 3f800000
  */
  template <typename TO, typename FROM>
  TO bit_cast(const FROM &x) {
      static_assert(sizeof(FROM) == sizeof(TO), "types must have equal size");
      // Copy the raw bytes of `x` into the result.
      TO result;
      std::memcpy(&result, &x, sizeof(FROM));
      return result;
  }
  /** Generic class for vector float types.

  This class is designed to be used just like `float` scalars, with extra features for handling bitwise logic, conditions, loading, and storing.

  Usage example:

      float in[4], out[4];
      f32_4 a = f32_4::load(in);
      f32_4 b = 2.f * a / (1 - a);
      b *= sin(2 * M_PI * a);
      b.store(out);
  */
  template <int N>
  struct f32;

  /** Wrapper for `__m128` representing a vector of 4 single-precision float values.
  `load()` and `store()` use the unaligned load/store intrinsics.
  */
  template <>
  struct f32<4> {
      __m128 v;

      /** Constructs an uninitialized vector. */
      f32() {}

      /** Constructs a vector from a native `__m128` type. */
      f32(__m128 v) : v(v) {}

      /** Constructs a vector with all elements set to `x`.
      Intentionally implicit so that scalars can be mixed with vectors in expressions.
      */
      f32(float x) {
          v = _mm_set_ps1(x);
      }

      /** Constructs a vector from four values.
      `x1` becomes element 0 and `x4` element 3, i.e. the same order `load()` reads and `store()` writes.
      `_mm_setr_ps` is required for this: `_mm_set_ps` takes its arguments from the highest element down.
      */
      f32(float x1, float x2, float x3, float x4) {
          v = _mm_setr_ps(x1, x2, x3, x4);
      }

      /** Reads an array of 4 values. */
      static f32 load(const float *x) {
          return f32(_mm_loadu_ps(x));
      }

      /** Returns a vector initialized to zero. */
      static f32 zero() {
          return f32(_mm_setzero_ps());
      }

      /** Writes an array of 4 values.
      Does not modify the vector, so it can be called on const vectors.
      */
      void store(float *x) const {
          _mm_storeu_ps(x, v);
      }
  };

  typedef f32<4> f32_4;
  62. // Operator overloads
  /** Defines the binary operator `a @ b` for `f32_4`.
  `op_name` is the full operator name (e.g. `operator+`) and `intrinsic` is the function applied to the two underlying `__m128` values.
  */
  #define DECLARE_F32_4_OPERATOR_INFIX(op_name, intrinsic) \
      inline f32_4 op_name(const f32_4 &lhs, const f32_4 &rhs) { \
          return f32_4(intrinsic(lhs.v, rhs.v)); \
      }
  /** Defines the compound assignment operator `a @= b` for `f32_4`.
  `op_name` is the full operator name (e.g. `operator+=`) and `binary_op` is the matching binary operator function (e.g. `operator+`) whose result is assigned back to `a`.
  */
  #define DECLARE_F32_4_OPERATOR_INCREMENT(op_name, binary_op) \
      inline f32_4 &op_name(f32_4 &a, const f32_4 &b) { \
          return a = binary_op(a, b); \
      }
  74. DECLARE_F32_4_OPERATOR_INFIX(operator+, _mm_add_ps)
  75. DECLARE_F32_4_OPERATOR_INFIX(operator-, _mm_sub_ps)
  76. DECLARE_F32_4_OPERATOR_INFIX(operator*, _mm_mul_ps)
  77. DECLARE_F32_4_OPERATOR_INFIX(operator/, _mm_div_ps)
  78. /**
  79. Use these to apply logic, bit masks, and conditions to elements.
  80. Examples:
  81. Subtract 1 from value if greater than or equal to 1.
  82. x -= (x >= 1.f) & 1.f;
  83. */
  84. DECLARE_F32_4_OPERATOR_INFIX(operator^, _mm_xor_ps)
  85. DECLARE_F32_4_OPERATOR_INFIX(operator&, _mm_and_ps)
  86. DECLARE_F32_4_OPERATOR_INFIX(operator|, _mm_mul_ps)
  87. /** Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element.
  88. */
  89. DECLARE_F32_4_OPERATOR_INCREMENT(operator+=, operator+);
  90. DECLARE_F32_4_OPERATOR_INCREMENT(operator-=, operator-);
  91. DECLARE_F32_4_OPERATOR_INCREMENT(operator*=, operator*);
  92. DECLARE_F32_4_OPERATOR_INCREMENT(operator/=, operator/);
  93. DECLARE_F32_4_OPERATOR_INCREMENT(operator^=, operator^);
  94. DECLARE_F32_4_OPERATOR_INCREMENT(operator&=, operator&);
  95. DECLARE_F32_4_OPERATOR_INCREMENT(operator|=, operator|);
  96. /** `+a` */
  97. inline f32_4 operator+(const f32_4 &a) {
  98. return a;
  99. }
  100. /** `-a` */
  101. inline f32_4 operator-(const f32_4 &a) {
  102. return 0.f - a;
  103. }
  104. /** `++a` */
  105. inline f32_4 &operator++(f32_4 &a) {
  106. a += 1.f;
  107. return a;
  108. }
  109. /** `--a` */
  110. inline f32_4 &operator--(f32_4 &a) {
  111. a -= 1.f;
  112. return a;
  113. }
  114. /** `a++` */
  115. inline f32_4 operator++(f32_4 &a, int) {
  116. f32_4 b = a;
  117. ++a;
  118. return b;
  119. }
  120. /** `a--` */
  121. inline f32_4 operator--(f32_4 &a, int) {
  122. f32_4 b = a;
  123. --a;
  124. return b;
  125. }
  126. /** `~a` */
  127. inline f32_4 operator~(const f32_4 &a) {
  128. return f32_4(_mm_xor_ps(a.v, _mm_cmpeq_ps(a.v, a.v)));
  129. }
  130. DECLARE_F32_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps)
  131. DECLARE_F32_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps)
  132. DECLARE_F32_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps)
  133. DECLARE_F32_4_OPERATOR_INFIX(operator<=, _mm_cmple_ps)
  134. DECLARE_F32_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps)
  135. DECLARE_F32_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps)
  136. // Math functions
  137. inline f32_4 fmax(f32_4 x, f32_4 b) {
  138. return f32_4(_mm_max_ps(x.v, b.v));
  139. }
  140. inline f32_4 fmin(f32_4 x, f32_4 b) {
  141. return f32_4(_mm_min_ps(x.v, b.v));
  142. }
  143. inline f32_4 sqrt(f32_4 x) {
  144. return f32_4(_mm_sqrt_ps(x.v));
  145. }
  146. /** Returns the approximate reciprocal square root.
  147. Much faster than `1/sqrt(x)`.
  148. */
  149. inline f32_4 rsqrt(f32_4 x) {
  150. return f32_4(_mm_rsqrt_ps(x.v));
  151. }
  152. /** Returns the approximate reciprocal.
  153. Much faster than `1/x`.
  154. */
  155. inline f32_4 rcp(f32_4 x) {
  156. return f32_4(_mm_rcp_ps(x.v));
  157. }
  158. inline f32_4 log(f32_4 x) {
  159. return f32_4(sse_mathfun_log_ps(x.v));
  160. }
  161. inline f32_4 exp(f32_4 x) {
  162. return f32_4(sse_mathfun_exp_ps(x.v));
  163. }
  164. inline f32_4 sin(f32_4 x) {
  165. return f32_4(sse_mathfun_sin_ps(x.v));
  166. }
  167. inline f32_4 cos(f32_4 x) {
  168. return f32_4(sse_mathfun_cos_ps(x.v));
  169. }
  170. inline f32_4 floor(f32_4 a) {
  171. return f32_4(sse_mathfun_floor_ps(a.v));
  172. }
  173. inline f32_4 ceil(f32_4 a) {
  174. return f32_4(sse_mathfun_ceil_ps(a.v));
  175. }
  176. inline f32_4 round(f32_4 a) {
  177. return f32_4(sse_mathfun_round_ps(a.v));
  178. }
  179. inline f32_4 fmod(f32_4 a, f32_4 b) {
  180. return f32_4(sse_mathfun_fmod_ps(a.v, b.v));
  181. }
  182. inline f32_4 fabs(f32_4 a) {
  183. return f32_4(sse_mathfun_fabs_ps(a.v));
  184. }
  185. inline f32_4 trunc(f32_4 a) {
  186. return f32_4(sse_mathfun_trunc_ps(a.v));
  187. }
  188. inline f32_4 pow(f32_4 a, f32_4 b) {
  189. return exp(b * log(a));
  190. }
  191. inline f32_4 pow(float a, f32_4 b) {
  192. return exp(b * std::log(a));
  193. }
  194. } // namespace dsp
  195. } // namespace rack