You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

238 lines
4.9KB

  1. #include "common.hpp"
  2. #include "sse_mathfun.h"
  3. #include <emmintrin.h>
  4. namespace rack {
  5. namespace dsp {
  /** Reinterprets the bits of a 32-bit integer as a single-precision float.
  The bytes are copied with `std::memcpy`; no numeric conversion takes place.
  */
  inline float cast_i32_f32(int i) {
      float reinterpreted;
      std::memcpy(&reinterpreted, &i, sizeof(reinterpreted));
      return reinterpreted;
  }
  /** Reinterprets the bits of a single-precision float as a 32-bit integer.
  The bytes are copied with `std::memcpy`; no numeric conversion takes place,
  so `cast_f32_i32(1.f)` yields `0x3f800000`, not `1`.
  */
  inline int cast_f32_i32(float f) {
      static_assert(sizeof(int) == sizeof(float), "cast_f32_i32 requires int and float to have the same size");
      // The destination must be an int: copying into a float and returning it
      // would perform a value conversion (truncation) instead of a bit cast.
      int i;
      std::memcpy(&i, &f, sizeof(i));
      return i;
  }
  /** Generic SIMD float vector of `N` elements. Only the specializations are defined. */
  template <int N>
  struct f32;

  /** Wrapper for `__m128` representing a vector of 4 single-precision float values.
  `load()` and `store()` use the unaligned load/store intrinsics.
  */
  template <>
  struct f32<4> {
      /** The underlying native SSE register value. */
      __m128 v;

      /** Constructs an uninitialized vector. */
      f32() {}

      /** Constructs a vector from a native `__m128` type. */
      f32(__m128 v) : v(v) {}

      /** Constructs a vector with all elements set to `x`. */
      f32(float x) : v(_mm_set_ps1(x)) {}

      /** Reads an array of 4 values. */
      static f32 load(const float *x) {
          return f32(_mm_loadu_ps(x));
      }

      /** Returns a vector initialized to zero. */
      static f32 zero() {
          return f32(_mm_setzero_ps());
      }

      /** Writes an array of 4 values.
      Does not modify the vector, so it may be called on const objects.
      */
      void store(float *x) const {
          _mm_storeu_ps(x, v);
      }
  };

  typedef f32<4> f32_4;
  // Operator overloads

  /** Defines the binary operator `lhs op rhs`.
  `op` is the full operator name (e.g. `operator+`) and `func` is the function
  applied to the two `__m128` values. Three overloads are generated: one taking
  two vectors, and two templates that first convert the non-vector operand with
  `f32_4(...)`.
  */
  #define DECLARE_F32_4_OPERATOR_INFIX(op, func) \
      inline f32_4 op(const f32_4 &lhs, const f32_4 &rhs) { \
          return f32_4(func(lhs.v, rhs.v)); \
      } \
      template <typename T> \
      f32_4 op(const T &lhs, const f32_4 &rhs) { \
          return op(f32_4(lhs), rhs); \
      } \
      template <typename T> \
      f32_4 op(const f32_4 &lhs, const T &rhs) { \
          return op(lhs, f32_4(rhs)); \
      }
  /** Defines the compound assignment `target op operand` (e.g. `a += b`).
  `op` is the full operator name (e.g. `operator+=`) and `opfunc` is the binary
  function whose result is assigned back to `target`. A template overload
  converts a non-vector operand with `f32_4(...)` first.
  */
  #define DECLARE_F32_4_OPERATOR_INCREMENT(op, opfunc) \
      inline f32_4 &op(f32_4 &target, const f32_4 &operand) { \
          return target = opfunc(target, operand); \
      } \
      template <typename T> \
      f32_4 &op(f32_4 &target, const T &operand) { \
          return op(target, f32_4(operand)); \
      }
  69. DECLARE_F32_4_OPERATOR_INFIX(operator+, _mm_add_ps)
  70. DECLARE_F32_4_OPERATOR_INFIX(operator-, _mm_sub_ps)
  71. DECLARE_F32_4_OPERATOR_INFIX(operator*, _mm_mul_ps)
  72. DECLARE_F32_4_OPERATOR_INFIX(operator/, _mm_div_ps)
  73. /** Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element.
  74. Use these to apply logic, bit masks, and conditions to elements.
  75. Examples:
  76. Subtract 1 from value if greater than or equal to 1.
  77. x -= (x >= 1.f) & 1.f;
  78. */
  79. DECLARE_F32_4_OPERATOR_INFIX(operator^, _mm_xor_ps)
  80. DECLARE_F32_4_OPERATOR_INFIX(operator&, _mm_and_ps)
  81. DECLARE_F32_4_OPERATOR_INFIX(operator|, _mm_mul_ps)
  82. DECLARE_F32_4_OPERATOR_INCREMENT(operator+=, operator+);
  83. DECLARE_F32_4_OPERATOR_INCREMENT(operator-=, operator-);
  84. DECLARE_F32_4_OPERATOR_INCREMENT(operator*=, operator*);
  85. DECLARE_F32_4_OPERATOR_INCREMENT(operator/=, operator/);
  86. DECLARE_F32_4_OPERATOR_INCREMENT(operator^=, operator^);
  87. DECLARE_F32_4_OPERATOR_INCREMENT(operator&=, operator&);
  88. DECLARE_F32_4_OPERATOR_INCREMENT(operator|=, operator|);
  89. /** `+a` */
  90. inline f32_4 operator+(const f32_4 &a) {
  91. return a;
  92. }
  93. /** `-a` */
  94. inline f32_4 operator-(const f32_4 &a) {
  95. return 0.f - a;
  96. }
  97. /** `++a` */
  98. inline f32_4 &operator++(f32_4 &a) {
  99. a += 1.f;
  100. return a;
  101. }
  102. /** `--a` */
  103. inline f32_4 &operator--(f32_4 &a) {
  104. a -= 1.f;
  105. return a;
  106. }
  107. /** `a++` */
  108. inline f32_4 operator++(f32_4 &a, int) {
  109. f32_4 b = a;
  110. ++a;
  111. return b;
  112. }
  113. /** `a--` */
  114. inline f32_4 operator--(f32_4 &a, int) {
  115. f32_4 b = a;
  116. --a;
  117. return b;
  118. }
  119. /** `~a` */
  120. inline f32_4 operator~(const f32_4 &a) {
  121. return f32_4(_mm_xor_ps(a.v, _mm_cmpeq_ps(a.v, a.v)));
  122. }
  123. DECLARE_F32_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps)
  124. DECLARE_F32_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps)
  125. DECLARE_F32_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps)
  126. DECLARE_F32_4_OPERATOR_INFIX(operator<=, _mm_cmple_ps)
  127. DECLARE_F32_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps)
  128. DECLARE_F32_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps)
  129. // Math functions
  130. inline f32_4 fmax(f32_4 x, f32_4 b) {
  131. return f32_4(_mm_max_ps(x.v, b.v));
  132. }
  133. inline f32_4 fmin(f32_4 x, f32_4 b) {
  134. return f32_4(_mm_min_ps(x.v, b.v));
  135. }
  136. inline f32_4 sqrt(f32_4 x) {
  137. return f32_4(_mm_sqrt_ps(x.v));
  138. }
  139. /** Returns the approximate reciprocal square root.
  140. Much faster than `1/sqrt(x)`.
  141. */
  142. inline f32_4 rsqrt(f32_4 x) {
  143. return f32_4(_mm_rsqrt_ps(x.v));
  144. }
  145. /** Returns the approximate reciprocal.
  146. Much faster than `1/x`.
  147. */
  148. inline f32_4 rcp(f32_4 x) {
  149. return f32_4(_mm_rcp_ps(x.v));
  150. }
  151. inline f32_4 log(f32_4 x) {
  152. return f32_4(sse_mathfun_log_ps(x.v));
  153. }
  154. inline f32_4 exp(f32_4 x) {
  155. return f32_4(sse_mathfun_exp_ps(x.v));
  156. }
  157. inline f32_4 sin(f32_4 x) {
  158. return f32_4(sse_mathfun_sin_ps(x.v));
  159. }
  160. inline f32_4 cos(f32_4 x) {
  161. return f32_4(sse_mathfun_cos_ps(x.v));
  162. }
  163. inline f32_4 floor(f32_4 a) {
  164. return f32_4(sse_mathfun_floor_ps(a.v));
  165. }
  166. inline f32_4 ceil(f32_4 a) {
  167. return f32_4(sse_mathfun_ceil_ps(a.v));
  168. }
  169. inline f32_4 round(f32_4 a) {
  170. return f32_4(sse_mathfun_round_ps(a.v));
  171. }
  172. inline f32_4 fmod(f32_4 a, f32_4 b) {
  173. return f32_4(sse_mathfun_fmod_ps(a.v, b.v));
  174. }
  175. inline f32_4 fabs(f32_4 a) {
  176. return f32_4(sse_mathfun_fabs_ps(a.v));
  177. }
  178. inline f32_4 trunc(f32_4 a) {
  179. return f32_4(sse_mathfun_trunc_ps(a.v));
  180. }
  181. } // namespace dsp
  182. } // namespace rack