You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

201 lines
4.0KB

  1. #include "sse_mathfun.h"
  2. #include <emmintrin.h>
  3. namespace rack {
  4. namespace dsp {
  template <int N>
  struct f32;

  /** Wrapper for `__m128` representing a vector of 4 single-precision float values.

  Constructors are spelled with the injected class name `f32` rather than the
  template-id `f32<4>`, because C++20 no longer accepts a template-id as a
  constructor name.
  */
  template <>
  struct f32<4> {
    /** The wrapped SSE register value. */
    __m128 v;

    /** Leaves `v` uninitialized. */
    f32() {}
    /** Wraps an existing `__m128` value. */
    f32(__m128 v) : v(v) {}
    /** Broadcasts `x` to all 4 lanes. */
    f32(float x) : v(_mm_set_ps1(x)) {}

    /** Reads an array of 4 values.
    Uses an unaligned load, so `x` does not need to be 16-byte aligned.
    */
    static f32 load(const float *x) {
      return f32(_mm_loadu_ps(x));
    }

    /** Writes an array of 4 values.
    Uses an unaligned store, so `x` does not need to be 16-byte aligned.
    Declared `const` because the vector itself is not modified.
    */
    void store(float *x) const {
      _mm_storeu_ps(x, v);
    }
  };

  typedef f32<4> f32_4;
  26. // Operator overloads
  /** Defines the binary operator `lhs OP rhs` for `f32_4`.

  `OP` is the full operator name (e.g. `operator+`) and `FUNC` is the intrinsic
  applied to the two wrapped `__m128` values.
  Three overloads are generated: vector/vector, and two templates that first
  convert the non-vector operand to `f32_4` and then call the vector/vector form.
  */
  #define DECLARE_F32_4_OPERATOR_INFIX(OP, FUNC) \
    inline f32_4 OP(const f32_4 &lhs, const f32_4 &rhs) { \
      return f32_4(FUNC(lhs.v, rhs.v)); \
    } \
    template <typename T> \
    f32_4 OP(const T &lhs, const f32_4 &rhs) { \
      const f32_4 widened(lhs); \
      return OP(widened, rhs); \
    } \
    template <typename T> \
    f32_4 OP(const f32_4 &lhs, const T &rhs) { \
      const f32_4 widened(rhs); \
      return OP(lhs, widened); \
    }
  /** Defines the compound-assignment operator `target OP operand` for `f32_4`.

  `OP` is the full operator name (e.g. `operator+=`) and `FUNC` is the intrinsic
  whose result is written back into `target`.
  Two overloads are generated: vector operand, and a template that first
  converts the operand to `f32_4`. Both return a reference to `target`.
  */
  #define DECLARE_F32_4_OPERATOR_INCREMENT(OP, FUNC) \
    inline f32_4 &OP(f32_4 &target, const f32_4 &operand) { \
      target.v = FUNC(target.v, operand.v); \
      return target; \
    } \
    template <typename T> \
    f32_4 &OP(f32_4 &target, const T &operand) { \
      const f32_4 widened(operand); \
      return OP(target, widened); \
    }
  50. DECLARE_F32_4_OPERATOR_INFIX(operator+, _mm_add_ps)
  51. DECLARE_F32_4_OPERATOR_INFIX(operator-, _mm_sub_ps)
  52. DECLARE_F32_4_OPERATOR_INFIX(operator*, _mm_mul_ps)
  53. DECLARE_F32_4_OPERATOR_INFIX(operator/, _mm_div_ps)
  54. /** `+a` */
  55. inline f32_4 operator+(const f32_4 &a) {
  56. return a;
  57. }
  58. /** `-a` */
  59. inline f32_4 operator-(const f32_4 &a) {
  60. return 0.f - a;
  61. }
  62. DECLARE_F32_4_OPERATOR_INCREMENT(operator+=, _mm_add_ps);
  63. DECLARE_F32_4_OPERATOR_INCREMENT(operator-=, _mm_sub_ps);
  64. DECLARE_F32_4_OPERATOR_INCREMENT(operator*=, _mm_mul_ps);
  65. DECLARE_F32_4_OPERATOR_INCREMENT(operator/=, _mm_div_ps);
  66. /** `++a` */
  67. inline f32_4 &operator++(f32_4 &a) {
  68. a += 1.f;
  69. return a;
  70. }
  71. /** `--a` */
  72. inline f32_4 &operator--(f32_4 &a) {
  73. a -= 1.f;
  74. return a;
  75. }
  76. /** `a++` */
  77. inline f32_4 operator++(f32_4 &a, int) {
  78. f32_4 b = a;
  79. ++a;
  80. return b;
  81. }
  82. /** `a--` */
  83. inline f32_4 operator--(f32_4 &a, int) {
  84. f32_4 b = a;
  85. --a;
  86. return b;
  87. }
  88. DECLARE_F32_4_OPERATOR_INFIX(operator^, _mm_xor_ps)
  89. DECLARE_F32_4_OPERATOR_INFIX(operator&, _mm_and_ps)
  90. DECLARE_F32_4_OPERATOR_INFIX(operator|, _mm_mul_ps)
  91. /** `~a` */
  92. inline f32_4 operator~(const f32_4 &a) {
  93. return f32_4(_mm_xor_ps(a.v, _mm_cmpeq_ps(a.v, a.v)));
  94. }
  95. DECLARE_F32_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps)
  96. DECLARE_F32_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps)
  97. DECLARE_F32_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps)
  98. DECLARE_F32_4_OPERATOR_INFIX(operator<=, _mm_cmple_ps)
  99. DECLARE_F32_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps)
  100. DECLARE_F32_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps)
  101. // Math functions
  102. inline f32_4 fmax(f32_4 x, f32_4 b) {
  103. return f32_4(_mm_max_ps(x.v, b.v));
  104. }
  105. inline f32_4 fmin(f32_4 x, f32_4 b) {
  106. return f32_4(_mm_min_ps(x.v, b.v));
  107. }
  108. inline f32_4 sqrt(f32_4 x) {
  109. return f32_4(_mm_sqrt_ps(x.v));
  110. }
  111. /** Returns the approximate reciprocal square root.
  112. Much faster than `1/sqrt(x)`.
  113. */
  114. inline f32_4 rsqrt(f32_4 x) {
  115. return f32_4(_mm_rsqrt_ps(x.v));
  116. }
  117. /** Returns the approximate reciprocal.
  118. Much faster than `1/x`.
  119. */
  120. inline f32_4 rcp(f32_4 x) {
  121. return f32_4(_mm_rcp_ps(x.v));
  122. }
  123. inline f32_4 log(f32_4 x) {
  124. return f32_4(sse_mathfun_log_ps(x.v));
  125. }
  126. inline f32_4 exp(f32_4 x) {
  127. return f32_4(sse_mathfun_exp_ps(x.v));
  128. }
  129. inline f32_4 sin(f32_4 x) {
  130. return f32_4(sse_mathfun_sin_ps(x.v));
  131. }
  132. inline f32_4 cos(f32_4 x) {
  133. return f32_4(sse_mathfun_cos_ps(x.v));
  134. }
  135. inline f32_4 floor(f32_4 a) {
  136. return f32_4(sse_mathfun_floor_ps(a.v));
  137. }
  138. inline f32_4 ceil(f32_4 a) {
  139. return f32_4(sse_mathfun_ceil_ps(a.v));
  140. }
  141. inline f32_4 round(f32_4 a) {
  142. return f32_4(sse_mathfun_round_ps(a.v));
  143. }
  144. inline f32_4 fmod(f32_4 a, f32_4 b) {
  145. return f32_4(sse_mathfun_fmod_ps(a.v, b.v));
  146. }
  147. inline f32_4 fabs(f32_4 a) {
  148. return f32_4(sse_mathfun_fabs_ps(a.v));
  149. }
  150. inline f32_4 trunc(f32_4 a) {
  151. return f32_4(sse_mathfun_trunc_ps(a.v));
  152. }
  153. } // namespace dsp
  154. } // namespace rack