You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

109 lines
2.5KB

  1. #include <x86intrin.h>
  2. namespace rack {
  3. namespace simd {
  4. template <int N>
  5. struct f32;
  6. /** Wrapper for `__m128` representing a vector of 4 single-precision float values. */
  7. template <>
  8. struct f32<4> {
  9. __m128 v;
  10. f32<4>() {}
  11. f32<4>(__m128 v) : v(v) {}
  12. f32<4>(float x) {
  13. v = _mm_set_ps1(x);
  14. }
  15. /** Reads an array of 4 values. */
  16. f32<4>(const float *x) {
  17. v = _mm_loadu_ps(x);
  18. }
  19. /** Writes an array of 4 values. */
  20. void store(float *x) {
  21. _mm_storeu_ps(x, v);
  22. }
  23. };
  24. typedef f32<4> f32_4;
  25. // Operator overloads
  26. #define DECLARE_F32_4_OPERATOR_INFIX(operator, func) \
  27. inline f32_4 operator(const f32_4 &a, const f32_4 &b) { \
  28. return f32_4(func(a.v, b.v)); \
  29. } \
  30. template <typename T> \
  31. f32_4 operator(const T &a, const f32_4 &b) { \
  32. return operator(f32_4(a), b); \
  33. } \
  34. template <typename T> \
  35. f32_4 operator(const f32_4 &a, const T &b) { \
  36. return operator(a, f32_4(b)); \
  37. }
  38. #define DECLARE_F32_4_OPERATOR_INCREMENT(operator, func) \
  39. inline f32_4 &operator(f32_4 &a, const f32_4 &b) { \
  40. a.v = func(a.v, b.v); \
  41. return a; \
  42. } \
  43. template <typename T> \
  44. f32_4 &operator(f32_4 &a, const T &b) { \
  45. return operator(a, f32_4(b)); \
  46. }
  47. DECLARE_F32_4_OPERATOR_INFIX(operator+, _mm_add_ps)
  48. DECLARE_F32_4_OPERATOR_INFIX(operator-, _mm_sub_ps)
  49. DECLARE_F32_4_OPERATOR_INFIX(operator*, _mm_mul_ps)
  50. DECLARE_F32_4_OPERATOR_INFIX(operator/, _mm_div_ps)
  51. DECLARE_F32_4_OPERATOR_INCREMENT(operator+=, _mm_add_ps);
  52. DECLARE_F32_4_OPERATOR_INCREMENT(operator-=, _mm_sub_ps);
  53. DECLARE_F32_4_OPERATOR_INCREMENT(operator*=, _mm_mul_ps);
  54. DECLARE_F32_4_OPERATOR_INCREMENT(operator/=, _mm_div_ps);
  55. // TODO Perhaps return a future i32 type for these, or add casting between multiple simd types
  56. DECLARE_F32_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps)
  57. DECLARE_F32_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps)
  58. DECLARE_F32_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps)
  59. DECLARE_F32_4_OPERATOR_INFIX(operator<=, _mm_cmple_ps)
  60. DECLARE_F32_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps)
  61. DECLARE_F32_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps)
  62. inline f32_4 fmax(f32_4 x, f32_4 b) {
  63. return f32_4(_mm_max_ps(x.v, b.v));
  64. }
  65. inline f32_4 fmin(f32_4 x, f32_4 b) {
  66. return f32_4(_mm_min_ps(x.v, b.v));
  67. }
  68. inline f32_4 sqrt(f32_4 x) {
  69. return f32_4(_mm_sqrt_ps(x.v));
  70. }
  71. /** Returns the approximate reciprocal square root.
  72. Much faster than `1/sqrt(x)`.
  73. */
  74. inline f32_4 rsqrt(f32_4 x) {
  75. return f32_4(_mm_rsqrt_ps(x.v));
  76. }
  77. /** Returns the approximate reciprocal.
  78. Much faster than `1/x`.
  79. */
  80. inline f32_4 rcp(f32_4 x) {
  81. return f32_4(_mm_rcp_ps(x.v));
  82. }
  83. } // namespace simd
  84. } // namespace rack