You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

193 lines
4.2KB

  1. #pragma once
  2. #include <cstring>
  3. #include <x86intrin.h>
  4. #include <type_traits>
  5. namespace rack {
  6. /** Abstraction of byte-aligned values for SIMD CPU acceleration. */
  7. namespace simd {
  /** Reinterprets the object representation of `x` as a value of type TO.
  No value conversion takes place; the bytes of `x` are copied verbatim.
  API modeled on C++20 `std::bit_cast`.
  FROM and TO must have the same size and must both be trivially copyable, because the copy is performed with `std::memcpy`.
  TO must also be default-constructible, since a TO object is created before the bytes are copied into it.
  Usage example:
    printf("%08x\n", bit_cast<int>(1.f)); // Prints 3f800000
  */
  template <typename TO, typename FROM>
  TO bit_cast(const FROM &x) {
    static_assert(sizeof(FROM) == sizeof(TO), "types must have equal size");
    // memcpy between objects is only well-defined for trivially copyable types.
    static_assert(std::is_trivially_copyable<FROM>::value, "source type must be trivially copyable");
    static_assert(std::is_trivially_copyable<TO>::value, "destination type must be trivially copyable");
    // Should be optimized to two `mov` instructions
    TO y;
    std::memcpy(&y, &x, sizeof(TO));
    return y;
  }
  /** Generic class for vector float types.
  This class is designed to be used just like `float` scalars, with extra features for handling bitwise logic, conditions, loading, and storing.
  Usage example:
    float in[4] = {0.1f, 0.2f, 0.3f, 0.4f};
    float out[4];
    f32_4 a = f32_4::load(in);
    f32_4 b = 2.f * a / (1.f - a);
    b *= a;
    b.store(out);
  */
  template <int N>
  struct f32;

  /** Wrapper for `__m128` holding a vector of 4 single-precision float values.
  Element 0 is the value that `load()` reads from `x[0]` and `store()` writes to `x[0]`.
  */
  template <>
  struct f32<4> {
    /** The underlying native SSE value. */
    __m128 v;

    /** Constructs an uninitialized vector. */
    f32() {}

    /** Constructs a vector from a native `__m128` type. */
    f32(__m128 v) : v(v) {}

    /** Constructs a vector with all elements set to `x`.
    Intentionally implicit so that scalars can be mixed with vectors in expressions.
    */
    f32(float x) : v(_mm_set_ps1(x)) {}

    /** Constructs a vector from four values.
    `x1` becomes element 0 and `x4` becomes element 3, i.e. the same order that `store()` writes them.
    `_mm_setr_ps` is used because `_mm_set_ps` takes its arguments from the highest element down.
    */
    f32(float x1, float x2, float x3, float x4) : v(_mm_setr_ps(x1, x2, x3, x4)) {}

    /** Reads an array of 4 values.
    Uses the unaligned load intrinsic, so `x` does not need 16-byte alignment.
    */
    static f32 load(const float *x) {
      return f32(_mm_loadu_ps(x));
    }

    /** Returns a vector initialized to zero. */
    static f32 zero() {
      return f32(_mm_setzero_ps());
    }

    /** Writes an array of 4 values.
    Uses the unaligned store intrinsic, so `x` does not need 16-byte alignment.
    */
    void store(float *x) const {
      _mm_storeu_ps(x, v);
    }
  };

  typedef f32<4> f32_4;
  // Operator overloads

  /** Defines the binary operator `a @ b` for `f32_4`.
  `op_name` is the full operator name (e.g. `operator+`).
  `intrinsic` is called with the two underlying `__m128` values and its result is wrapped in a new `f32_4`.
  */
  #define DECLARE_F32_4_OPERATOR_INFIX(op_name, intrinsic) \
    inline f32_4 op_name(const f32_4 &lhs, const f32_4 &rhs) { \
      return f32_4(intrinsic(lhs.v, rhs.v)); \
    }
  /** Defines the compound assignment operator `a @= b` for `f32_4`.
  `op_name` is the full compound operator name (e.g. `operator+=`).
  `binary_op` is called with both operands (e.g. `operator+`); its result is assigned to the left operand, which is then returned by reference.
  */
  #define DECLARE_F32_4_OPERATOR_INCREMENT(op_name, binary_op) \
    inline f32_4 &op_name(f32_4 &lhs, const f32_4 &rhs) { \
      return lhs = binary_op(lhs, rhs); \
    }
  75. DECLARE_F32_4_OPERATOR_INFIX(operator+, _mm_add_ps)
  76. DECLARE_F32_4_OPERATOR_INFIX(operator-, _mm_sub_ps)
  77. DECLARE_F32_4_OPERATOR_INFIX(operator*, _mm_mul_ps)
  78. DECLARE_F32_4_OPERATOR_INFIX(operator/, _mm_div_ps)
  79. /* Use these to apply logic, bit masks, and conditions to elements.
  80. Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element.
  81. Examples:
  82. Subtract 1 from value if greater than or equal to 1.
  83. x -= (x >= 1.f) & 1.f;
  84. */
  85. DECLARE_F32_4_OPERATOR_INFIX(operator^, _mm_xor_ps)
  86. DECLARE_F32_4_OPERATOR_INFIX(operator&, _mm_and_ps)
  87. DECLARE_F32_4_OPERATOR_INFIX(operator|, _mm_or_ps)
  88. DECLARE_F32_4_OPERATOR_INCREMENT(operator+=, operator+);
  89. DECLARE_F32_4_OPERATOR_INCREMENT(operator-=, operator-);
  90. DECLARE_F32_4_OPERATOR_INCREMENT(operator*=, operator*);
  91. DECLARE_F32_4_OPERATOR_INCREMENT(operator/=, operator/);
  92. DECLARE_F32_4_OPERATOR_INCREMENT(operator^=, operator^);
  93. DECLARE_F32_4_OPERATOR_INCREMENT(operator&=, operator&);
  94. DECLARE_F32_4_OPERATOR_INCREMENT(operator|=, operator|);
  95. DECLARE_F32_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps)
  96. DECLARE_F32_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps)
  97. DECLARE_F32_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps)
  98. DECLARE_F32_4_OPERATOR_INFIX(operator<=, _mm_cmple_ps)
  99. DECLARE_F32_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps)
  100. DECLARE_F32_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps)
  101. /** `+a` */
  102. inline f32_4 operator+(const f32_4 &a) {
  103. return a;
  104. }
  105. /** `-a` */
  106. inline f32_4 operator-(const f32_4 &a) {
  107. return 0.f - a;
  108. }
  109. /** `++a` */
  110. inline f32_4 &operator++(f32_4 &a) {
  111. a += 1.f;
  112. return a;
  113. }
  114. /** `--a` */
  115. inline f32_4 &operator--(f32_4 &a) {
  116. a -= 1.f;
  117. return a;
  118. }
  119. /** `a++` */
  120. inline f32_4 operator++(f32_4 &a, int) {
  121. f32_4 b = a;
  122. ++a;
  123. return b;
  124. }
  125. /** `a--` */
  126. inline f32_4 operator--(f32_4 &a, int) {
  127. f32_4 b = a;
  128. --a;
  129. return b;
  130. }
  131. /** `~a` */
  132. inline f32_4 operator~(const f32_4 &a) {
  133. f32_4 mask = f32_4::zero();
  134. mask = (mask == mask);
  135. return a ^ mask;
  136. }
  137. // helpful idioms
  138. /** `~a & b` */
  139. inline f32_4 andnot(const f32_4 &a, const f32_4 &b) {
  140. return f32_4(_mm_andnot_ps(a.v, b.v));
  141. }
  142. } // namespace simd
  143. } // namespace rack