diff --git a/include/simd/vector.hpp b/include/simd/vector.hpp index 5be44311..1d98ec5c 100644 --- a/include/simd/vector.hpp +++ b/include/simd/vector.hpp @@ -1,7 +1,6 @@ #pragma once #include #include -#include namespace rack { @@ -11,23 +10,6 @@ namespace rack { namespace simd { -/** Casts the literal bits of FROM to TO without type conversion. -API copied from C++20. - -Usage example: - - printf("%08x\n", bit_cast(1.f)); // Prints 3f800000 -*/ -template -TO bit_cast(const FROM &x) { - static_assert(sizeof(FROM) == sizeof(TO), "types must have equal size"); - // Should be optimized to two `mov` instructions - TO y; - std::memcpy(&y, &x, sizeof(x)); - return y; -} - - /** Generic class for vector types. This class is designed to be used just like you use scalars, with extra features for handling bitwise logic, conditions, loading, and storing. @@ -63,7 +45,7 @@ struct Vector { /** Constructs a vector with all elements set to `x`. */ Vector(float x) { - v = _mm_set_ps1(x); + v = _mm_set1_ps(x); } /** Constructs a vector from four values. */ @@ -78,8 +60,7 @@ struct Vector { /** Returns a vector with all 1 bits. */ static Vector mask() { - __m128 zero = _mm_setzero_ps(); - return Vector(_mm_cmpeq_ps(zero, zero)); + return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); } /** Reads an array of 4 values. */ @@ -99,34 +80,76 @@ struct Vector { }; -// Typedefs +template <> +struct Vector { + union { + __m128i v; + int32_t s[4]; + }; + + Vector() {} + Vector(__m128i v) : v(v) {} + Vector(int32_t x) { + v = _mm_set1_epi32(x); + } + Vector(int32_t x1, int32_t x2, int32_t x3, int32_t x4) { + v = _mm_set_epi32(x1, x2, x3, x4); + } + static Vector zero() { + return Vector(_mm_setzero_si128()); + } + static Vector mask() { + return Vector(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + } + static Vector load(const int32_t *x) { + // HACK + // Use _mm_loadu_si128() because GCC doesn't support _mm_loadu_si32() + return Vector(_mm_loadu_si128((__m128i*) x)); + } + void store(int32_t *x) { + // HACK + // Use _mm_storeu_si128() because GCC doesn't support _mm_storeu_si32() + _mm_storeu_si128((__m128i*) x, v); + } +}; -typedef Vector float_4; -// typedef Vector double_2; -// typedef Vector int32_4; +// Instructions not available as operators + + +/** `~a & b` */ +inline Vector andnot(const Vector &a, const Vector &b) { + return Vector(_mm_andnot_ps(a.v, b.v)); +} // Operator overloads /** `a @ b` */ -#define DECLARE_FLOAT_4_OPERATOR_INFIX(operator, func) \ - inline float_4 operator(const float_4 &a, const float_4 &b) { \ - return float_4(func(a.v, b.v)); \ +#define DECLARE_VECTOR_OPERATOR_INFIX(t, s, operator, func) \ + inline Vector operator(const Vector &a, const Vector &b) { \ + return Vector(func(a.v, b.v)); \ } /** `a @= b` */ -#define DECLARE_FLOAT_4_OPERATOR_INCREMENT(operator, opfunc) \ - inline float_4 &operator(float_4 &a, const float_4 &b) { \ +#define DECLARE_VECTOR_OPERATOR_INCREMENT(t, s, operator, opfunc) \ + inline Vector &operator(Vector &a, const Vector &b) { \ a = opfunc(a, b); \ return a; \ } -DECLARE_FLOAT_4_OPERATOR_INFIX(operator+, _mm_add_ps) -DECLARE_FLOAT_4_OPERATOR_INFIX(operator-, _mm_sub_ps) -DECLARE_FLOAT_4_OPERATOR_INFIX(operator*, _mm_mul_ps) -DECLARE_FLOAT_4_OPERATOR_INFIX(operator/, _mm_div_ps) +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator+, _mm_add_ps) +DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator+, _mm_add_epi32) + +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator-, _mm_sub_ps) +DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator-, _mm_sub_epi32) + +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator*, _mm_mul_ps) +// DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator*, NOT AVAILABLE IN SSE3) + +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator/, _mm_div_ps) +// DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator/, NOT AVAILABLE IN SSE3) /* Use these to apply logic, bit masks, and conditions to elements. Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element. @@ -137,74 +160,134 @@ Subtract 1 from value if greater than or equal to 1. x -= (x >= 1.f) & 1.f; */ -DECLARE_FLOAT_4_OPERATOR_INFIX(operator^, _mm_xor_ps) -DECLARE_FLOAT_4_OPERATOR_INFIX(operator&, _mm_and_ps) -DECLARE_FLOAT_4_OPERATOR_INFIX(operator|, _mm_or_ps) - -DECLARE_FLOAT_4_OPERATOR_INCREMENT(operator+=, operator+); -DECLARE_FLOAT_4_OPERATOR_INCREMENT(operator-=, operator-); -DECLARE_FLOAT_4_OPERATOR_INCREMENT(operator*=, operator*); -DECLARE_FLOAT_4_OPERATOR_INCREMENT(operator/=, operator/); -DECLARE_FLOAT_4_OPERATOR_INCREMENT(operator^=, operator^); -DECLARE_FLOAT_4_OPERATOR_INCREMENT(operator&=, operator&); -DECLARE_FLOAT_4_OPERATOR_INCREMENT(operator|=, operator|); - -DECLARE_FLOAT_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps) -DECLARE_FLOAT_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps) -DECLARE_FLOAT_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps) -DECLARE_FLOAT_4_OPERATOR_INFIX(operator<=, _mm_cmple_ps) -DECLARE_FLOAT_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps) -DECLARE_FLOAT_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps) +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator^, _mm_xor_ps) +DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator^, _mm_xor_si128) + +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator&, _mm_and_ps) +DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator&, _mm_and_si128) + +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator|, _mm_or_ps) +DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator|, _mm_or_si128) + +DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator+=, operator+) +DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator+=, operator+) + +DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator-=, operator-) +DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator-=, operator-) + +DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator*=, operator*) +// DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator*=, NOT AVAILABLE IN SSE3) + +DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator/=, operator/) +// DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator/=, NOT AVAILABLE IN SSE3) + +DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator^=, operator^) +DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator^=, operator^) + +DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator&=, operator&) +DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator&=, operator&) + +DECLARE_VECTOR_OPERATOR_INCREMENT(float, 4, operator|=, operator|) +DECLARE_VECTOR_OPERATOR_INCREMENT(int32_t, 4, operator|=, operator|) + +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator==, _mm_cmpeq_ps) +DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator==, _mm_cmpeq_epi32) + +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator>=, _mm_cmpge_ps) +inline Vector operator>=(const Vector &a, const Vector &b) { + return Vector(_mm_cmpgt_epi32(a.v, b.v)) ^ Vector::mask(); +} + +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator>, _mm_cmpgt_ps) +DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator>, _mm_cmpgt_epi32) + +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator<=, _mm_cmple_ps) +inline Vector operator<=(const Vector &a, const Vector &b) { + return Vector(_mm_cmplt_epi32(a.v, b.v)) ^ Vector::mask(); +} + +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator<, _mm_cmplt_ps) +DECLARE_VECTOR_OPERATOR_INFIX(int32_t, 4, operator<, _mm_cmplt_epi32) + +DECLARE_VECTOR_OPERATOR_INFIX(float, 4, operator!=, _mm_cmpneq_ps) +inline Vector operator!=(const Vector &a, const Vector &b) { + return Vector(_mm_cmpeq_epi32(a.v, b.v)) ^ Vector::mask(); +} /** `+a` */ -inline float_4 operator+(const float_4 &a) { +inline Vector operator+(const Vector &a) { + return a; +} +inline Vector operator+(const Vector &a) { return a; } /** `-a` */ -inline float_4 operator-(const float_4 &a) { +inline Vector operator-(const Vector &a) { return 0.f - a; } +inline Vector operator-(const Vector &a) { + return 0 - a; +} /** `++a` */ -inline float_4 &operator++(float_4 &a) { +inline Vector &operator++(Vector &a) { a += 1.f; return a; } +inline Vector &operator++(Vector &a) { + a += 1; + return a; +} /** `--a` */ -inline float_4 &operator--(float_4 &a) { +inline Vector &operator--(Vector &a) { a -= 1.f; return a; } +inline Vector &operator--(Vector &a) { + a -= 1; + return a; +} /** `a++` */ -inline float_4 operator++(float_4 &a, int) { - float_4 b = a; +inline Vector operator++(Vector &a, int) { + Vector b = a; + ++a; + return b; +} +inline Vector operator++(Vector &a, int) { + Vector b = a; ++a; return b; } /** `a--` */ -inline float_4 operator--(float_4 &a, int) { - float_4 b = a; +inline Vector operator--(Vector &a, int) { + Vector b = a; + --a; + return b; +} +inline Vector operator--(Vector &a, int) { + Vector b = a; --a; return b; } /** `~a` */ -inline float_4 operator~(const float_4 &a) { - return a ^ float_4::mask(); +inline Vector operator~(const Vector &a) { + return a ^ Vector::mask(); +} +inline Vector operator~(const Vector &a) { + return a ^ Vector::mask(); } -// Instructions not available as operators +// Typedefs -/** `~a & b` */ -inline float_4 andnot(const float_4 &a, const float_4 &b) { - return float_4(_mm_andnot_ps(a.v, b.v)); -} +typedef Vector float_4; +typedef Vector int32_4; } // namespace simd