From 1b168ce3a331e6d2e4b0a682b348056c2e5a4fad Mon Sep 17 00:00:00 2001 From: Andrew Belt Date: Mon, 20 May 2019 00:58:40 -0400 Subject: [PATCH] Rename f32_4 to float4 --- include/simd/functions.hpp | 85 +++++++++++++------------- include/simd/vector.hpp | 119 +++++++++++++++++++------------------ 2 files changed, 103 insertions(+), 101 deletions(-) diff --git a/include/simd/functions.hpp b/include/simd/functions.hpp index 0ac446ef..9707c393 100644 --- a/include/simd/functions.hpp +++ b/include/simd/functions.hpp @@ -24,101 +24,101 @@ Example: using std::fmax; -inline f32_4 fmax(f32_4 x, f32_4 b) { - return f32_4(_mm_max_ps(x.v, b.v)); +inline float4 fmax(float4 x, float4 b) { + return float4(_mm_max_ps(x.v, b.v)); } using std::fmin; -inline f32_4 fmin(f32_4 x, f32_4 b) { - return f32_4(_mm_min_ps(x.v, b.v)); +inline float4 fmin(float4 x, float4 b) { + return float4(_mm_min_ps(x.v, b.v)); } using std::sqrt; -inline f32_4 sqrt(f32_4 x) { - return f32_4(_mm_sqrt_ps(x.v)); +inline float4 sqrt(float4 x) { + return float4(_mm_sqrt_ps(x.v)); } using std::log; -inline f32_4 log(f32_4 x) { - return f32_4(sse_mathfun_log_ps(x.v)); +inline float4 log(float4 x) { + return float4(sse_mathfun_log_ps(x.v)); } using std::log10; -inline f32_4 log10(f32_4 x) { - return f32_4(sse_mathfun_log_ps(x.v)) / std::log(10.f); +inline float4 log10(float4 x) { + return float4(sse_mathfun_log_ps(x.v)) / std::log(10.f); } using std::log2; -inline f32_4 log2(f32_4 x) { - return f32_4(sse_mathfun_log_ps(x.v)) / std::log(2.f); +inline float4 log2(float4 x) { + return float4(sse_mathfun_log_ps(x.v)) / std::log(2.f); } using std::exp; -inline f32_4 exp(f32_4 x) { - return f32_4(sse_mathfun_exp_ps(x.v)); +inline float4 exp(float4 x) { + return float4(sse_mathfun_exp_ps(x.v)); } using std::sin; -inline f32_4 sin(f32_4 x) { - return f32_4(sse_mathfun_sin_ps(x.v)); +inline float4 sin(float4 x) { + return float4(sse_mathfun_sin_ps(x.v)); } using std::cos; -inline f32_4 cos(f32_4 x) { - return f32_4(sse_mathfun_cos_ps(x.v)); +inline float4 cos(float4 x) { + return float4(sse_mathfun_cos_ps(x.v)); } using std::floor; -inline f32_4 floor(f32_4 a) { - return f32_4(sse_mathfun_floor_ps(a.v)); +inline float4 floor(float4 a) { + return float4(sse_mathfun_floor_ps(a.v)); } using std::ceil; -inline f32_4 ceil(f32_4 a) { - return f32_4(sse_mathfun_ceil_ps(a.v)); +inline float4 ceil(float4 a) { + return float4(sse_mathfun_ceil_ps(a.v)); } using std::round; -inline f32_4 round(f32_4 a) { - return f32_4(sse_mathfun_round_ps(a.v)); +inline float4 round(float4 a) { + return float4(sse_mathfun_round_ps(a.v)); } using std::fmod; -inline f32_4 fmod(f32_4 a, f32_4 b) { - return f32_4(sse_mathfun_fmod_ps(a.v, b.v)); +inline float4 fmod(float4 a, float4 b) { + return float4(sse_mathfun_fmod_ps(a.v, b.v)); } using std::fabs; -inline f32_4 fabs(f32_4 a) { - return f32_4(sse_mathfun_fabs_ps(a.v)); +inline float4 fabs(float4 a) { + return float4(sse_mathfun_fabs_ps(a.v)); } using std::trunc; -inline f32_4 trunc(f32_4 a) { - return f32_4(sse_mathfun_trunc_ps(a.v)); +inline float4 trunc(float4 a) { + return float4(sse_mathfun_trunc_ps(a.v)); } using std::pow; -inline f32_4 pow(f32_4 a, f32_4 b) { +inline float4 pow(float4 a, float4 b) { return exp(b * log(a)); } -inline f32_4 pow(float a, f32_4 b) { +inline float4 pow(float a, float4 b) { return exp(b * std::log(a)); } @@ -129,44 +129,43 @@ inline float ifelse(bool cond, float a, float b) { } /** Given a mask, returns a if mask is 0xffffffff per element, b if mask is 0x00000000 */ -inline f32_4 ifelse(f32_4 mask, f32_4 a, f32_4 b) { +inline float4 ifelse(float4 mask, float4 a, float4 b) { return (a & mask) | andnot(mask, b); } - /** Returns the approximate reciprocal square root. Much faster than `1/sqrt(x)`. */ -inline f32_4 rsqrt(f32_4 x) { - return f32_4(_mm_rsqrt_ps(x.v)); +inline float4 rsqrt(float4 x) { + return float4(_mm_rsqrt_ps(x.v)); } /** Returns the approximate reciprocal. Much faster than `1/x`. */ -inline f32_4 rcp(f32_4 x) { - return f32_4(_mm_rcp_ps(x.v)); +inline float4 rcp(float4 x) { + return float4(_mm_rcp_ps(x.v)); } // From math.hpp using math::clamp; -inline f32_4 clamp(f32_4 x, f32_4 a, f32_4 b) { +inline float4 clamp(float4 x, float4 a, float4 b) { return fmin(fmax(x, a), b); } using math::rescale; -inline f32_4 rescale(f32_4 x, f32_4 xMin, f32_4 xMax, f32_4 yMin, f32_4 yMax) { +inline float4 rescale(float4 x, float4 xMin, float4 xMax, float4 yMin, float4 yMax) { return yMin + (x - xMin) / (xMax - xMin) * (yMax - yMin); } using math::sgn; -inline f32_4 sgn(f32_4 x) { - f32_4 signbit = x & -0.f; - f32_4 nonzero = (x != 0.f); +inline float4 sgn(float4 x) { + float4 signbit = x & -0.f; + float4 nonzero = (x != 0.f); return signbit | (nonzero & 1.f); } diff --git a/include/simd/vector.hpp b/include/simd/vector.hpp index fea42387..67ee805d 100644 --- a/include/simd/vector.hpp +++ b/include/simd/vector.hpp @@ -28,52 +28,52 @@ TO bit_cast(const FROM &x) { } -/** Generic class for vector float types. +/** Generic class for vector types. -This class is designed to be used just like `float` scalars, with extra features for handling bitwise logic, conditions, loading, and storing. +This class is designed to be used just like you use scalars, with extra features for handling bitwise logic, conditions, loading, and storing. Usage example: float a[4], b[4]; - f32_4 a = f32_4::load(in); - f32_4 b = 2.f * a / (1 - a); + float4 a = float4::load(in); + float4 b = 2.f * a / (1 - a); b *= sin(2 * M_PI * a); b.store(out); */ -template -struct f32; +template +struct Vector; /** Wrapper for `__m128` representing an aligned vector of 4 single-precision float values. */ template <> -struct f32<4> { +struct Vector { __m128 v; /** Constructs an uninitialized vector. */ - f32<4>() {} + Vector() {} /** Constructs a vector from a native `__m128` type. */ - f32<4>(__m128 v) : v(v) {} + Vector(__m128 v) : v(v) {} /** Constructs a vector with all elements set to `x`. */ - f32<4>(float x) { + Vector(float x) { v = _mm_set_ps1(x); } /** Constructs a vector from four values. */ - f32<4>(float x1, float x2, float x3, float x4) { + Vector(float x1, float x2, float x3, float x4) { v = _mm_set_ps(x1, x2, x3, x4); } - /** Reads an array of 4 values. */ - static f32<4> load(const float *x) { - return f32<4>(_mm_loadu_ps(x)); + /** Returns a vector initialized to zero. */ + static Vector zero() { + return Vector(_mm_setzero_ps()); } - /** Returns a vector initialized to zero. */ - static f32<4> zero() { - return f32<4>(_mm_setzero_ps()); + /** Reads an array of 4 values. */ + static Vector load(const float *x) { + return Vector(_mm_loadu_ps(x)); } /** Writes an array of 4 values. */ @@ -83,29 +83,32 @@ struct f32<4> { }; -typedef f32<4> f32_4; +// Typedefs + + +typedef Vector float4; // Operator overloads /** `a @ b` */ -#define DECLARE_F32_4_OPERATOR_INFIX(operator, func) \ - inline f32_4 operator(const f32_4 &a, const f32_4 &b) { \ - return f32_4(func(a.v, b.v)); \ +#define DECLARE_FLOAT4_OPERATOR_INFIX(operator, func) \ + inline float4 operator(const float4 &a, const float4 &b) { \ + return float4(func(a.v, b.v)); \ } /** `a @= b` */ -#define DECLARE_F32_4_OPERATOR_INCREMENT(operator, opfunc) \ - inline f32_4 &operator(f32_4 &a, const f32_4 &b) { \ +#define DECLARE_FLOAT4_OPERATOR_INCREMENT(operator, opfunc) \ + inline float4 &operator(float4 &a, const float4 &b) { \ a = opfunc(a, b); \ return a; \ } -DECLARE_F32_4_OPERATOR_INFIX(operator+, _mm_add_ps) -DECLARE_F32_4_OPERATOR_INFIX(operator-, _mm_sub_ps) -DECLARE_F32_4_OPERATOR_INFIX(operator*, _mm_mul_ps) -DECLARE_F32_4_OPERATOR_INFIX(operator/, _mm_div_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator+, _mm_add_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator-, _mm_sub_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator*, _mm_mul_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator/, _mm_div_ps) /* Use these to apply logic, bit masks, and conditions to elements. Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element. @@ -116,75 +119,75 @@ Subtract 1 from value if greater than or equal to 1. x -= (x >= 1.f) & 1.f; */ -DECLARE_F32_4_OPERATOR_INFIX(operator^, _mm_xor_ps) -DECLARE_F32_4_OPERATOR_INFIX(operator&, _mm_and_ps) -DECLARE_F32_4_OPERATOR_INFIX(operator|, _mm_or_ps) - -DECLARE_F32_4_OPERATOR_INCREMENT(operator+=, operator+); -DECLARE_F32_4_OPERATOR_INCREMENT(operator-=, operator-); -DECLARE_F32_4_OPERATOR_INCREMENT(operator*=, operator*); -DECLARE_F32_4_OPERATOR_INCREMENT(operator/=, operator/); -DECLARE_F32_4_OPERATOR_INCREMENT(operator^=, operator^); -DECLARE_F32_4_OPERATOR_INCREMENT(operator&=, operator&); -DECLARE_F32_4_OPERATOR_INCREMENT(operator|=, operator|); - -DECLARE_F32_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps) -DECLARE_F32_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps) -DECLARE_F32_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps) -DECLARE_F32_4_OPERATOR_INFIX(operator<=, _mm_cmple_ps) -DECLARE_F32_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps) -DECLARE_F32_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator^, _mm_xor_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator&, _mm_and_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator|, _mm_or_ps) + +DECLARE_FLOAT4_OPERATOR_INCREMENT(operator+=, operator+); +DECLARE_FLOAT4_OPERATOR_INCREMENT(operator-=, operator-); +DECLARE_FLOAT4_OPERATOR_INCREMENT(operator*=, operator*); +DECLARE_FLOAT4_OPERATOR_INCREMENT(operator/=, operator/); +DECLARE_FLOAT4_OPERATOR_INCREMENT(operator^=, operator^); +DECLARE_FLOAT4_OPERATOR_INCREMENT(operator&=, operator&); +DECLARE_FLOAT4_OPERATOR_INCREMENT(operator|=, operator|); + +DECLARE_FLOAT4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator<=, _mm_cmple_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator<, _mm_cmplt_ps) +DECLARE_FLOAT4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps) /** `+a` */ -inline f32_4 operator+(const f32_4 &a) { +inline float4 operator+(const float4 &a) { return a; } /** `-a` */ -inline f32_4 operator-(const f32_4 &a) { +inline float4 operator-(const float4 &a) { return 0.f - a; } /** `++a` */ -inline f32_4 &operator++(f32_4 &a) { +inline float4 &operator++(float4 &a) { a += 1.f; return a; } /** `--a` */ -inline f32_4 &operator--(f32_4 &a) { +inline float4 &operator--(float4 &a) { a -= 1.f; return a; } /** `a++` */ -inline f32_4 operator++(f32_4 &a, int) { - f32_4 b = a; +inline float4 operator++(float4 &a, int) { + float4 b = a; ++a; return b; } /** `a--` */ -inline f32_4 operator--(f32_4 &a, int) { - f32_4 b = a; +inline float4 operator--(float4 &a, int) { + float4 b = a; --a; return b; } /** `~a` */ -inline f32_4 operator~(const f32_4 &a) { - f32_4 mask = f32_4::zero(); +inline float4 operator~(const float4 &a) { + float4 mask = float4::zero(); mask = (mask == mask); return a ^ mask; } -// helpful idioms +// Instructions not available as operators /** `~a & b` */ -inline f32_4 andnot(const f32_4 &a, const f32_4 &b) { - return f32_4(_mm_andnot_ps(a.v, b.v)); +inline float4 andnot(const float4 &a, const float4 &b) { + return float4(_mm_andnot_ps(a.v, b.v)); }