From 1b168ce3a331e6d2e4b0a682b348056c2e5a4fad Mon Sep 17 00:00:00 2001
From: Andrew Belt <andrewpbelt@gmail.com>
Date: Mon, 20 May 2019 00:58:40 -0400
Subject: [PATCH] Rename f32_4 to float4

---
 include/simd/functions.hpp |  85 +++++++++++++-------------
 include/simd/vector.hpp    | 119 +++++++++++++++++++------------------
 2 files changed, 103 insertions(+), 101 deletions(-)

diff --git a/include/simd/functions.hpp b/include/simd/functions.hpp
index 0ac446ef..9707c393 100644
--- a/include/simd/functions.hpp
+++ b/include/simd/functions.hpp
@@ -24,101 +24,101 @@ Example:
 
 using std::fmax;
 
-inline f32_4 fmax(f32_4 x, f32_4 b) {
-	return f32_4(_mm_max_ps(x.v, b.v));
+inline float4 fmax(float4 x, float4 b) {
+	return float4(_mm_max_ps(x.v, b.v));
 }
 
 using std::fmin;
 
-inline f32_4 fmin(f32_4 x, f32_4 b) {
-	return f32_4(_mm_min_ps(x.v, b.v));
+inline float4 fmin(float4 x, float4 b) {
+	return float4(_mm_min_ps(x.v, b.v));
 }
 
 using std::sqrt;
 
-inline f32_4 sqrt(f32_4 x) {
-	return f32_4(_mm_sqrt_ps(x.v));
+inline float4 sqrt(float4 x) {
+	return float4(_mm_sqrt_ps(x.v));
 }
 
 using std::log;
 
-inline f32_4 log(f32_4 x) {
-	return f32_4(sse_mathfun_log_ps(x.v));
+inline float4 log(float4 x) {
+	return float4(sse_mathfun_log_ps(x.v));
 }
 
 using std::log10;
 
-inline f32_4 log10(f32_4 x) {
-	return f32_4(sse_mathfun_log_ps(x.v)) / std::log(10.f);
+inline float4 log10(float4 x) {
+	return float4(sse_mathfun_log_ps(x.v)) / std::log(10.f);
 }
 
 using std::log2;
 
-inline f32_4 log2(f32_4 x) {
-	return f32_4(sse_mathfun_log_ps(x.v)) / std::log(2.f);
+inline float4 log2(float4 x) {
+	return float4(sse_mathfun_log_ps(x.v)) / std::log(2.f);
 }
 
 using std::exp;
 
-inline f32_4 exp(f32_4 x) {
-	return f32_4(sse_mathfun_exp_ps(x.v));
+inline float4 exp(float4 x) {
+	return float4(sse_mathfun_exp_ps(x.v));
 }
 
 using std::sin;
 
-inline f32_4 sin(f32_4 x) {
-	return f32_4(sse_mathfun_sin_ps(x.v));
+inline float4 sin(float4 x) {
+	return float4(sse_mathfun_sin_ps(x.v));
 }
 
 using std::cos;
 
-inline f32_4 cos(f32_4 x) {
-	return f32_4(sse_mathfun_cos_ps(x.v));
+inline float4 cos(float4 x) {
+	return float4(sse_mathfun_cos_ps(x.v));
 }
 
 using std::floor;
 
-inline f32_4 floor(f32_4 a) {
-	return f32_4(sse_mathfun_floor_ps(a.v));
+inline float4 floor(float4 a) {
+	return float4(sse_mathfun_floor_ps(a.v));
 }
 
 using std::ceil;
 
-inline f32_4 ceil(f32_4 a) {
-	return f32_4(sse_mathfun_ceil_ps(a.v));
+inline float4 ceil(float4 a) {
+	return float4(sse_mathfun_ceil_ps(a.v));
 }
 
 using std::round;
 
-inline f32_4 round(f32_4 a) {
-	return f32_4(sse_mathfun_round_ps(a.v));
+inline float4 round(float4 a) {
+	return float4(sse_mathfun_round_ps(a.v));
 }
 
 using std::fmod;
 
-inline f32_4 fmod(f32_4 a, f32_4 b) {
-	return f32_4(sse_mathfun_fmod_ps(a.v, b.v));
+inline float4 fmod(float4 a, float4 b) {
+	return float4(sse_mathfun_fmod_ps(a.v, b.v));
 }
 
 using std::fabs;
 
-inline f32_4 fabs(f32_4 a) {
-	return f32_4(sse_mathfun_fabs_ps(a.v));
+inline float4 fabs(float4 a) {
+	return float4(sse_mathfun_fabs_ps(a.v));
 }
 
 using std::trunc;
 
-inline f32_4 trunc(f32_4 a) {
-	return f32_4(sse_mathfun_trunc_ps(a.v));
+inline float4 trunc(float4 a) {
+	return float4(sse_mathfun_trunc_ps(a.v));
 }
 
 using std::pow;
 
-inline f32_4 pow(f32_4 a, f32_4 b) {
+inline float4 pow(float4 a, float4 b) {
 	return exp(b * log(a));
 }
 
-inline f32_4 pow(float a, f32_4 b) {
+inline float4 pow(float a, float4 b) {
 	return exp(b * std::log(a));
 }
 
@@ -129,44 +129,43 @@ inline float ifelse(bool cond, float a, float b) {
 }
 
 /** Given a mask, returns a if mask is 0xffffffff per element, b if mask is 0x00000000 */
-inline f32_4 ifelse(f32_4 mask, f32_4 a, f32_4 b) {
+inline float4 ifelse(float4 mask, float4 a, float4 b) {
 	return (a & mask) | andnot(mask, b);
 }
 
-
 /** Returns the approximate reciprocal square root.
 Much faster than `1/sqrt(x)`.
 */
-inline f32_4 rsqrt(f32_4 x) {
-	return f32_4(_mm_rsqrt_ps(x.v));
+inline float4 rsqrt(float4 x) {
+	return float4(_mm_rsqrt_ps(x.v));
 }
 
 /** Returns the approximate reciprocal.
 Much faster than `1/x`.
 */
-inline f32_4 rcp(f32_4 x) {
-	return f32_4(_mm_rcp_ps(x.v));
+inline float4 rcp(float4 x) {
+	return float4(_mm_rcp_ps(x.v));
 }
 
 // From math.hpp
 
 using math::clamp;
 
-inline f32_4 clamp(f32_4 x, f32_4 a, f32_4 b) {
+inline float4 clamp(float4 x, float4 a, float4 b) {
 	return fmin(fmax(x, a), b);
 }
 
 using math::rescale;
 
-inline f32_4 rescale(f32_4 x, f32_4 xMin, f32_4 xMax, f32_4 yMin, f32_4 yMax) {
+inline float4 rescale(float4 x, float4 xMin, float4 xMax, float4 yMin, float4 yMax) {
 	return yMin + (x - xMin) / (xMax - xMin) * (yMax - yMin);
 }
 
 using math::sgn;
 
-inline f32_4 sgn(f32_4 x) {
-	f32_4 signbit = x & -0.f;
-	f32_4 nonzero = (x != 0.f);
+inline float4 sgn(float4 x) {
+	float4 signbit = x & -0.f;
+	float4 nonzero = (x != 0.f);
 	return signbit | (nonzero & 1.f);
 }
 
diff --git a/include/simd/vector.hpp b/include/simd/vector.hpp
index fea42387..67ee805d 100644
--- a/include/simd/vector.hpp
+++ b/include/simd/vector.hpp
@@ -28,52 +28,52 @@ TO bit_cast(const FROM &x) {
 }
 
 
-/** Generic class for vector float types.
+/** Generic class for vector types.
 
-This class is designed to be used just like `float` scalars, with extra features for handling bitwise logic, conditions, loading, and storing.
+This class is designed to be used just like a scalar, with extra features for handling bitwise logic, conditions, loading, and storing.
 
 Usage example:
 
 	float a[4], b[4];
-	f32_4 a = f32_4::load(in);
-	f32_4 b = 2.f * a / (1 - a);
+	float4 a = float4::load(in);
+	float4 b = 2.f * a / (1 - a);
 	b *= sin(2 * M_PI * a);
 	b.store(out);
 */
-template <int N>
-struct f32;
+template <typename T, int N>
+struct Vector;
 
 
 /** Wrapper for `__m128` representing an aligned vector of 4 single-precision float values.
 */
 template <>
-struct f32<4> {
+struct Vector<float, 4> {
 	__m128 v;
 
 	/** Constructs an uninitialized vector. */
-	f32<4>() {}
+	Vector<float, 4>() {}
 
 	/** Constructs a vector from a native `__m128` type. */
-	f32<4>(__m128 v) : v(v) {}
+	Vector<float, 4>(__m128 v) : v(v) {}
 
 	/** Constructs a vector with all elements set to `x`. */
-	f32<4>(float x) {
+	Vector<float, 4>(float x) {
 		v = _mm_set_ps1(x);
 	}
 
 	/** Constructs a vector from four values. */
-	f32<4>(float x1, float x2, float x3, float x4) {
+	Vector<float, 4>(float x1, float x2, float x3, float x4) {
 		v = _mm_set_ps(x1, x2, x3, x4);
 	}
 
-	/** Reads an array of 4 values. */
-	static f32<4> load(const float *x) {
-		return f32<4>(_mm_loadu_ps(x));
+	/** Returns a vector initialized to zero. */
+	static Vector<float, 4> zero() {
+		return Vector<float, 4>(_mm_setzero_ps());
 	}
 
-	/** Returns a vector initialized to zero. */
-	static f32<4> zero() {
-		return f32<4>(_mm_setzero_ps());
+	/** Reads an array of 4 values. */
+	static Vector<float, 4> load(const float *x) {
+		return Vector<float, 4>(_mm_loadu_ps(x));
 	}
 
 	/** Writes an array of 4 values. */
@@ -83,29 +83,32 @@ struct f32<4> {
 };
 
 
-typedef f32<4> f32_4;
+// Typedefs
+
+
+typedef Vector<float, 4> float4;
 
 
 // Operator overloads
 
 
 /** `a @ b` */
-#define DECLARE_F32_4_OPERATOR_INFIX(operator, func) \
-	inline f32_4 operator(const f32_4 &a, const f32_4 &b) { \
-		return f32_4(func(a.v, b.v)); \
+#define DECLARE_FLOAT4_OPERATOR_INFIX(operator, func) \
+	inline float4 operator(const float4 &a, const float4 &b) { \
+		return float4(func(a.v, b.v)); \
 	}
 
 /** `a @= b` */
-#define DECLARE_F32_4_OPERATOR_INCREMENT(operator, opfunc) \
-	inline f32_4 &operator(f32_4 &a, const f32_4 &b) { \
+#define DECLARE_FLOAT4_OPERATOR_INCREMENT(operator, opfunc) \
+	inline float4 &operator(float4 &a, const float4 &b) { \
 		a = opfunc(a, b); \
 		return a; \
 	}
 
-DECLARE_F32_4_OPERATOR_INFIX(operator+, _mm_add_ps)
-DECLARE_F32_4_OPERATOR_INFIX(operator-, _mm_sub_ps)
-DECLARE_F32_4_OPERATOR_INFIX(operator*, _mm_mul_ps)
-DECLARE_F32_4_OPERATOR_INFIX(operator/, _mm_div_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator+, _mm_add_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator-, _mm_sub_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator*, _mm_mul_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator/, _mm_div_ps)
 
 /* Use these to apply logic, bit masks, and conditions to elements.
 Boolean operators on vectors give 0x00000000 for false and 0xffffffff for true, for each vector element.
@@ -116,75 +119,75 @@ Subtract 1 from value if greater than or equal to 1.
 
 	x -= (x >= 1.f) & 1.f;
 */
-DECLARE_F32_4_OPERATOR_INFIX(operator^, _mm_xor_ps)
-DECLARE_F32_4_OPERATOR_INFIX(operator&, _mm_and_ps)
-DECLARE_F32_4_OPERATOR_INFIX(operator|, _mm_or_ps)
-
-DECLARE_F32_4_OPERATOR_INCREMENT(operator+=, operator+);
-DECLARE_F32_4_OPERATOR_INCREMENT(operator-=, operator-);
-DECLARE_F32_4_OPERATOR_INCREMENT(operator*=, operator*);
-DECLARE_F32_4_OPERATOR_INCREMENT(operator/=, operator/);
-DECLARE_F32_4_OPERATOR_INCREMENT(operator^=, operator^);
-DECLARE_F32_4_OPERATOR_INCREMENT(operator&=, operator&);
-DECLARE_F32_4_OPERATOR_INCREMENT(operator|=, operator|);
-
-DECLARE_F32_4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps)
-DECLARE_F32_4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps)
-DECLARE_F32_4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps)
-DECLARE_F32_4_OPERATOR_INFIX(operator<=, _mm_cmple_ps)
-DECLARE_F32_4_OPERATOR_INFIX(operator<, _mm_cmplt_ps)
-DECLARE_F32_4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator^, _mm_xor_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator&, _mm_and_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator|, _mm_or_ps)
+
+DECLARE_FLOAT4_OPERATOR_INCREMENT(operator+=, operator+);
+DECLARE_FLOAT4_OPERATOR_INCREMENT(operator-=, operator-);
+DECLARE_FLOAT4_OPERATOR_INCREMENT(operator*=, operator*);
+DECLARE_FLOAT4_OPERATOR_INCREMENT(operator/=, operator/);
+DECLARE_FLOAT4_OPERATOR_INCREMENT(operator^=, operator^);
+DECLARE_FLOAT4_OPERATOR_INCREMENT(operator&=, operator&);
+DECLARE_FLOAT4_OPERATOR_INCREMENT(operator|=, operator|);
+
+DECLARE_FLOAT4_OPERATOR_INFIX(operator==, _mm_cmpeq_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator>=, _mm_cmpge_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator>, _mm_cmpgt_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator<=, _mm_cmple_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator<, _mm_cmplt_ps)
+DECLARE_FLOAT4_OPERATOR_INFIX(operator!=, _mm_cmpneq_ps)
 
 /** `+a` */
-inline f32_4 operator+(const f32_4 &a) {
+inline float4 operator+(const float4 &a) {
 	return a;
 }
 
 /** `-a` */
-inline f32_4 operator-(const f32_4 &a) {
+inline float4 operator-(const float4 &a) {
 	return 0.f - a;
 }
 
 /** `++a` */
-inline f32_4 &operator++(f32_4 &a) {
+inline float4 &operator++(float4 &a) {
 	a += 1.f;
 	return a;
 }
 
 /** `--a` */
-inline f32_4 &operator--(f32_4 &a) {
+inline float4 &operator--(float4 &a) {
 	a -= 1.f;
 	return a;
 }
 
 /** `a++` */
-inline f32_4 operator++(f32_4 &a, int) {
-	f32_4 b = a;
+inline float4 operator++(float4 &a, int) {
+	float4 b = a;
 	++a;
 	return b;
 }
 
 /** `a--` */
-inline f32_4 operator--(f32_4 &a, int) {
-	f32_4 b = a;
+inline float4 operator--(float4 &a, int) {
+	float4 b = a;
 	--a;
 	return b;
 }
 
 /** `~a` */
-inline f32_4 operator~(const f32_4 &a) {
-	f32_4 mask = f32_4::zero();
+inline float4 operator~(const float4 &a) {
+	float4 mask = float4::zero();
 	mask = (mask == mask);
 	return a ^ mask;
 }
 
 
-// helpful idioms
+// Instructions not available as operators
 
 
 /** `~a & b` */
-inline f32_4 andnot(const f32_4 &a, const f32_4 &b) {
-	return f32_4(_mm_andnot_ps(a.v, b.v));
+inline float4 andnot(const float4 &a, const float4 &b) {
+	return float4(_mm_andnot_ps(a.v, b.v));
 }