
Change `movemask(int32_4)` to operate on elements rather than bytes. Tweak Vector documentation and clean up syntax.

tags/v2.0.0
Andrew Belt, 5 years ago
commit a04018a330
1 changed file with 22 additions and 23 deletions:

  include/simd/vector.hpp (+22, -23)

@@ -30,7 +30,7 @@ struct Vector;
 */
 template <>
 struct Vector<float, 4> {
-	typedef float type;
+	using type = float;
 	constexpr static int size = 4;

 	union {
@@ -56,35 +56,38 @@ struct Vector<float, 4> {
 		v = _mm_setr_ps(x1, x2, x3, x4);
 	}

-	/** Returns a vector initialized to zero. */
+	/** Returns a vector with all 0 bits. */
 	static Vector zero() {
-		return Vector();
+		return Vector(_mm_setzero_ps());
 	}

 	/** Returns a vector with all 1 bits. */
 	static Vector mask() {
-		return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+		return Vector(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())));
 	}

 	/** Reads an array of 4 values.
-	On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
+	On little-endian machines (e.g. x86_64), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
 	*/
 	static Vector load(const float* x) {
 		/*
 		My benchmarks show that _mm_loadu_ps() performs equally as fast as _mm_load_ps() when data is actually aligned.
 		This post seems to agree. https://stackoverflow.com/a/20265193/272642
-		So use _mm_loadu_ps() for generality, so you can load unaligned arrays using the same function (although it will be slower).
+		I therefore use _mm_loadu_ps() for generality, so you can load unaligned arrays using the same function (although load aligned arrays if you can for best performance).
 		*/
 		return Vector(_mm_loadu_ps(x));
 	}

 	/** Writes an array of 4 values.
-	On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
+	On little-endian machines (e.g. x86_64), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
 	*/
 	void store(float* x) {
 		_mm_storeu_ps(x, v);
 	}

+	/** Accessing vector elements individually is slow and defeats the purpose of vectorizing.
+	However, this operator is convenient when writing simple serial code in a non-bottlenecked section.
+	*/
 	float& operator[](int i) {
 		return s[i];
 	}
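
This hunk makes construction explicit: `zero()` now returns a zeroed register rather than relying on the default constructor, and `mask()` wraps its intrinsic result in `Vector(...)`. A minimal standalone sketch of the same bit patterns, using raw SSE2 intrinsics instead of the `Vector` class (variable names here are illustrative, not part of the header):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <emmintrin.h> // SSE2

int main() {
	// zero(): all 0 bits, i.e. four 0.f lanes.
	__m128 zero = _mm_setzero_ps();

	// mask(): all 1 bits. Comparing a register with itself for equality
	// yields 0xffffffff in every 32-bit lane.
	__m128 mask = _mm_castsi128_ps(
		_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));

	// load()/store() round-trip with the unaligned variants, matching the
	// header's choice of _mm_loadu_ps()/_mm_storeu_ps().
	float in[4] = {1.f, 2.f, 3.f, 4.f};
	float out[4];
	_mm_storeu_ps(out, _mm_loadu_ps(in));

	uint32_t bits[4];
	std::memcpy(bits, &mask, sizeof(bits));
	std::printf("mask lane 0 = 0x%08x\n", bits[0]); // 0xffffffff
	std::printf("round trip: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
	(void) zero;
}
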
@@ -101,7 +104,7 @@ struct Vector<float, 4> {

 template <>
 struct Vector<int32_t, 4> {
-	typedef int32_t type;
+	using type = int32_t;
 	constexpr static int size = 4;

 	union {
@@ -118,7 +121,7 @@ struct Vector<int32_t, 4> {
 		v = _mm_setr_epi32(x1, x2, x3, x4);
 	}
 	static Vector zero() {
-		return Vector();
+		return Vector(_mm_setzero_si128());
 	}
 	static Vector mask() {
 		return Vector(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
@@ -126,7 +129,7 @@ struct Vector<int32_t, 4> {
 	static Vector load(const int32_t* x) {
 		// HACK
 		// Use _mm_loadu_si128() because GCC doesn't support _mm_loadu_si32()
-		return Vector(_mm_loadu_si128((__m128i*) x));
+		return Vector(_mm_loadu_si128((const __m128i*) x));
 	}
 	void store(int32_t* x) {
 		// HACK
@@ -179,11 +182,11 @@ inline int movemask(const Vector<float, 4>& a) {
 	return _mm_movemask_ps(a.v);
 }

-/** Returns an integer with each bit corresponding to the most significant bit of each byte.
-For example, `movemask(int32_4::mask())` returns 0xffff.
+/** Returns an integer with each bit corresponding to the most significant bit of each element.
+For example, `movemask(int32_4::mask())` returns 0xf.
 */
 inline int movemask(const Vector<int32_t, 4>& a) {
-	return _mm_movemask_epi8(a.v);
+	return _mm_movemask_ps(_mm_castsi128_ps(a.v));
 }
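
The behavioral change named in the commit message is easiest to see with both intrinsics side by side: the old `_mm_movemask_epi8` collects one sign bit per byte (16 bits from a 128-bit register), while the new element-wise version collects one bit per 32-bit lane (4 bits). A minimal standalone sketch, not part of the header:

#include <cstdio>
#include <emmintrin.h> // SSE2

int main() {
	// Equivalent of int32_4::mask(): all 1 bits in every 32-bit lane.
	__m128i m = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());

	// Old behavior: one bit per byte -> 16 bits set.
	int perByte = _mm_movemask_epi8(m);
	// New behavior: one bit per 32-bit element -> 4 bits set.
	int perElement = _mm_movemask_ps(_mm_castsi128_ps(m));

	std::printf("per byte:    0x%x\n", perByte);    // 0xffff
	std::printf("per element: 0x%x\n", perElement); // 0xf
}

This also makes the float and int32 overloads consistent: both now return a 4-bit mask, one bit per lane.
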




@@ -295,22 +298,18 @@ inline Vector<int32_t, 4> operator-(const Vector<int32_t, 4>& a) {

 /** `++a` */
 inline Vector<float, 4>& operator++(Vector<float, 4>& a) {
-	a += 1.f;
-	return a;
+	return a += 1.f;
 }
 inline Vector<int32_t, 4>& operator++(Vector<int32_t, 4>& a) {
-	a += 1;
-	return a;
+	return a += 1;
 }

 /** `--a` */
 inline Vector<float, 4>& operator--(Vector<float, 4>& a) {
-	a -= 1.f;
-	return a;
+	return a -= 1.f;
 }
 inline Vector<int32_t, 4>& operator--(Vector<int32_t, 4>& a) {
-	a -= 1;
-	return a;
+	return a -= 1;
 }

 /** `a++` */
@@ -359,8 +358,8 @@ inline Vector<int32_t, 4> operator>>(const Vector<int32_t, 4>& a, const int& b)
 // Typedefs


-typedef Vector<float, 4> float_4;
-typedef Vector<int32_t, 4> int32_4;
+using float_4 = Vector<float, 4>;
+using int32_4 = Vector<int32_t, 4>;


 } // namespace simd
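
For context, a sketch of how the `float_4` and `int32_4` aliases are typically used from calling code, exercising only members visible in this diff (`load`, `store`, `operator++`, `mask`, `movemask`). The include path and the bare `simd::` qualification are assumptions based on the file shown; in VCV Rack the namespace may be nested differently:

// Illustrative usage only; assumes the header is reachable as
// "simd/vector.hpp" and its namespace is `simd` as shown in the diff.
#include <cstdio>
#include "simd/vector.hpp"

int main() {
	float in[4] = {1.f, 2.f, 3.f, 4.f};
	simd::float_4 a = simd::float_4::load(in);
	++a; // per-lane increment via the pre-increment operator
	float out[4];
	a.store(out);
	std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 2 3 4 5

	// With this commit, movemask on an all-1-bits integer vector yields
	// one bit per 32-bit lane.
	std::printf("0x%x\n", simd::movemask(simd::int32_4::mask())); // 0xf
}
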

