
Change `movemask(int32_4)` to operate on elements rather than bytes. Tweak Vector documentation and clean up syntax.

tags/v2.0.0
Andrew Belt, 5 years ago
commit a04018a330
1 changed file with 22 additions and 23 deletions:

  include/simd/vector.hpp (+22, -23)

@@ -30,7 +30,7 @@ struct Vector;
 */
 template <>
 struct Vector<float, 4> {
-	typedef float type;
+	using type = float;
 	constexpr static int size = 4;

 	union {
@@ -56,35 +56,38 @@ struct Vector<float, 4> {
 		v = _mm_setr_ps(x1, x2, x3, x4);
 	}

-	/** Returns a vector initialized to zero. */
+	/** Returns a vector with all 0 bits. */
 	static Vector zero() {
-		return Vector();
+		return Vector(_mm_setzero_ps());
 	}

 	/** Returns a vector with all 1 bits. */
 	static Vector mask() {
-		return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+		return Vector(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())));
 	}

 	/** Reads an array of 4 values.
-	On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
+	On little-endian machines (e.g. x86_64), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
 	*/
 	static Vector load(const float* x) {
 		/*
 		My benchmarks show that _mm_loadu_ps() performs equally as fast as _mm_load_ps() when data is actually aligned.
 		This post seems to agree. https://stackoverflow.com/a/20265193/272642
-		So use _mm_loadu_ps() for generality, so you can load unaligned arrays using the same function (although it will be slower).
+		I therefore use _mm_loadu_ps() for generality, so you can load unaligned arrays using the same function (although load aligned arrays if you can for best performance).
 		*/
 		return Vector(_mm_loadu_ps(x));
 	}

 	/** Writes an array of 4 values.
-	On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
+	On little-endian machines (e.g. x86_64), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
 	*/
 	void store(float* x) {
 		_mm_storeu_ps(x, v);
 	}

+	/** Accessing vector elements individually is slow and defeats the purpose of vectorizing.
+	However, this operator is convenient when writing simple serial code in a non-bottlenecked section.
+	*/
 	float& operator[](int i) {
 		return s[i];
 	}
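
This hunk makes construction explicit: `zero()` now returns a zeroed register rather than relying on the default constructor, and `mask()` wraps its intrinsic result in `Vector(...)`. A minimal standalone sketch of the same bit patterns, using raw SSE2 intrinsics instead of the `Vector` class (variable names here are illustrative, not part of the header):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <emmintrin.h> // SSE2

int main() {
	// zero(): all 0 bits, i.e. four 0.f lanes.
	__m128 zero = _mm_setzero_ps();

	// mask(): all 1 bits. Comparing a register with itself for equality
	// yields 0xffffffff in every 32-bit lane.
	__m128 mask = _mm_castsi128_ps(
		_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));

	// load()/store() round-trip with the unaligned variants, matching the
	// header's choice of _mm_loadu_ps()/_mm_storeu_ps().
	float in[4] = {1.f, 2.f, 3.f, 4.f};
	float out[4];
	_mm_storeu_ps(out, _mm_loadu_ps(in));

	uint32_t bits[4];
	std::memcpy(bits, &mask, sizeof(bits));
	std::printf("mask lane 0 = 0x%08x\n", bits[0]); // 0xffffffff
	std::printf("round trip: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
	(void) zero;
}
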
@@ -101,7 +104,7 @@ struct Vector<float, 4> {

 template <>
 struct Vector<int32_t, 4> {
-	typedef int32_t type;
+	using type = int32_t;
 	constexpr static int size = 4;

 	union {
@@ -118,7 +121,7 @@ struct Vector<int32_t, 4> {
 		v = _mm_setr_epi32(x1, x2, x3, x4);
 	}
 	static Vector zero() {
-		return Vector();
+		return Vector(_mm_setzero_si128());
 	}
 	static Vector mask() {
 		return Vector(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
@@ -126,7 +129,7 @@ struct Vector<int32_t, 4> {
 	static Vector load(const int32_t* x) {
 		// HACK
 		// Use _mm_loadu_si128() because GCC doesn't support _mm_loadu_si32()
-		return Vector(_mm_loadu_si128((__m128i*) x));
+		return Vector(_mm_loadu_si128((const __m128i*) x));
 	}
 	void store(int32_t* x) {
 		// HACK
@@ -179,11 +182,11 @@ inline int movemask(const Vector<float, 4>& a) {
 	return _mm_movemask_ps(a.v);
 }

-/** Returns an integer with each bit corresponding to the most significant bit of each byte.
-For example, `movemask(int32_4::mask())` returns 0xffff.
+/** Returns an integer with each bit corresponding to the most significant bit of each element.
+For example, `movemask(int32_4::mask())` returns 0xf.
 */
 inline int movemask(const Vector<int32_t, 4>& a) {
-	return _mm_movemask_epi8(a.v);
+	return _mm_movemask_ps(_mm_castsi128_ps(a.v));
 }
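
The behavioral change named in the commit message is easiest to see with both intrinsics side by side: the old `_mm_movemask_epi8` collects one sign bit per byte (16 bits from a 128-bit register), while the new element-wise version collects one bit per 32-bit lane (4 bits). A minimal standalone sketch, not part of the header:

#include <cstdio>
#include <emmintrin.h> // SSE2

int main() {
	// Equivalent of int32_4::mask(): all 1 bits in every 32-bit lane.
	__m128i m = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());

	// Old behavior: one bit per byte -> 16 bits set.
	int perByte = _mm_movemask_epi8(m);
	// New behavior: one bit per 32-bit element -> 4 bits set.
	int perElement = _mm_movemask_ps(_mm_castsi128_ps(m));

	std::printf("per byte:    0x%x\n", perByte);    // 0xffff
	std::printf("per element: 0x%x\n", perElement); // 0xf
}

This also makes the float and int32 overloads consistent: both now return a 4-bit mask, one bit per lane.
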




@@ -295,22 +298,18 @@ inline Vector<int32_t, 4> operator-(const Vector<int32_t, 4>& a) {

 /** `++a` */
 inline Vector<float, 4>& operator++(Vector<float, 4>& a) {
-	a += 1.f;
-	return a;
+	return a += 1.f;
 }
 inline Vector<int32_t, 4>& operator++(Vector<int32_t, 4>& a) {
-	a += 1;
-	return a;
+	return a += 1;
 }

 /** `--a` */
 inline Vector<float, 4>& operator--(Vector<float, 4>& a) {
-	a -= 1.f;
-	return a;
+	return a -= 1.f;
 }
 inline Vector<int32_t, 4>& operator--(Vector<int32_t, 4>& a) {
-	a -= 1;
-	return a;
+	return a -= 1;
 }

 /** `a++` */
@@ -359,8 +358,8 @@ inline Vector<int32_t, 4> operator>>(const Vector<int32_t, 4>& a, const int& b)
 // Typedefs


-typedef Vector<float, 4> float_4;
-typedef Vector<int32_t, 4> int32_4;
+using float_4 = Vector<float, 4>;
+using int32_4 = Vector<int32_t, 4>;


 } // namespace simd
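
For context, a sketch of how the `float_4` and `int32_4` aliases are typically used from calling code, exercising only members visible in this diff (`load`, `store`, `operator++`, `mask`, `movemask`). The include path and the bare `simd::` qualification are assumptions based on the file shown; in VCV Rack the namespace may be nested differently:

// Illustrative usage only; assumes the header is reachable as
// "simd/vector.hpp" and its namespace is `simd` as shown in the diff.
#include <cstdio>
#include "simd/vector.hpp"

int main() {
	float in[4] = {1.f, 2.f, 3.f, 4.f};
	simd::float_4 a = simd::float_4::load(in);
	++a; // per-lane increment via the pre-increment operator
	float out[4];
	a.store(out);
	std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 2 3 4 5

	// With this commit, movemask on an all-1-bits integer vector yields
	// one bit per 32-bit lane.
	std::printf("0x%x\n", simd::movemask(simd::int32_4::mask())); // 0xf
}
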

