Browse Source

Change `movemask(int32_4)` to operate on elements rather than bytes. Tweak Vector documentation and clean up syntax.

tags/v2.0.0
Andrew Belt 4 years ago
parent
commit
a04018a330
1 changed files with 22 additions and 23 deletions
  1. +22
    -23
      include/simd/vector.hpp

+ 22
- 23
include/simd/vector.hpp View File

@@ -30,7 +30,7 @@ struct Vector;
*/
template <>
struct Vector<float, 4> {
typedef float type;
using type = float;
constexpr static int size = 4;

union {
@@ -56,35 +56,38 @@ struct Vector<float, 4> {
v = _mm_setr_ps(x1, x2, x3, x4);
}

/** Returns a vector initialized to zero. */
/** Returns a vector with all 0 bits. */
static Vector zero() {
return Vector();
return Vector(_mm_setzero_ps());
}

/** Returns a vector with all 1 bits. */
static Vector mask() {
return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
return Vector(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())));
}

/** Reads an array of 4 values.
On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
On little-endian machines (e.g. x86_64), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
*/
static Vector load(const float* x) {
/*
My benchmarks show that _mm_loadu_ps() performs just as fast as _mm_load_ps() when the data is actually aligned.
This post seems to agree. https://stackoverflow.com/a/20265193/272642
So use _mm_loadu_ps() for generality, so you can load unaligned arrays using the same function (although it will be slower).
I therefore use _mm_loadu_ps() for generality, so you can load unaligned arrays using the same function (although you should load aligned arrays if you can, for best performance).
*/
return Vector(_mm_loadu_ps(x));
}

/** Writes an array of 4 values.
On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
On little-endian machines (e.g. x86_64), the order is reversed, so `x[0]` corresponds to `vector.s[3]`.
*/
void store(float* x) {
_mm_storeu_ps(x, v);
}

/** Accessing vector elements individually is slow and defeats the purpose of vectorizing.
However, this operator is convenient when writing simple serial code in a non-bottlenecked section.
*/
float& operator[](int i) {
return s[i];
}
@@ -101,7 +104,7 @@ struct Vector<float, 4> {

template <>
struct Vector<int32_t, 4> {
typedef int32_t type;
using type = int32_t;
constexpr static int size = 4;

union {
@@ -118,7 +121,7 @@ struct Vector<int32_t, 4> {
v = _mm_setr_epi32(x1, x2, x3, x4);
}
static Vector zero() {
return Vector();
return Vector(_mm_setzero_si128());
}
static Vector mask() {
return Vector(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
@@ -126,7 +129,7 @@ struct Vector<int32_t, 4> {
static Vector load(const int32_t* x) {
// HACK
// Use _mm_loadu_si128() because GCC doesn't support _mm_loadu_si32()
return Vector(_mm_loadu_si128((__m128i*) x));
return Vector(_mm_loadu_si128((const __m128i*) x));
}
void store(int32_t* x) {
// HACK
@@ -179,11 +182,11 @@ inline int movemask(const Vector<float, 4>& a) {
return _mm_movemask_ps(a.v);
}

/** Returns an integer with each bit corresponding to the most significant bit of each byte.
For example, `movemask(int32_4::mask())` returns 0xffff.
/** Returns an integer with each bit corresponding to the most significant bit of each element.
For example, `movemask(int32_4::mask())` returns 0xf.
*/
inline int movemask(const Vector<int32_t, 4>& a) {
return _mm_movemask_epi8(a.v);
return _mm_movemask_ps(_mm_castsi128_ps(a.v));
}


@@ -295,22 +298,18 @@ inline Vector<int32_t, 4> operator-(const Vector<int32_t, 4>& a) {

/** `++a` */
inline Vector<float, 4>& operator++(Vector<float, 4>& a) {
a += 1.f;
return a;
return a += 1.f;
}
inline Vector<int32_t, 4>& operator++(Vector<int32_t, 4>& a) {
a += 1;
return a;
return a += 1;
}

/** `--a` */
inline Vector<float, 4>& operator--(Vector<float, 4>& a) {
a -= 1.f;
return a;
return a -= 1.f;
}
inline Vector<int32_t, 4>& operator--(Vector<int32_t, 4>& a) {
a -= 1;
return a;
return a -= 1;
}

/** `a++` */
@@ -359,8 +358,8 @@ inline Vector<int32_t, 4> operator>>(const Vector<int32_t, 4>& a, const int& b)
// Typedefs


typedef Vector<float, 4> float_4;
typedef Vector<int32_t, 4> int32_4;
using float_4 = Vector<float, 4>;
using int32_4 = Vector<int32_t, 4>;


} // namespace simd


Loading…
Cancel
Save