| @@ -30,7 +30,7 @@ struct Vector; | |||
| */ | |||
| template <> | |||
| struct Vector<float, 4> { | |||
| typedef float type; | |||
| using type = float; | |||
| constexpr static int size = 4; | |||
| union { | |||
| @@ -56,35 +56,38 @@ struct Vector<float, 4> { | |||
| v = _mm_setr_ps(x1, x2, x3, x4); | |||
| } | |||
| /** Returns a vector initialized to zero. */ | |||
| /** Returns a vector with all 0 bits. */ | |||
| static Vector zero() { | |||
| return Vector(); | |||
| return Vector(_mm_setzero_ps()); | |||
| } | |||
| /** Returns a vector with all 1 bits. */ | |||
| static Vector mask() { | |||
| return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); | |||
| return Vector(_mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()))); | |||
| } | |||
| /** Reads an array of 4 values. | |||
| On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`. | |||
| On little-endian machines (e.g. x86_64), the order is reversed, so `x[0]` corresponds to `vector.s[3]`. | |||
| */ | |||
| static Vector load(const float* x) { | |||
| /* | |||
| My benchmarks show that _mm_loadu_ps() performs as fast as _mm_load_ps() when data is actually aligned. | |||
| This post seems to agree. https://stackoverflow.com/a/20265193/272642 | |||
| So use _mm_loadu_ps() for generality, so you can load unaligned arrays using the same function (although it will be slower). | |||
| I therefore use _mm_loadu_ps() for generality, so you can load unaligned arrays using the same function (although you should load aligned arrays if you can, for best performance). | |||
| */ | |||
| return Vector(_mm_loadu_ps(x)); | |||
| } | |||
| /** Writes an array of 4 values. | |||
| On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`. | |||
| On little-endian machines (e.g. x86_64), the order is reversed, so `x[0]` corresponds to `vector.s[3]`. | |||
| */ | |||
| void store(float* x) { | |||
| _mm_storeu_ps(x, v); | |||
| } | |||
| /** Accessing vector elements individually is slow and defeats the purpose of vectorizing. | |||
| However, this operator is convenient when writing simple serial code in a non-bottlenecked section. | |||
| */ | |||
| float& operator[](int i) { | |||
| return s[i]; | |||
| } | |||
| @@ -101,7 +104,7 @@ struct Vector<float, 4> { | |||
| template <> | |||
| struct Vector<int32_t, 4> { | |||
| typedef int32_t type; | |||
| using type = int32_t; | |||
| constexpr static int size = 4; | |||
| union { | |||
| @@ -118,7 +121,7 @@ struct Vector<int32_t, 4> { | |||
| v = _mm_setr_epi32(x1, x2, x3, x4); | |||
| } | |||
| static Vector zero() { | |||
| return Vector(); | |||
| return Vector(_mm_setzero_si128()); | |||
| } | |||
| static Vector mask() { | |||
| return Vector(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); | |||
| @@ -126,7 +129,7 @@ struct Vector<int32_t, 4> { | |||
| static Vector load(const int32_t* x) { | |||
| // HACK | |||
| // Use _mm_loadu_si128() because GCC doesn't support _mm_loadu_si32() | |||
| return Vector(_mm_loadu_si128((__m128i*) x)); | |||
| return Vector(_mm_loadu_si128((const __m128i*) x)); | |||
| } | |||
| void store(int32_t* x) { | |||
| // HACK | |||
| @@ -179,11 +182,11 @@ inline int movemask(const Vector<float, 4>& a) { | |||
| return _mm_movemask_ps(a.v); | |||
| } | |||
| /** Returns an integer with each bit corresponding to the most significant bit of each byte. | |||
| For example, `movemask(int32_4::mask())` returns 0xffff. | |||
| /** Returns an integer with each bit corresponding to the most significant bit of each element. | |||
| For example, `movemask(int32_4::mask())` returns 0xf. | |||
| */ | |||
| inline int movemask(const Vector<int32_t, 4>& a) { | |||
| return _mm_movemask_epi8(a.v); | |||
| return _mm_movemask_ps(_mm_castsi128_ps(a.v)); | |||
| } | |||
| @@ -295,22 +298,18 @@ inline Vector<int32_t, 4> operator-(const Vector<int32_t, 4>& a) { | |||
| /** `++a` */ | |||
| inline Vector<float, 4>& operator++(Vector<float, 4>& a) { | |||
| a += 1.f; | |||
| return a; | |||
| return a += 1.f; | |||
| } | |||
| inline Vector<int32_t, 4>& operator++(Vector<int32_t, 4>& a) { | |||
| a += 1; | |||
| return a; | |||
| return a += 1; | |||
| } | |||
| /** `--a` */ | |||
| inline Vector<float, 4>& operator--(Vector<float, 4>& a) { | |||
| a -= 1.f; | |||
| return a; | |||
| return a -= 1.f; | |||
| } | |||
| inline Vector<int32_t, 4>& operator--(Vector<int32_t, 4>& a) { | |||
| a -= 1; | |||
| return a; | |||
| return a -= 1; | |||
| } | |||
| /** `a++` */ | |||
| @@ -359,8 +358,8 @@ inline Vector<int32_t, 4> operator>>(const Vector<int32_t, 4>& a, const int& b) | |||
| // Typedefs | |||
| typedef Vector<float, 4> float_4; | |||
| typedef Vector<int32_t, 4> int32_4; | |||
| using float_4 = Vector<float, 4>; | |||
| using int32_4 = Vector<int32_t, 4>; | |||
| } // namespace simd | |||