From a079bf1c632501a12d1456af086133dd3209aec9 Mon Sep 17 00:00:00 2001 From: Andrew Belt Date: Wed, 1 Apr 2020 01:53:01 -0400 Subject: [PATCH] Fix horrible correctness bug in movemaskInverse. --- include/simd/functions.hpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/include/simd/functions.hpp b/include/simd/functions.hpp index c5e99990..8ca1bfef 100644 --- a/include/simd/functions.hpp +++ b/include/simd/functions.hpp @@ -34,17 +34,20 @@ inline float_4 rcp(float_4 x) { return float_4(_mm_rcp_ps(x.v)); } -/** Given a mask `a`, returns a vector with each element either 0's or 1's depending on the mask bit. +/** Returns a vector where element N is all 1's if the N'th bit of `a` is 1, or all 0's if the N'th bit of `a` is 0. */ template <typename T> T movemaskInverse(int a); template <> -inline float_4 movemaskInverse<float_4>(int x) { - __m128i msk8421 = _mm_set_epi32(8, 4, 2, 1); - __m128i x_bc = _mm_set1_epi32(x); - __m128i t = _mm_and_si128(x_bc, msk8421); - return float_4(_mm_castsi128_ps(_mm_cmpeq_epi32(x_bc, t))); +inline int32_4 movemaskInverse<int32_4>(int a) { + int32_4 msk8421 = int32_4(1, 2, 4, 8); + return (msk8421 & int32_4(a)) == msk8421; +} + +template <> +inline float_4 movemaskInverse<float_4>(int a) { + return float_4::cast(movemaskInverse<int32_4>(a)); }