|  |  | @@ -63,7 +63,14 @@ struct Vector<float, 4> { | 
		
	
		
			
			|  |  |  | return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | /** Reads an array of 4 values. */ | 
		
	
		
			
			|  |  |  | /** Constructs a vector from four values in reverse. */ | 
		
	
		
			
			|  |  |  | static Vector setr(float x1, float x2, float x3, float x4) { | 
		
	
		
			
			|  |  |  | return Vector(_mm_setr_ps(x1, x2, x3, x4)); | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | /** Reads an array of 4 values. | 
		
	
		
			
			|  |  |  | On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`. | 
		
	
		
			
			|  |  |  | */ | 
		
	
		
			
			|  |  |  | static Vector load(const float *x) { | 
		
	
		
			
			|  |  |  | /* | 
		
	
		
			
			|  |  |  | My benchmarks show that _mm_loadu_ps() performs as fast as _mm_load_ps() when data is actually aligned. | 
		
	
	
		
			
				|  |  | @@ -73,7 +80,9 @@ struct Vector<float, 4> { | 
		
	
		
			
			|  |  |  | return Vector(_mm_loadu_ps(x)); | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | /** Writes an array of 4 values. */ | 
		
	
		
			
			|  |  |  | /** Writes an array of 4 values. | 
		
	
		
			
			|  |  |  | On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`. | 
		
	
		
			
			|  |  |  | */ | 
		
	
		
			
			|  |  |  | void store(float *x) { | 
		
	
		
			
			|  |  |  | _mm_storeu_ps(x, v); | 
		
	
		
			
			|  |  |  | } | 
		
	
	
		
			
				|  |  | 
 |