|  | @@ -63,7 +63,14 @@ struct Vector<float, 4> { | 
														
													
														
															
																|  |  | return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); |  |  | return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); | 
														
													
														
															
																|  |  | } |  |  | } | 
														
													
														
															
																|  |  | 
 |  |  | 
 | 
														
													
														
															
																|  |  | /** Reads an array of 4 values. */ |  |  |  | 
														
													
														
															
																|  |  |  |  |  | /** Constructs a vector from four values in reverse. */ | 
														
													
														
															
																|  |  |  |  |  | static Vector setr(float x1, float x2, float x3, float x4) { | 
														
													
														
															
																|  |  |  |  |  | return Vector(_mm_setr_ps(x1, x2, x3, x4)); | 
														
													
														
															
																|  |  |  |  |  | } | 
														
													
														
															
																|  |  |  |  |  | 
 | 
														
													
														
															
																|  |  |  |  |  | /** Reads an array of 4 values. | 
														
													
														
															
																|  |  |  |  |  | On little-endian machines (e.g. x86), the order is reversed, so `x[0]` corresponds to `vector.s[3]`. | 
														
													
														
															
																|  |  |  |  |  | */ | 
														
													
														
															
																|  |  | static Vector load(const float *x) { |  |  | static Vector load(const float *x) { | 
														
													
														
															
																|  |  | /* |  |  | /* | 
														
													
														
															
																|  |  | My benchmarks show that _mm_loadu_ps() performs equally as fast as _mm_load_ps() when data is actually aligned. |  |  | My benchmarks show that _mm_loadu_ps() performs equally as fast as _mm_load_ps() when data is actually aligned. | 
														
													
												
													
														
															
																|  | @@ -73,7 +80,9 @@ struct Vector<float, 4> { | 
														
													
														
															
																|  |  | return Vector(_mm_loadu_ps(x)); |  |  | return Vector(_mm_loadu_ps(x)); | 
														
													
														
															
																|  |  | } |  |  | } | 
														
													
														
															
																|  |  | 
 |  |  | 
 | 
														
													
														
															
																|  |  | /** Writes an array of 4 values. */ |  |  |  | 
														
													
														
															
																/** Writes the 4 values of this vector to the array `x`.
																Uses the unaligned store `_mm_storeu_ps`, so `x` does not need any particular alignment.
																NOTE(review): this was previously documented as "on little-endian machines (e.g. x86),
																the order is reversed, so `x[0]` corresponds to `vector.s[3]`". `_mm_storeu_ps` itself
																places the lowest 32 bits of the register at `x[0]` -- TODO confirm against the
																definition of `s` before relying on that statement.
																*/
																void store(float *x) {
																	// `v` is the wrapped SSE register; all four lanes are written in one instruction.
																	_mm_storeu_ps(x, v);
																}
														
													
												
													
														
															
																|  | 
 |