Collection of DPF-based plugins for packaging
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

199 lines
6.2KB

  1. /// @ref core
  2. /// @file glm/gtc/quaternion_simd.inl
  3. #if GLM_ARCH & GLM_ARCH_SSE2_BIT
  4. namespace glm{
  5. namespace detail
  6. {
  7. /*
  8. template<qualifier Q>
  9. struct compute_quat_mul<float, Q, true>
  10. {
  11. static tquat<float, Q> call(tquat<float, Q> const& q1, tquat<float, Q> const& q2)
  12. {
  13. // SSE2 STATS: 11 shuffle, 8 mul, 8 add
  14. // SSE4 STATS: 3 shuffle, 4 mul, 4 dpps
  15. __m128 const mul0 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
  16. __m128 const mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
  17. __m128 const mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
  18. __m128 const mul3 = _mm_mul_ps(q1.Data, q2.Data);
  19. # if GLM_ARCH & GLM_ARCH_SSE41_BIT
  20. __m128 const add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
  21. __m128 const add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
  22. __m128 const add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
  23. __m128 const add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
  24. # else
  25. __m128 const mul4 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
  26. __m128 const add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul4, mul4));
  27. __m128 const add4 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
  28. __m128 const mul5 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
  29. __m128 const add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul5, mul5));
  30. __m128 const add5 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
  31. __m128 const mul6 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
  32. __m128 const add2 = _mm_add_ps(mul6, _mm_movehl_ps(mul6, mul6));
  33. __m128 const add6 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
  34. __m128 const mul7 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
  35. __m128 const add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul7, mul7));
  36. __m128 const add7 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
  37. #endif
  38. // This SIMD code is a politically correct way of doing this, but in every test I've tried it has been slower than
  39. // the final code below. I'll keep this here for reference - maybe somebody else can do something better...
  40. //
  41. //__m128 xxyy = _mm_shuffle_ps(add4, add5, _MM_SHUFFLE(0, 0, 0, 0));
  42. //__m128 zzww = _mm_shuffle_ps(add6, add7, _MM_SHUFFLE(0, 0, 0, 0));
  43. //
  44. //return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0));
  45. tquat<float, Q> Result;
  46. _mm_store_ss(&Result.x, add4);
  47. _mm_store_ss(&Result.y, add5);
  48. _mm_store_ss(&Result.z, add6);
  49. _mm_store_ss(&Result.w, add7);
  50. return Result;
  51. }
  52. };
  53. */
  54. template<qualifier Q>
  55. struct compute_dot<tquat<float, Q>, float, true>
  56. {
  57. static GLM_FUNC_QUALIFIER float call(tquat<float, Q> const& x, tquat<float, Q> const& y)
  58. {
  59. return _mm_cvtss_f32(glm_vec1_dot(x.data, y.data));
  60. }
  61. };
  62. template<qualifier Q>
  63. struct compute_quat_add<float, Q, true>
  64. {
  65. static tquat<float, Q> call(tquat<float, Q> const& q, tquat<float, Q> const& p)
  66. {
  67. tquat<float, Q> Result;
  68. Result.data = _mm_add_ps(q.data, p.data);
  69. return Result;
  70. }
  71. };
  // The double-precision variant is only compiled when the AVX bit is set in GLM_ARCH.
  # if GLM_ARCH & GLM_ARCH_AVX_BIT
  /// Specialization of compute_quat_add for double quaternions, selected when
  /// the third template argument is `true`.
  template<qualifier Q>
  struct compute_quat_add<double, Q, true>
  {
      /// @param a Left-hand quaternion.
      /// @param b Right-hand quaternion.
      /// @return A quaternion whose `data` register is the lane-wise sum
      ///         `a.data + b.data` computed by _mm256_add_pd.
      static tquat<double, Q> call(tquat<double, Q> const& a, tquat<double, Q> const& b)
      {
          __m256d const LaneSums = _mm256_add_pd(a.data, b.data);

          tquat<double, Q> Sum;
          Sum.data = LaneSums;
          return Sum;
      }
  };
  # endif
  84. template<qualifier Q>
  85. struct compute_quat_sub<float, Q, true>
  86. {
  87. static tquat<float, Q> call(tquat<float, Q> const& q, tquat<float, Q> const& p)
  88. {
  89. vec<4, float, Q> Result;
  90. Result.data = _mm_sub_ps(q.data, p.data);
  91. return Result;
  92. }
  93. };
  // The double-precision variant is only compiled when the AVX bit is set in GLM_ARCH.
  # if GLM_ARCH & GLM_ARCH_AVX_BIT
  /// Specialization of compute_quat_sub for double quaternions, selected when
  /// the third template argument is `true`.
  template<qualifier Q>
  struct compute_quat_sub<double, Q, true>
  {
      /// @param a Minuend quaternion.
      /// @param b Subtrahend quaternion.
      /// @return A quaternion whose `data` register is the lane-wise difference
      ///         `a.data - b.data` computed by _mm256_sub_pd.
      static tquat<double, Q> call(tquat<double, Q> const& a, tquat<double, Q> const& b)
      {
          __m256d const LaneDiffs = _mm256_sub_pd(a.data, b.data);

          tquat<double, Q> Difference;
          Difference.data = LaneDiffs;
          return Difference;
      }
  };
  # endif
  106. template<qualifier Q>
  107. struct compute_quat_mul_scalar<float, Q, true>
  108. {
  109. static tquat<float, Q> call(tquat<float, Q> const& q, float s)
  110. {
  111. vec<4, float, Q> Result;
  112. Result.data = _mm_mul_ps(q.data, _mm_set_ps1(s));
  113. return Result;
  114. }
  115. };
  // The double-precision variant is only compiled when the AVX bit is set in GLM_ARCH.
  # if GLM_ARCH & GLM_ARCH_AVX_BIT
  /// Specialization of compute_quat_mul_scalar for double quaternions, selected
  /// when the third template argument is `true`.
  template<qualifier Q>
  struct compute_quat_mul_scalar<double, Q, true>
  {
      /// @param q Quaternion to scale.
      /// @param s Scalar factor, broadcast to all four double lanes.
      /// @return A quaternion whose `data` register is `q.data` multiplied
      ///         lane-wise by `s` using _mm256_mul_pd.
      static tquat<double, Q> call(tquat<double, Q> const& q, double s)
      {
          tquat<double, Q> Result;
          // _mm256_mul_pd takes two __m256d operands, so the scalar must be broadcast
          // with _mm256_set1_pd; _mm_set_ps1 would produce a 128-bit float register.
          Result.data = _mm256_mul_pd(q.data, _mm256_set1_pd(s));
          return Result;
      }
  };
  # endif
  128. template<qualifier Q>
  129. struct compute_quat_div_scalar<float, Q, true>
  130. {
  131. static tquat<float, Q> call(tquat<float, Q> const& q, float s)
  132. {
  133. vec<4, float, Q> Result;
  134. Result.data = _mm_div_ps(q.data, _mm_set_ps1(s));
  135. return Result;
  136. }
  137. };
  // The double-precision variant is only compiled when the AVX bit is set in GLM_ARCH.
  # if GLM_ARCH & GLM_ARCH_AVX_BIT
  /// Specialization of compute_quat_div_scalar for double quaternions, selected
  /// when the third template argument is `true`.
  template<qualifier Q>
  struct compute_quat_div_scalar<double, Q, true>
  {
      /// @param q Quaternion to divide.
      /// @param s Scalar divisor, broadcast to all four double lanes. No zero
      ///          check is performed here.
      /// @return A quaternion whose `data` register is `q.data` divided
      ///         lane-wise by `s` using _mm256_div_pd.
      static tquat<double, Q> call(tquat<double, Q> const& q, double s)
      {
          tquat<double, Q> Result;
          // _mm256_div_pd takes two __m256d operands, so the scalar must be broadcast
          // with _mm256_set1_pd; _mm_set_ps1 would produce a 128-bit float register.
          Result.data = _mm256_div_pd(q.data, _mm256_set1_pd(s));
          return Result;
      }
  };
  # endif
  150. template<qualifier Q>
  151. struct compute_quat_mul_vec4<float, Q, true>
  152. {
  153. static vec<4, float, Q> call(tquat<float, Q> const& q, vec<4, float, Q> const& v)
  154. {
  155. __m128 const q_wwww = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 3, 3, 3));
  156. __m128 const q_swp0 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 0, 2, 1));
  157. __m128 const q_swp1 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 1, 0, 2));
  158. __m128 const v_swp0 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 0, 2, 1));
  159. __m128 const v_swp1 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 1, 0, 2));
  160. __m128 uv = _mm_sub_ps(_mm_mul_ps(q_swp0, v_swp1), _mm_mul_ps(q_swp1, v_swp0));
  161. __m128 uv_swp0 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 0, 2, 1));
  162. __m128 uv_swp1 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 1, 0, 2));
  163. __m128 uuv = _mm_sub_ps(_mm_mul_ps(q_swp0, uv_swp1), _mm_mul_ps(q_swp1, uv_swp0));
  164. __m128 const two = _mm_set1_ps(2.0f);
  165. uv = _mm_mul_ps(uv, _mm_mul_ps(q_wwww, two));
  166. uuv = _mm_mul_ps(uuv, two);
  167. vec<4, float, Q> Result;
  168. Result.data = _mm_add_ps(v.Data, _mm_add_ps(uv, uuv));
  169. return Result;
  170. }
  171. };
  172. }//namespace detail
  173. }//namespace glm
  174. #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT