From 166099e82d00784761688c72ff6f6b24b7f77304 Mon Sep 17 00:00:00 2001
From: Christophe Riccio
Date: Thu, 14 Jul 2016 20:52:29 +0200
Subject: [PATCH] Added generic quaternion SIMD optimizations

---
 glm/detail/type_vec4.hpp    |   2 +-
 glm/gtc/quaternion.hpp      |  12 ++-
 glm/gtc/quaternion.inl      |  83 +++++++++++----
 glm/gtc/quaternion_simd.inl | 198 ++++++++++++++++++++++++++++++++++++
 4 files changed, 274 insertions(+), 21 deletions(-)
 create mode 100644 glm/gtc/quaternion_simd.inl

diff --git a/glm/detail/type_vec4.hpp b/glm/detail/type_vec4.hpp
index 5b5c7b3a..995e9db6 100644
--- a/glm/detail/type_vec4.hpp
+++ b/glm/detail/type_vec4.hpp
@@ -57,7 +57,7 @@ namespace glm
 #		ifdef GLM_SWIZZLE
 			GLM_SWIZZLE_GEN_VEC_FROM_VEC4(T, P, tvec4, tvec2, tvec3, tvec4)
 #		endif//GLM_SWIZZLE
-#	endif//GLM_LANG
+#	endif
 
 		// -- Component accesses --
 
diff --git a/glm/gtc/quaternion.hpp b/glm/gtc/quaternion.hpp
index 5b39cff7..179722a8 100644
--- a/glm/gtc/quaternion.hpp
+++ b/glm/gtc/quaternion.hpp
@@ -40,7 +40,15 @@ namespace glm
 
 		// -- Data --
 
-		T x, y, z, w;
+#		if GLM_HAS_UNRESTRICTED_UNIONS
+			union
+			{
+				struct { T x, y, z, w; };
+				typename detail::storage<T, sizeof(T) * 4, detail::is_aligned<P>::value>::type data;
+			};
+#		else
+			T x, y, z, w;
+#		endif
 
 		// -- Component accesses --
 
@@ -97,6 +105,8 @@ namespace glm
 		template <typename U>
 		GLM_FUNC_DECL tquat<T, P> & operator+=(tquat<U, P> const & q);
 		template <typename U>
+		GLM_FUNC_DECL tquat<T, P> & operator-=(tquat<U, P> const & q);
+		template <typename U>
 		GLM_FUNC_DECL tquat<T, P> & operator*=(tquat<U, P> const & q);
 		template <typename U>
 		GLM_FUNC_DECL tquat<T, P> & operator*=(U s);
diff --git a/glm/gtc/quaternion.inl b/glm/gtc/quaternion.inl
index bd609840..06257994 100644
--- a/glm/gtc/quaternion.inl
+++ b/glm/gtc/quaternion.inl
@@ -12,12 +12,57 @@ namespace detail
 	template <typename T, precision P, bool Aligned>
 	struct compute_dot<tquat, T, P, Aligned>
 	{
-		static GLM_FUNC_QUALIFIER T call(tquat<T, P> const & x, tquat<T, P> const & y)
+		static GLM_FUNC_QUALIFIER T call(tquat<T, P> const& x, tquat<T, P> const& y)
 		{
 			tvec4<T, P> tmp(x.x * y.x, x.y * y.y, x.z * y.z, x.w * y.w);
 			return (tmp.x + tmp.y) + (tmp.z + tmp.w);
 		}
 	};
+
+	template <typename T, precision P, bool Aligned>
+	struct compute_quat_add
+	{
+		static tquat<T, P> call(tquat<T, P> const& q, tquat<T, P> const& p)
+		{
+			return tquat<T, P>(q.w + p.w, q.x + p.x, q.y + p.y, q.z + p.z);
+		}
+	};
+
+	template <typename T, precision P, bool Aligned>
+	struct compute_quat_sub
+	{
+		static tquat<T, P> call(tquat<T, P> const& q, tquat<T, P> const& p)
+		{
+			return tquat<T, P>(q.w - p.w, q.x - p.x, q.y - p.y, q.z - p.z);
+		}
+	};
+
+	template <typename T, precision P, bool Aligned>
+	struct compute_quat_mul_scalar
+	{
+		static tquat<T, P> call(tquat<T, P> const& q, T s)
+		{
+			return tquat<T, P>(q.w * s, q.x * s, q.y * s, q.z * s);
+		}
+	};
+
+	template <typename T, precision P, bool Aligned>
+	struct compute_quat_div_scalar
+	{
+		static tquat<T, P> call(tquat<T, P> const& q, T s)
+		{
+			return tquat<T, P>(q.w / s, q.x / s, q.y / s, q.z / s);
+		}
+	};
+
+	template <typename T, precision P, bool Aligned>
+	struct compute_quat_mul_vec4
+	{
+		static tvec4<T, P> call(tquat<T, P> const & q, tvec4<T, P> const & v)
+		{
+			return tvec4<T, P>(q * tvec3<T, P>(v), v.w);
+		}
+	};
 }//namespace detail
 
 	// -- Component accesses --
@@ -198,13 +243,16 @@ namespace detail
 
 	template <typename T, precision P>
 	template <typename U>
-	GLM_FUNC_QUALIFIER tquat<T, P> & tquat<T, P>::operator+=(tquat<U, P> const & q)
+	GLM_FUNC_QUALIFIER tquat<T, P> & tquat<T, P>::operator+=(tquat<U, P> const& q)
 	{
-		this->w += static_cast<T>(q.w);
-		this->x += static_cast<T>(q.x);
-		this->y += static_cast<T>(q.y);
-		this->z += static_cast<T>(q.z);
-		return *this;
+		return (*this = detail::compute_quat_add<T, P, detail::is_aligned<P>::value>::call(*this, tquat<T, P>(q)));
+	}
+
+	template <typename T, precision P>
+	template <typename U>
+	GLM_FUNC_QUALIFIER tquat<T, P> & tquat<T, P>::operator-=(tquat<U, P> const& q)
+	{
+		return (*this = detail::compute_quat_sub<T, P, detail::is_aligned<P>::value>::call(*this, tquat<T, P>(q)));
 	}
 
 	template <typename T, precision P>
@@ -225,22 +273,14 @@ namespace detail
 	template <typename U>
 	GLM_FUNC_QUALIFIER tquat<T, P> & tquat<T, P>::operator*=(U s)
 	{
-		this->w *= static_cast<T>(s);
-		this->x *= static_cast<T>(s);
-		this->y *= static_cast<T>(s);
-		this->z *= static_cast<T>(s);
-		return *this;
+		return (*this = detail::compute_quat_mul_scalar<T, P, detail::is_aligned<P>::value>::call(*this, static_cast<T>(s)));
 	}
 
 	template <typename T, precision P>
 	template <typename U>
 	GLM_FUNC_QUALIFIER tquat<T, P> & tquat<T, P>::operator/=(U s)
 	{
-		this->w /= static_cast<T>(s);
-		this->x /= static_cast<T>(s);
-		this->y /= static_cast<T>(s);
-		this->z /= static_cast<T>(s);
-		return *this;
+		return (*this = detail::compute_quat_div_scalar<T, P, detail::is_aligned<P>::value>::call(*this, static_cast<T>(s)));
 	}
 
 	// -- Unary bit operators --
@@ -288,9 +328,9 @@ namespace detail
 	}
 
 	template <typename T, precision P>
-	GLM_FUNC_QUALIFIER tvec4<T, P> operator*(tquat<T, P> const & q, tvec4<T, P> const & v)
+	GLM_FUNC_QUALIFIER tvec4<T, P> operator*(tquat<T, P> const& q, tvec4<T, P> const& v)
 	{
-		return tvec4<T, P>(q * tvec3<T, P>(v), v.w);
+		return detail::compute_quat_mul_vec4<T, P, detail::is_aligned<P>::value>::call(q, v);
 	}
 
 	template <typename T, precision P>
@@ -738,3 +778,8 @@ namespace detail
 		return Result;
 	}
 }//namespace glm
+
+#if GLM_ARCH != GLM_ARCH_PURE && GLM_HAS_UNRESTRICTED_UNIONS
+#	include "quaternion_simd.inl"
+#endif
+
diff --git a/glm/gtc/quaternion_simd.inl b/glm/gtc/quaternion_simd.inl
new file mode 100644
index 00000000..cca874bb
--- /dev/null
+++ b/glm/gtc/quaternion_simd.inl
@@ -0,0 +1,198 @@
+/// @ref core
+/// @file glm/gtc/quaternion_simd.inl
+
+#if GLM_ARCH & GLM_ARCH_SSE2_BIT
+
+namespace glm{
+namespace detail
+{
+/*
+	template <precision P>
+	struct compute_quat_mul<float, P, true>
+	{
+		static tquat<float, P> call(tquat<float, P> const& q1, tquat<float, P> const& q2)
+		{
+			// SSE2 STATS: 11 shuffle, 8 mul, 8 add
+			// SSE4 STATS: 3 shuffle, 4 mul, 4 dpps
+
+			__m128 const mul0 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
+			__m128 const mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
+			__m128 const mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
+			__m128 const mul3 = _mm_mul_ps(q1.Data, q2.Data);
+
+#			if GLM_ARCH & GLM_ARCH_SSE41_BIT
+				__m128 const add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f), 0xff);
+				__m128 const add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f), 0xff);
+				__m128 const add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f), 0xff);
+				__m128 const add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
+#			else
+				__m128 const mul4 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
+				__m128 const add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul4, mul4));
+				__m128 const add4 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
+
+				__m128 const mul5 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
+				__m128 const add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul5, mul5));
+				__m128 const add5 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
+
+				__m128 const mul6 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
+				__m128 const add2 = _mm_add_ps(mul6, _mm_movehl_ps(mul6, mul6));
+				__m128 const add6 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
+
+				__m128 const mul7 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
+				__m128 const add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul7, mul7));
+				__m128 const add7 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+#			endif
+
+			// This SIMD code is a politically correct way of doing this, but in every test I've tried it has been slower than
+			// the final code below. I'll keep this here for reference - maybe somebody else can do something better...
+			//
+			//__m128 xxyy = _mm_shuffle_ps(add4, add5, _MM_SHUFFLE(0, 0, 0, 0));
+			//__m128 zzww = _mm_shuffle_ps(add6, add7, _MM_SHUFFLE(0, 0, 0, 0));
+			//
+			//return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0));
+
+			tquat<float, P> Result(uninitialize);
+			_mm_store_ss(&Result.x, add4);
+			_mm_store_ss(&Result.y, add5);
+			_mm_store_ss(&Result.z, add6);
+			_mm_store_ss(&Result.w, add7);
+			return Result;
+		}
+	};
+*/
+
+	template <precision P>
+	struct compute_dot<tquat, float, P, true>
+	{
+		static GLM_FUNC_QUALIFIER float call(tquat<float, P> const& x, tquat<float, P> const& y)
+		{
+			return _mm_cvtss_f32(glm_vec1_dot(x.data, y.data));
+		}
+	};
+
+	template <precision P>
+	struct compute_quat_add<float, P, true>
+	{
+		static tquat<float, P> call(tquat<float, P> const& q, tquat<float, P> const& p)
+		{
+			tquat<float, P> Result(uninitialize);
+			Result.data = _mm_add_ps(q.data, p.data);
+			return Result;
+		}
+	};
+
+#	if GLM_ARCH & GLM_ARCH_AVX_BIT
+	template <precision P>
+	struct compute_quat_add<double, P, true>
+	{
+		static tquat<double, P> call(tquat<double, P> const & a, tquat<double, P> const & b)
+		{
+			tquat<double, P> Result(uninitialize);
+			Result.data = _mm256_add_pd(a.data, b.data);
+			return Result;
+		}
+	};
+#	endif
+
+	template <precision P>
+	struct compute_quat_sub<float, P, true>
+	{
+		static tquat<float, P> call(tquat<float, P> const& q, tquat<float, P> const& p)
+		{
+			tquat<float, P> Result(uninitialize);
+			Result.data = _mm_sub_ps(q.data, p.data);
+			return Result;
+		}
+	};
+
+#	if GLM_ARCH & GLM_ARCH_AVX_BIT
+	template <precision P>
+	struct compute_quat_sub<double, P, true>
+	{
+		static tquat<double, P> call(tquat<double, P> const & a, tquat<double, P> const & b)
+		{
+			tquat<double, P> Result(uninitialize);
+			Result.data = _mm256_sub_pd(a.data, b.data);
+			return Result;
+		}
+	};
+#	endif
+
+	template <precision P>
+	struct compute_quat_mul_scalar<float, P, true>
+	{
+		static tquat<float, P> call(tquat<float, P> const& q, float s)
+		{
+			tquat<float, P> Result(uninitialize);
+			Result.data = _mm_mul_ps(q.data, _mm_set_ps1(s));
+			return Result;
+		}
+	};
+
+#	if GLM_ARCH & GLM_ARCH_AVX_BIT
+	template <precision P>
+	struct compute_quat_mul_scalar<double, P, true>
+	{
+		static tquat<double, P> call(tquat<double, P> const& q, double s)
+		{
+			tquat<double, P> Result(uninitialize);
+			Result.data = _mm256_mul_pd(q.data, _mm256_set1_pd(s));
+			return Result;
+		}
+	};
+#	endif
+
+	template <precision P>
+	struct compute_quat_div_scalar<float, P, true>
+	{
+		static tquat<float, P> call(tquat<float, P> const& q, float s)
+		{
+			tquat<float, P> Result(uninitialize);
+			Result.data = _mm_div_ps(q.data, _mm_set_ps1(s));
+			return Result;
+		}
+	};
+
+#	if GLM_ARCH & GLM_ARCH_AVX_BIT
+	template <precision P>
+	struct compute_quat_div_scalar<double, P, true>
+	{
+		static tquat<double, P> call(tquat<double, P> const& q, double s)
+		{
+			tquat<double, P> Result(uninitialize);
+			Result.data = _mm256_div_pd(q.data, _mm256_set1_pd(s));
+			return Result;
+		}
+	};
+#	endif
+
+	template <precision P>
+	struct compute_quat_mul_vec4<float, P, true>
+	{
+		static tvec4<float, P> call(tquat<float, P> const& q, tvec4<float, P> const& v)
+		{
+			__m128 const q_wwww = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 3, 3, 3));
+			__m128 const q_swp0 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 0, 2, 1));
+			__m128 const q_swp1 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 1, 0, 2));
+			__m128 const v_swp0 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 0, 2, 1));
+			__m128 const v_swp1 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 1, 0, 2));
+
+			__m128 uv = _mm_sub_ps(_mm_mul_ps(q_swp0, v_swp1), _mm_mul_ps(q_swp1, v_swp0));
+			__m128 uv_swp0 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 0, 2, 1));
+			__m128 uv_swp1 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 1, 0, 2));
+			__m128 uuv = _mm_sub_ps(_mm_mul_ps(q_swp0, uv_swp1), _mm_mul_ps(q_swp1, uv_swp0));
+
+			__m128 const two = _mm_set1_ps(2.0f);
+			uv = _mm_mul_ps(uv, _mm_mul_ps(q_wwww, two));
+			uuv = _mm_mul_ps(uuv, two);
+
+			tvec4<float, P> Result(uninitialize);
+			Result.data = _mm_add_ps(v.data, _mm_add_ps(uv, uuv));
+			return Result;
+		}
+	};
+}//namespace detail
+}//namespace glm
+
+#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
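
Note on the new compute_quat_mul_vec4 SSE2 path: it vectorizes the standard
cross-product form of quaternion rotation, v' = v + 2w * (u x v) + 2 * (u x (u x v))
with u = (q.x, q.y, q.z), which is the same expansion the generic
"return tvec4<T, P>(q * tvec3<T, P>(v), v.w);" fallback goes through. A minimal
scalar sketch of that identity for reference - plain standalone C++, where the
quat/vec3f types and the rotate() helper are illustrative only, not GLM types:

    struct quat  { float w, x, y, z; };
    struct vec3f { float x, y, z; };

    static vec3f cross(vec3f a, vec3f b)
    {
        return { a.y * b.z - a.z * b.y,
                 a.z * b.x - a.x * b.z,
                 a.x * b.y - a.y * b.x };
    }

    // Rotate v by the unit quaternion q: v' = v + 2w*(u x v) + 2*(u x (u x v)).
    static vec3f rotate(quat q, vec3f v)
    {
        vec3f const u   = { q.x, q.y, q.z };
        vec3f const uv  = cross(u, v);   // corresponds to 'uv' in the SSE code
        vec3f const uuv = cross(u, uv);  // corresponds to 'uuv' in the SSE code
        return { v.x + uv.x * 2.0f * q.w + uuv.x * 2.0f,
                 v.y + uv.y * 2.0f * q.w + uuv.y * 2.0f,
                 v.z + uv.z * 2.0f * q.w + uuv.z * 2.0f };
    }

In both the generic and the SSE2 path, the w component of the input tvec4 is
carried through unchanged: the cross products cancel to zero in that lane.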
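Note on how the SIMD paths get selected: the member operators now forward to
detail::compute_quat_add / compute_quat_sub / compute_quat_mul_scalar /
compute_quat_div_scalar / compute_quat_mul_vec4, parameterized on
<T, P, detail::is_aligned<P>::value>. quaternion.inl provides the generic
definitions, and quaternion_simd.inl only adds <float, P, true> (and, under AVX,
<double, P, true>) specializations, so the intrinsics are reached only for
aligned precisions, where detail::storage hands the union a SIMD register type
for the .data member. A stripped-down sketch of that dispatch pattern outside
GLM - the compute_add name and Aligned flag below are illustrative, not GLM's:

    #include <cstdio>

    // Primary template: generic scalar fallback.
    template <typename T, bool Aligned>
    struct compute_add
    {
        static T call(T a, T b) { std::puts("generic path"); return a + b; }
    };

    // Specialization picked up only when the Aligned flag is true.
    template <>
    struct compute_add<float, true>
    {
        static float call(float a, float b) { std::puts("specialized path"); return a + b; }
    };

    int main()
    {
        compute_add<float, false>::call(1.0f, 2.0f); // prints "generic path"
        compute_add<float, true>::call(1.0f, 2.0f);  // prints "specialized path"
        return 0;
    }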