From 7563a8bc4db0ffabceef4210a7b4deb10c17e2d0 Mon Sep 17 00:00:00 2001 From: Dave Reid Date: Tue, 23 Apr 2013 15:36:12 +1000 Subject: [PATCH] Add initial implementation of SIMD optimized quaternions. A few things here can probably be improved by people a lot smarter then me, but for the most part things are generally faster. A few notes: - A fquatSIMD can be converted to a fmat4x4SIMD using mat4SIMD_cast(). - A tquat can be converted to a fquatSIMD using quatSIMD_cast(). - Some functions are virtually the same as their scalar counterparts because I've just not been able to get them faster. - Only the basic functions are implemented. Future plans include fast, approximate normalize, length and mix/slerp functions. --- glm/gtx/simd_quat.hpp | 291 +++++++++++++++++++++++ glm/gtx/simd_quat.inl | 532 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 823 insertions(+) create mode 100644 glm/gtx/simd_quat.hpp create mode 100644 glm/gtx/simd_quat.inl diff --git a/glm/gtx/simd_quat.hpp b/glm/gtx/simd_quat.hpp new file mode 100644 index 00000000..962b9413 --- /dev/null +++ b/glm/gtx/simd_quat.hpp @@ -0,0 +1,291 @@ +/////////////////////////////////////////////////////////////////////////////////// +/// OpenGL Mathematics (glm.g-truc.net) +/// +/// Copyright (c) 2005 - 2013 G-Truc Creation (www.g-truc.net) +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in +/// all copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +/// THE SOFTWARE. +/// +/// @ref gtx_simd_quat +/// @file glm/gtx/simd_quat.hpp +/// @date 2009-05-07 / 2011-06-07 +/// @author Christophe Riccio +/// +/// @see core (dependence) +/// +/// @defgroup gtx_simd_vec4 GLM_GTX_simd_quat +/// @ingroup gtx +/// +/// @brief SIMD implementation of quat type. +/// +/// need to be included to use these functionalities. +/////////////////////////////////////////////////////////////////////////////////// + +#ifndef GLM_GTX_simd_quat +#define GLM_GTX_simd_quat GLM_VERSION + +// Dependency: +#include "../glm.hpp" +#include "../gtc/quaternion.hpp" + +#if(GLM_ARCH != GLM_ARCH_PURE) + +#if(GLM_ARCH & GLM_ARCH_SSE2) +# include "../core/intrinsic_common.hpp" +# include "../core/intrinsic_geometric.hpp" +# include "../gtx/simd_mat4.hpp" +#else +# error "GLM: GLM_GTX_simd_quat requires compiler support of SSE2 through intrinsics" +#endif + +#if(defined(GLM_MESSAGES) && !defined(glm_ext)) +# pragma message("GLM: GLM_GTX_simd_quat extension included") +#endif + + +// Warning silencer for nameless struct/union. +#if (GLM_COMPILER & GLM_COMPILER_VC) +# pragma warning(push) +# pragma warning(disable:4201) // warning C4201: nonstandard extension used : nameless struct/union +#endif + + +namespace glm{ +namespace detail +{ + /// Quaternion implemented using SIMD SEE intrinsics. + /// \ingroup gtx_simd_vec4 + GLM_ALIGNED_STRUCT(16) fquatSIMD + { + enum ctor{null}; + typedef __m128 value_type; + typedef std::size_t size_type; + static size_type value_size(); + + typedef fquatSIMD type; + typedef tquat bool_type; + +#ifdef GLM_SIMD_ENABLE_XYZW_UNION + union + { + __m128 Data; + struct {float x, y, z, w;}; + }; +#else + __m128 Data; +#endif + + ////////////////////////////////////// + // Implicit basic constructors + + fquatSIMD(); + fquatSIMD(__m128 const & Data); + fquatSIMD(fquatSIMD const & q); + + ////////////////////////////////////// + // Explicit basic constructors + + explicit fquatSIMD( + ctor); + explicit fquatSIMD( + float const & w, + float const & x, + float const & y, + float const & z); + explicit fquatSIMD( + quat const & v); + explicit fquatSIMD( + vec3 const & eulerAngles); + + + ////////////////////////////////////// + // Unary arithmetic operators + + fquatSIMD& operator =(fquatSIMD const & q); + fquatSIMD& operator*=(float const & s); + fquatSIMD& operator/=(float const & s); + }; + + + ////////////////////////////////////// + // Arithmetic operators + + detail::fquatSIMD operator- ( + detail::fquatSIMD const & q); + + detail::fquatSIMD operator+ ( + detail::fquatSIMD const & q, + detail::fquatSIMD const & p); + + detail::fquatSIMD operator* ( + detail::fquatSIMD const & q, + detail::fquatSIMD const & p); + + detail::fvec4SIMD operator* ( + detail::fquatSIMD const & q, + detail::fvec4SIMD const & v); + + detail::fvec4SIMD operator* ( + detail::fvec4SIMD const & v, + detail::fquatSIMD const & q); + + detail::fquatSIMD operator* ( + detail::fquatSIMD const & q, + float s); + + detail::fquatSIMD operator* ( + float s, + detail::fquatSIMD const & q); + + detail::fquatSIMD operator/ ( + detail::fquatSIMD const & q, + float s); + +}//namespace detail + + typedef glm::detail::fquatSIMD simdQuat; + + /// @addtogroup gtx_simd_quat + /// @{ + + //! Convert a simdQuat to a quat. + //! (From GLM_GTX_simd_quat extension) + quat quat_cast( + detail::fquatSIMD const & x); + + //! Convert a simdMat4 to a simdQuat. + //! (From GLM_GTX_simd_quat extension) + detail::fquatSIMD quatSIMD_cast( + detail::fmat4x4SIMD const & m); + + //! Convert a simdQuat to a simdMat4 + //! (From GLM_GTX_simd_quat extension) + detail::fmat4x4SIMD mat4SIMD_cast( + detail::fquatSIMD const & q); + + + /// Returns the length of the quaternion. + /// + /// @see gtc_quaternion + float length( + detail::fquatSIMD const & x); + + /// Returns the normalized quaternion. + /// + /// @see gtc_quaternion + detail::fquatSIMD normalize( + detail::fquatSIMD const & x); + + /// Returns dot product of q1 and q2, i.e., q1[0] * q2[0] + q1[1] * q2[1] + ... + /// + /// @see gtc_quaternion + float dot( + detail::fquatSIMD const & q1, + detail::fquatSIMD const & q2); + + /// Spherical linear interpolation of two quaternions. + /// The interpolation is oriented and the rotation is performed at constant speed. + /// For short path spherical linear interpolation, use the slerp function. + /// + /// @param x A quaternion + /// @param y A quaternion + /// @param a Interpolation factor. The interpolation is defined beyond the range [0, 1]. + /// @tparam T Value type used to build the quaternion. Supported: half, float or double. + /// @see gtc_quaternion + /// @see - slerp(detail::tquat const & x, detail::tquat const & y, T const & a) + detail::fquatSIMD mix( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a); + + /// Linear interpolation of two quaternions. + /// The interpolation is oriented. + /// + /// @param x A quaternion + /// @param y A quaternion + /// @param a Interpolation factor. The interpolation is defined in the range [0, 1]. + /// @tparam T Value type used to build the quaternion. Supported: half, float or double. + /// @see gtc_quaternion + detail::fquatSIMD lerp( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a); + + /// Spherical linear interpolation of two quaternions. + /// The interpolation always take the short path and the rotation is performed at constant speed. + /// + /// @param x A quaternion + /// @param y A quaternion + /// @param a Interpolation factor. The interpolation is defined beyond the range [0, 1]. + /// @tparam T Value type used to build the quaternion. Supported: half, float or double. + /// @see gtc_quaternion + detail::fquatSIMD slerp( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a); + + /// Returns the q conjugate. + /// + /// @see gtc_quaternion + detail::fquatSIMD conjugate( + detail::fquatSIMD const & q); + + /// Returns the q inverse. + /// + /// @see gtc_quaternion + detail::fquatSIMD inverse( + detail::fquatSIMD const & q); + + /// Build a quaternion from an angle and a normalized axis. + /// + /// @param angle Angle expressed in radians if GLM_FORCE_RADIANS is define or degrees otherwise. + /// @param axis Axis of the quaternion, must be normalized. + /// + /// @see gtc_quaternion + detail::fquatSIMD angleAxisSIMD( + float const & angle, + vec3 const & axis); + + /// Build a quaternion from an angle and a normalized axis. + /// + /// @param angle Angle expressed in radians if GLM_FORCE_RADIANS is define or degrees otherwise. + /// @param x x component of the x-axis, x, y, z must be a normalized axis + /// @param y y component of the y-axis, x, y, z must be a normalized axis + /// @param z z component of the z-axis, x, y, z must be a normalized axis + /// + /// @see gtc_quaternion + detail::fquatSIMD angleAxisSIMD( + float const & angle, + float const & x, + float const & y, + float const & z); + + + /// @} +}//namespace glm + +#include "simd_quat.inl" + + +#if (GLM_COMPILER & GLM_COMPILER_VC) +# pragma warning(pop) +#endif + + +#endif//(GLM_ARCH != GLM_ARCH_PURE) + +#endif//GLM_GTX_simd_quat diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl new file mode 100644 index 00000000..f9df7a94 --- /dev/null +++ b/glm/gtx/simd_quat.inl @@ -0,0 +1,532 @@ +/////////////////////////////////////////////////////////////////////////////////////////////////// +// OpenGL Mathematics Copyright (c) 2005 - 2013 G-Truc Creation (www.g-truc.net) +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Created : 2013-04-22 +// Updated : 2013-04-22 +// Licence : This source is under MIT License +// File : glm/gtx/simd_quat.inl +/////////////////////////////////////////////////////////////////////////////////////////////////// + + +namespace glm{ +namespace detail{ + + +////////////////////////////////////// +// Debugging +#if 0 +void print(__m128 v) +{ + GLM_ALIGN(16) float result[4]; + _mm_store_ps(result, v); + + printf("__m128: %f %f %f %f\n", result[0], result[1], result[2], result[3]); +} + +void print(const fvec4SIMD &v) +{ + printf("fvec4SIMD: %f %f %f %f\n", v.x, v.y, v.z, v.w); +} +#endif + + +////////////////////////////////////// +// Implicit basic constructors + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD() +#ifdef GLM_SIMD_ENABLE_DEFAULT_INIT + : Data(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f)) +#endif +{} + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(__m128 const & Data) : + Data(Data) +{} + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(fquatSIMD const & q) : + Data(q.Data) +{} + + +////////////////////////////////////// +// Explicit basic constructors + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(float const & w, float const & x, float const & y, float const & z) : + Data(_mm_set_ps(w, z, y, x)) +{} + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(quat const & q) : + Data(_mm_set_ps(q.w, q.z, q.y, q.x)) +{} + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(vec3 const & eulerAngles) +{ + vec3 c = glm::cos(eulerAngles * 0.5f); + vec3 s = glm::sin(eulerAngles * 0.5f); + + Data = _mm_set_ps( + (c.x * c.y * c.z) + (s.x * s.y * s.z), + (c.x * c.y * s.z) - (s.x * s.y * c.z), + (c.x * s.y * c.z) + (s.x * c.y * s.z), + (s.x * c.y * c.z) - (c.x * s.y * s.z)); +} + + +////////////////////////////////////// +// Unary arithmetic operators + +GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator=(fquatSIMD const & q) +{ + this->Data = q.Data; + return *this; +} + +GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator*=(float const & s) +{ + this->Data = _mm_mul_ps(this->Data, _mm_set_ps1(s)); + return *this; +} + +GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator/=(float const & s) +{ + this->Data = _mm_div_ps(Data, _mm_set1_ps(s)); + return *this; +} + + + +// negate operator +GLM_FUNC_QUALIFIER fquatSIMD operator- (fquatSIMD const & q) +{ + return fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(-1.0f, -1.0f, -1.0f, -1.0f))); +} + +// operator+ +GLM_FUNC_QUALIFIER fquatSIMD operator+ (fquatSIMD const & q1, fquatSIMD const & q2) +{ + return fquatSIMD(_mm_add_ps(q1.Data, q2.Data)); +} + +//operator* +GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const & q2) +{ + // SSE2 STATS: + // 11 shuffle + // 8 mul + // 8 add + + // SSE3 STATS: + // 3 shuffle + // 8 mul + // 8 add + + // SSE4 STATS: + // 3 shuffle + // 8 mul + // 4 dpps + + __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data); + __m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3))); + __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2))); + __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1))); + + mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f)); + mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f)); + mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f)); + mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f)); + + +# if((GLM_ARCH & GLM_ARCH_SSE4)) + __m128 add0 = _mm_dp_ps(mul0, _mm_set1_ps(1.0f), 0xff); + __m128 add1 = _mm_dp_ps(mul1, _mm_set1_ps(1.0f), 0xff); + __m128 add2 = _mm_dp_ps(mul2, _mm_set1_ps(1.0f), 0xff); + __m128 add3 = _mm_dp_ps(mul3, _mm_set1_ps(1.0f), 0xff); +# elif((GLM_ARCH & GLM_ARCH_SSE3)) + __m128 add0 = _mm_hadd_ps(mul0, mul0); + add0 = _mm_hadd_ps(add0, add0); + __m128 add1 = _mm_hadd_ps(mul1, mul1); + add1 = _mm_hadd_ps(add1, add1); + __m128 add2 = _mm_hadd_ps(mul2, mul2); + add2 = _mm_hadd_ps(add2, add2); + __m128 add3 = _mm_hadd_ps(mul3, mul3); + add3 = _mm_hadd_ps(add3, add3); +# else + __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0)); + add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1)); + __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1)); + add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1)); + __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2)); + add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1)); + __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3)); + add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1)); +#endif + + + + // I had tried something clever here using shuffles to produce the final result, but it turns out that using + // _mm_store_* is consistently quicker in my tests. I've kept the shuffling code below just in case. + + float w; + float x; + float y; + float z; + + _mm_store_ss(&w, add0); + _mm_store_ss(&x, add1); + _mm_store_ss(&y, add2); + _mm_store_ss(&z, add3); + + return detail::fquatSIMD(w, x, y, z); + + + //return _mm_shuffle_ps(_mm_shuffle_ps(add1, add2, 0), + // _mm_shuffle_ps(add3, add0, 0), + // _MM_SHUFFLE(2, 0, 2, 0)); +} + +GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v) +{ + __m128 uv = detail::sse_xpd_ps(q.Data, v.Data); + __m128 uuv = detail::sse_xpd_ps(q.Data, uv); + + uv = _mm_mul_ps(uv, _mm_mul_ps(_mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3)), _mm_set1_ps(2.0f))); + uuv = _mm_mul_ps(uuv, _mm_set1_ps(2.0f)); + + return _mm_add_ps(v.Data, _mm_add_ps(uv, uuv)); +} + +GLM_FUNC_QUALIFIER fvec4SIMD operator* (fvec4SIMD const & v, fquatSIMD const & q) +{ + return inverse(q) * v; +} + +GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q, float s) +{ + return fquatSIMD(_mm_mul_ps(q.Data, _mm_set1_ps(s))); +} + +GLM_FUNC_QUALIFIER fquatSIMD operator* (float s, fquatSIMD const & q) +{ + return fquatSIMD(_mm_mul_ps(_mm_set1_ps(s), q.Data)); +} + + +//operator/ +GLM_FUNC_QUALIFIER fquatSIMD operator/ (fquatSIMD const & q, float s) +{ + return fquatSIMD(_mm_div_ps(q.Data, _mm_set1_ps(s))); +} + + +}//namespace detail + + +GLM_FUNC_QUALIFIER quat quat_cast +( + detail::fquatSIMD const & x +) +{ + GLM_ALIGN(16) quat Result; + _mm_store_ps(&Result[0], x.Data); + return Result; +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast +( + detail::fmat4x4SIMD const & m +) +{ + // Scalar implementation for now. + + GLM_ALIGN(16) float m0[4]; + GLM_ALIGN(16) float m1[4]; + GLM_ALIGN(16) float m2[4]; + GLM_ALIGN(16) float m3[4]; + + _mm_store_ps(m0, m[0].Data); + _mm_store_ps(m1, m[1].Data); + _mm_store_ps(m2, m[2].Data); + _mm_store_ps(m3, m[3].Data); + + + float trace = m0[0] + m1[1] + m2[2] + 1.0f; + if (trace > 0) + { + float s = 0.5f / sqrt(trace); + + return _mm_set_ps( + 0.25f / s, + (m0[1] - m1[0]) * s, + (m2[0] - m0[2]) * s, + (m1[2] - m2[1]) * s); + } + else + { + if (m0[0] > m1[1]) + { + if (m0[0] > m2[2]) + { + // X is biggest. + float s = sqrt(m0[0] - m1[1] - m2[2] + 1.0f) * 0.5f; + + return _mm_set_ps( + (m1[2] - m2[1]) * s, + (m2[0] + m0[2]) * s, + (m0[1] + m1[0]) * s, + 0.5f * s); + } + } + else + { + if (m1[1] > m2[2]) + { + // Y is biggest. + float s = sqrt(m1[1] - m0[0] - m2[2] + 1.0f) * 0.5f; + + return _mm_set_ps( + (m2[0] - m0[2]) * s, + (m1[2] + m2[1]) * s, + 0.5f * s, + (m0[1] + m1[0]) * s); + } + } + + // Z is biggest. + float s = sqrt(m2[2] - m0[0] - m1[1] + 1.0f) * 0.5f; + + return _mm_set_ps( + (m0[1] - m1[0]) * s, + 0.5f * s, + (m1[2] + m2[1]) * s, + (m2[0] + m0[2]) * s); + } +} + + +GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast +( + detail::fquatSIMD const & q +) +{ + detail::fmat4x4SIMD result; + + __m128 _wwww = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 _xyzw = q.Data; + __m128 _zxyw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2)); + __m128 _yzxw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1)); + + __m128 _xyzw2 = _mm_add_ps(_xyzw, _xyzw); + __m128 _zxyw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 1, 0, 2)); + __m128 _yzxw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 0, 2, 1)); + + __m128 _tmp0 = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_yzxw2, _yzxw)); + _tmp0 = _mm_sub_ps(_tmp0, _mm_mul_ps(_zxyw2, _zxyw)); + + __m128 _tmp1 = _mm_mul_ps(_yzxw2, _xyzw); + _tmp1 = _mm_add_ps(_tmp1, _mm_mul_ps(_zxyw2, _wwww)); + + __m128 _tmp2 = _mm_mul_ps(_zxyw2, _xyzw); + _tmp2 = _mm_sub_ps(_tmp2, _mm_mul_ps(_yzxw2, _wwww)); + + + // There's probably a better, more politically correct way of doing this... + result[0].Data = _mm_set_ps( + 0.0f, + reinterpret_cast(&_tmp2)[0], + reinterpret_cast(&_tmp1)[0], + reinterpret_cast(&_tmp0)[0]); + + result[1].Data = _mm_set_ps( + 0.0f, + reinterpret_cast(&_tmp1)[1], + reinterpret_cast(&_tmp0)[1], + reinterpret_cast(&_tmp2)[1]); + + result[2].Data = _mm_set_ps( + 0.0f, + reinterpret_cast(&_tmp0)[2], + reinterpret_cast(&_tmp2)[2], + reinterpret_cast(&_tmp1)[2]); + + result[3].Data = _mm_set_ps( + 1.0f, + 0.0f, + 0.0f, + 0.0f); + + + return result; +} + + + +GLM_FUNC_QUALIFIER float length +( + detail::fquatSIMD const & q +) +{ + return glm::sqrt(dot(q, q)); +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD normalize +( + detail::fquatSIMD const & q +) +{ + return _mm_mul_ps(q.Data, _mm_set1_ps(1.0f / length(q))); +} + +GLM_FUNC_QUALIFIER float dot +( + detail::fquatSIMD const & q1, + detail::fquatSIMD const & q2 +) +{ + float result; + _mm_store_ss(&result, detail::sse_dot_ps(q1.Data, q2.Data)); + + return result; +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD mix +( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a +) +{ + float cosTheta = dot(x, y); + + if (cosTheta > 1.0f - glm::epsilon()) + { + return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); + } + else + { + float angle = glm::acos(cosTheta); + + + // Compared to the naive SIMD implementation below, this scalar version is consistently faster. A non-naive SSE-optimized implementation + // will most likely be faster, but that'll need to be left to people much smarter than I. + // + // The issue, I think, is loading the __m128 variables with initial data. Can probably be replaced with an SSE-optimized approximation of + // glm::sin(). Maybe a fastMix() function would be better for that? + + float s0 = glm::sin((1.0f - a) * angle); + float s1 = glm::sin(a * angle); + float d = 1.0f / glm::sin(angle); + + return (s0 * x + s1 * y) * d; + + + //__m128 s0 = _mm_set1_ps(glm::sin((1.0f - a) * angle)); + //__m128 s1 = _mm_set1_ps(glm::sin(a * angle)); + //__m128 d = _mm_set1_ps(1.0f / glm::sin(angle)); + // + //return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d); + } +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD lerp +( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a +) +{ + // Lerp is only defined in [0, 1] + assert(a >= 0.0f); + assert(a <= 1.0f); + + return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD slerp +( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a +) +{ + detail::fquatSIMD z = y; + + float cosTheta = dot(x, y); + + // If cosTheta < 0, the interpolation will take the long way around the sphere. + // To fix this, one quat must be negated. + if (cosTheta < 0.0f) + { + z = -y; + cosTheta = -cosTheta; + } + + // Perform a linear interpolation when cosTheta is close to 1 to avoid side effect of sin(angle) becoming a zero denominator + if(cosTheta > 1.0f - epsilon()) + { + return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); + } + else + { + float angle = glm::acos(cosTheta); + + + // See mix() above for an explanation to the rationale behind this non-SSE implementation. + + float s0 = glm::sin((1.0f - a) * angle); + float s1 = glm::sin(a * angle); + float d = 1.0f / glm::sin(angle); + + return (s0 * x + s1 * y) * d; + } +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate +( + detail::fquatSIMD const & q +) +{ + return detail::fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f))); +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD inverse +( + detail::fquatSIMD const & q +) +{ + return conjugate(q) / dot(q, q); +} + + +GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD +( + float const & angle, + vec3 const & v +) +{ +#ifdef GLM_FORCE_RADIANS + float a(angle); +#else + float a(glm::radians(angle)); +#endif + float s = glm::sin(a * 0.5f); + + return _mm_set_ps( + glm::cos(a * 0.5f), + v.z * s, + v.y * s, + v.x * s); +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD +( + float const & angle, + float const & x, + float const & y, + float const & z +) +{ + return angleAxisSIMD(angle, vec3(x, y, z)); +} + + +}//namespace glm