diff --git a/glm/gtx/simd_quat.hpp b/glm/gtx/simd_quat.hpp new file mode 100644 index 00000000..a3921cb3 --- /dev/null +++ b/glm/gtx/simd_quat.hpp @@ -0,0 +1,343 @@ +/////////////////////////////////////////////////////////////////////////////////// +/// OpenGL Mathematics (glm.g-truc.net) +/// +/// Copyright (c) 2005 - 2013 G-Truc Creation (www.g-truc.net) +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// +/// The above copyright notice and this permission notice shall be included in +/// all copies or substantial portions of the Software. +/// +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +/// THE SOFTWARE. +/// +/// @ref gtx_simd_quat +/// @file glm/gtx/simd_quat.hpp +/// @date 2009-05-07 / 2011-06-07 +/// @author Christophe Riccio +/// +/// @see core (dependence) +/// +/// @defgroup gtx_simd_vec4 GLM_GTX_simd_quat +/// @ingroup gtx +/// +/// @brief SIMD implementation of quat type. +/// +/// need to be included to use these functionalities. +/////////////////////////////////////////////////////////////////////////////////// + +#ifndef GLM_GTX_simd_quat +#define GLM_GTX_simd_quat GLM_VERSION + +// Dependency: +#include "../glm.hpp" +#include "../gtc/quaternion.hpp" +#include "../gtx/fast_trigonometry.hpp" + +#if(GLM_ARCH != GLM_ARCH_PURE) + +#if(GLM_ARCH & GLM_ARCH_SSE2) +# include "../core/intrinsic_common.hpp" +# include "../core/intrinsic_geometric.hpp" +# include "../gtx/simd_mat4.hpp" +#else +# error "GLM: GLM_GTX_simd_quat requires compiler support of SSE2 through intrinsics" +#endif + +#if(defined(GLM_MESSAGES) && !defined(glm_ext)) +# pragma message("GLM: GLM_GTX_simd_quat extension included") +#endif + + +// Warning silencer for nameless struct/union. +#if (GLM_COMPILER & GLM_COMPILER_VC) +# pragma warning(push) +# pragma warning(disable:4201) // warning C4201: nonstandard extension used : nameless struct/union +#endif + + +namespace glm{ +namespace detail +{ + /// Quaternion implemented using SIMD SEE intrinsics. + /// \ingroup gtx_simd_vec4 + GLM_ALIGNED_STRUCT(16) fquatSIMD + { + enum ctor{null}; + typedef __m128 value_type; + typedef std::size_t size_type; + static size_type value_size(); + + typedef fquatSIMD type; + typedef tquat bool_type; + +#ifdef GLM_SIMD_ENABLE_XYZW_UNION + union + { + __m128 Data; + struct {float x, y, z, w;}; + }; +#else + __m128 Data; +#endif + + ////////////////////////////////////// + // Implicit basic constructors + + fquatSIMD(); + fquatSIMD(__m128 const & Data); + fquatSIMD(fquatSIMD const & q); + + ////////////////////////////////////// + // Explicit basic constructors + + explicit fquatSIMD( + ctor); + explicit fquatSIMD( + float const & w, + float const & x, + float const & y, + float const & z); + explicit fquatSIMD( + quat const & v); + explicit fquatSIMD( + vec3 const & eulerAngles); + + + ////////////////////////////////////// + // Unary arithmetic operators + + fquatSIMD& operator =(fquatSIMD const & q); + fquatSIMD& operator*=(float const & s); + fquatSIMD& operator/=(float const & s); + }; + + + ////////////////////////////////////// + // Arithmetic operators + + detail::fquatSIMD operator- ( + detail::fquatSIMD const & q); + + detail::fquatSIMD operator+ ( + detail::fquatSIMD const & q, + detail::fquatSIMD const & p); + + detail::fquatSIMD operator* ( + detail::fquatSIMD const & q, + detail::fquatSIMD const & p); + + detail::fvec4SIMD operator* ( + detail::fquatSIMD const & q, + detail::fvec4SIMD const & v); + + detail::fvec4SIMD operator* ( + detail::fvec4SIMD const & v, + detail::fquatSIMD const & q); + + detail::fquatSIMD operator* ( + detail::fquatSIMD const & q, + float s); + + detail::fquatSIMD operator* ( + float s, + detail::fquatSIMD const & q); + + detail::fquatSIMD operator/ ( + detail::fquatSIMD const & q, + float s); + +}//namespace detail + + typedef glm::detail::fquatSIMD simdQuat; + + /// @addtogroup gtx_simd_quat + /// @{ + + //! Convert a simdQuat to a quat. + //! (From GLM_GTX_simd_quat extension) + quat quat_cast( + detail::fquatSIMD const & x); + + //! Convert a simdMat4 to a simdQuat. + //! (From GLM_GTX_simd_quat extension) + detail::fquatSIMD quatSIMD_cast( + detail::fmat4x4SIMD const & m); + + //! Converts a mat4 to a simdQuat. + //! (From GLM_GTX_simd_quat extension) + template + detail::fquatSIMD quatSIMD_cast( + detail::tmat4x4 const & m); + + //! Converts a mat3 to a simdQuat. + //! (From GLM_GTX_simd_quat extension) + template + detail::fquatSIMD quatSIMD_cast( + detail::tmat3x3 const & m); + + //! Convert a simdQuat to a simdMat4 + //! (From GLM_GTX_simd_quat extension) + detail::fmat4x4SIMD mat4SIMD_cast( + detail::fquatSIMD const & q); + + //! Converts a simdQuat to a standard mat4. + //! (From GLM_GTX_simd_quat extension) + mat4 mat4_cast( + detail::fquatSIMD const & q); + + + /// Returns the length of the quaternion. + /// + /// @see gtc_quaternion + float length( + detail::fquatSIMD const & x); + + /// Returns the normalized quaternion. + /// + /// @see gtc_quaternion + detail::fquatSIMD normalize( + detail::fquatSIMD const & x); + + /// Returns dot product of q1 and q2, i.e., q1[0] * q2[0] + q1[1] * q2[1] + ... + /// + /// @see gtc_quaternion + float dot( + detail::fquatSIMD const & q1, + detail::fquatSIMD const & q2); + + /// Spherical linear interpolation of two quaternions. + /// The interpolation is oriented and the rotation is performed at constant speed. + /// For short path spherical linear interpolation, use the slerp function. + /// + /// @param x A quaternion + /// @param y A quaternion + /// @param a Interpolation factor. The interpolation is defined beyond the range [0, 1]. + /// @tparam T Value type used to build the quaternion. Supported: half, float or double. + /// @see gtc_quaternion + /// @see - slerp(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a) + detail::fquatSIMD mix( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a); + + /// Linear interpolation of two quaternions. + /// The interpolation is oriented. + /// + /// @param x A quaternion + /// @param y A quaternion + /// @param a Interpolation factor. The interpolation is defined in the range [0, 1]. + /// @tparam T Value type used to build the quaternion. Supported: half, float or double. + /// @see gtc_quaternion + detail::fquatSIMD lerp( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a); + + /// Spherical linear interpolation of two quaternions. + /// The interpolation always take the short path and the rotation is performed at constant speed. + /// + /// @param x A quaternion + /// @param y A quaternion + /// @param a Interpolation factor. The interpolation is defined beyond the range [0, 1]. + /// @tparam T Value type used to build the quaternion. Supported: half, float or double. + /// @see gtc_quaternion + detail::fquatSIMD slerp( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a); + + + /// Faster spherical linear interpolation of two unit length quaternions. + /// + /// This is the same as mix(), except for two rules: + /// 1) The two quaternions must be unit length. + /// 2) The interpolation factor (a) must be in the range [0, 1]. + /// + /// This will use the equivalent to fastAcos() and fastSin(). + /// + /// @see gtc_quaternion + /// @see - mix(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a) + detail::fquatSIMD fastMix( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a); + + /// Identical to fastMix() except takes the shortest path. + /// + /// The same rules apply here as those in fastMix(). Both quaternions must be unit length and 'a' must be + /// in the range [0, 1]. + /// + /// @see - fastMix(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a) + /// @see - slerp(detail::fquatSIMD const & x, detail::fquatSIMD const & y, T const & a) + detail::fquatSIMD fastSlerp( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a); + + + /// Returns the q conjugate. + /// + /// @see gtc_quaternion + detail::fquatSIMD conjugate( + detail::fquatSIMD const & q); + + /// Returns the q inverse. + /// + /// @see gtc_quaternion + detail::fquatSIMD inverse( + detail::fquatSIMD const & q); + + /// Build a quaternion from an angle and a normalized axis. + /// + /// @param angle Angle expressed in radians if GLM_FORCE_RADIANS is define or degrees otherwise. + /// @param axis Axis of the quaternion, must be normalized. + /// + /// @see gtc_quaternion + detail::fquatSIMD angleAxisSIMD( + float const & angle, + vec3 const & axis); + + /// Build a quaternion from an angle and a normalized axis. + /// + /// @param angle Angle expressed in radians if GLM_FORCE_RADIANS is define or degrees otherwise. + /// @param x x component of the x-axis, x, y, z must be a normalized axis + /// @param y y component of the y-axis, x, y, z must be a normalized axis + /// @param z z component of the z-axis, x, y, z must be a normalized axis + /// + /// @see gtc_quaternion + detail::fquatSIMD angleAxisSIMD( + float const & angle, + float const & x, + float const & y, + float const & z); + + + // TODO: Move this to somewhere more appropriate. Used with fastMix() and fastSlerp(). + /// Performs the equivalent of glm::fastSin() on each component of the given __m128. + __m128 fastSin(__m128 x); + + + /// @} +}//namespace glm + +#include "simd_quat.inl" + + +#if (GLM_COMPILER & GLM_COMPILER_VC) +# pragma warning(pop) +#endif + + +#endif//(GLM_ARCH != GLM_ARCH_PURE) + +#endif//GLM_GTX_simd_quat diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl new file mode 100644 index 00000000..d96f48d4 --- /dev/null +++ b/glm/gtx/simd_quat.inl @@ -0,0 +1,628 @@ +/////////////////////////////////////////////////////////////////////////////////////////////////// +// OpenGL Mathematics Copyright (c) 2005 - 2013 G-Truc Creation (www.g-truc.net) +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Created : 2013-04-22 +// Updated : 2013-04-22 +// Licence : This source is under MIT License +// File : glm/gtx/simd_quat.inl +/////////////////////////////////////////////////////////////////////////////////////////////////// + + +namespace glm{ +namespace detail{ + + +////////////////////////////////////// +// Debugging +#if 0 +void print(__m128 v) +{ + GLM_ALIGN(16) float result[4]; + _mm_store_ps(result, v); + + printf("__m128: %f %f %f %f\n", result[0], result[1], result[2], result[3]); +} + +void print(const fvec4SIMD &v) +{ + printf("fvec4SIMD: %f %f %f %f\n", v.x, v.y, v.z, v.w); +} +#endif + + +////////////////////////////////////// +// Implicit basic constructors + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD() +#ifdef GLM_SIMD_ENABLE_DEFAULT_INIT + : Data(_mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f)) +#endif +{} + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(__m128 const & Data) : + Data(Data) +{} + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(fquatSIMD const & q) : + Data(q.Data) +{} + + +////////////////////////////////////// +// Explicit basic constructors + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(float const & w, float const & x, float const & y, float const & z) : + Data(_mm_set_ps(w, z, y, x)) +{} + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(quat const & q) : + Data(_mm_set_ps(q.w, q.z, q.y, q.x)) +{} + +GLM_FUNC_QUALIFIER fquatSIMD::fquatSIMD(vec3 const & eulerAngles) +{ + vec3 c = glm::cos(eulerAngles * 0.5f); + vec3 s = glm::sin(eulerAngles * 0.5f); + + Data = _mm_set_ps( + (c.x * c.y * c.z) + (s.x * s.y * s.z), + (c.x * c.y * s.z) - (s.x * s.y * c.z), + (c.x * s.y * c.z) + (s.x * c.y * s.z), + (s.x * c.y * c.z) - (c.x * s.y * s.z)); +} + + +////////////////////////////////////// +// Unary arithmetic operators + +GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator=(fquatSIMD const & q) +{ + this->Data = q.Data; + return *this; +} + +GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator*=(float const & s) +{ + this->Data = _mm_mul_ps(this->Data, _mm_set_ps1(s)); + return *this; +} + +GLM_FUNC_QUALIFIER fquatSIMD& fquatSIMD::operator/=(float const & s) +{ + this->Data = _mm_div_ps(Data, _mm_set1_ps(s)); + return *this; +} + + + +// negate operator +GLM_FUNC_QUALIFIER fquatSIMD operator- (fquatSIMD const & q) +{ + return fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(-1.0f, -1.0f, -1.0f, -1.0f))); +} + +// operator+ +GLM_FUNC_QUALIFIER fquatSIMD operator+ (fquatSIMD const & q1, fquatSIMD const & q2) +{ + return fquatSIMD(_mm_add_ps(q1.Data, q2.Data)); +} + +//operator* +GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const & q2) +{ + // SSE2 STATS: + // 11 shuffle + // 8 mul + // 8 add + + // SSE4 STATS: + // 3 shuffle + // 4 mul + // 4 dpps + + __m128 mul0 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3))); + __m128 mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2))); + __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1))); + __m128 mul3 = _mm_mul_ps(q1.Data, q2.Data); + +# if((GLM_ARCH & GLM_ARCH_SSE4)) + __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff); + __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff); + __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff); + __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff); +# else + mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f)); + __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0)); + add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1)); + + mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f)); + __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1)); + add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1)); + + mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f)); + __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2)); + add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1)); + + mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f)); + __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3)); + add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1)); +#endif + + + // This SIMD code is a politically correct way of doing this, but in every test I've tried it has been slower than + // the final code below. I'll keep this here for reference - maybe somebody else can do something better... + // + //__m128 xxyy = _mm_shuffle_ps(add0, add1, _MM_SHUFFLE(0, 0, 0, 0)); + //__m128 zzww = _mm_shuffle_ps(add2, add3, _MM_SHUFFLE(0, 0, 0, 0)); + // + //return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0)); + + float x; + float y; + float z; + float w; + + _mm_store_ss(&x, add0); + _mm_store_ss(&y, add1); + _mm_store_ss(&z, add2); + _mm_store_ss(&w, add3); + + return detail::fquatSIMD(w, x, y, z); +} + +GLM_FUNC_QUALIFIER fvec4SIMD operator* (fquatSIMD const & q, fvec4SIMD const & v) +{ + static const __m128 two = _mm_set1_ps(2.0f); + + __m128 q_wwww = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 q_swp0 = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1)); + __m128 q_swp1 = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2)); + __m128 v_swp0 = _mm_shuffle_ps(v.Data, v.Data, _MM_SHUFFLE(3, 0, 2, 1)); + __m128 v_swp1 = _mm_shuffle_ps(v.Data, v.Data, _MM_SHUFFLE(3, 1, 0, 2)); + + __m128 uv = _mm_sub_ps(_mm_mul_ps(q_swp0, v_swp1), _mm_mul_ps(q_swp1, v_swp0)); + __m128 uv_swp0 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 0, 2, 1)); + __m128 uv_swp1 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 1, 0, 2)); + __m128 uuv = _mm_sub_ps(_mm_mul_ps(q_swp0, uv_swp1), _mm_mul_ps(q_swp1, uv_swp0)); + + + uv = _mm_mul_ps(uv, _mm_mul_ps(q_wwww, two)); + uuv = _mm_mul_ps(uuv, two); + + return _mm_add_ps(v.Data, _mm_add_ps(uv, uuv)); +} + +GLM_FUNC_QUALIFIER fvec4SIMD operator* (fvec4SIMD const & v, fquatSIMD const & q) +{ + return inverse(q) * v; +} + +GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q, float s) +{ + return fquatSIMD(_mm_mul_ps(q.Data, _mm_set1_ps(s))); +} + +GLM_FUNC_QUALIFIER fquatSIMD operator* (float s, fquatSIMD const & q) +{ + return fquatSIMD(_mm_mul_ps(_mm_set1_ps(s), q.Data)); +} + + +//operator/ +GLM_FUNC_QUALIFIER fquatSIMD operator/ (fquatSIMD const & q, float s) +{ + return fquatSIMD(_mm_div_ps(q.Data, _mm_set1_ps(s))); +} + + +}//namespace detail + + +GLM_FUNC_QUALIFIER quat quat_cast +( + detail::fquatSIMD const & x +) +{ + GLM_ALIGN(16) quat Result; + _mm_store_ps(&Result[0], x.Data); + + return Result; +} + +template +GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast_impl(const T m0[], const T m1[], const T m2[]) +{ + T trace = m0[0] + m1[1] + m2[2] + T(1.0); + if (trace > T(0)) + { + T s = T(0.5) / sqrt(trace); + + return _mm_set_ps( + static_cast(T(0.25) / s), + static_cast((m0[1] - m1[0]) * s), + static_cast((m2[0] - m0[2]) * s), + static_cast((m1[2] - m2[1]) * s)); + } + else + { + if (m0[0] > m1[1]) + { + if (m0[0] > m2[2]) + { + // X is biggest. + T s = sqrt(m0[0] - m1[1] - m2[2] + T(1.0)) * T(0.5); + + return _mm_set_ps( + static_cast((m1[2] - m2[1]) * s), + static_cast((m2[0] + m0[2]) * s), + static_cast((m0[1] + m1[0]) * s), + static_cast(T(0.5) * s)); + } + } + else + { + if (m1[1] > m2[2]) + { + // Y is biggest. + T s = sqrt(m1[1] - m0[0] - m2[2] + T(1.0)) * T(0.5); + + return _mm_set_ps( + static_cast((m2[0] - m0[2]) * s), + static_cast((m1[2] + m2[1]) * s), + static_cast(T(0.5) * s), + static_cast((m0[1] + m1[0]) * s)); + } + } + + // Z is biggest. + T s = sqrt(m2[2] - m0[0] - m1[1] + T(1.0)) * T(0.5); + + return _mm_set_ps( + static_cast((m0[1] - m1[0]) * s), + static_cast(T(0.5) * s), + static_cast((m1[2] + m2[1]) * s), + static_cast((m2[0] + m0[2]) * s)); + } +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast +( + detail::fmat4x4SIMD const & m +) +{ + // Scalar implementation for now. + GLM_ALIGN(16) float m0[4]; + GLM_ALIGN(16) float m1[4]; + GLM_ALIGN(16) float m2[4]; + + _mm_store_ps(m0, m[0].Data); + _mm_store_ps(m1, m[1].Data); + _mm_store_ps(m2, m[2].Data); + + return quatSIMD_cast_impl(m0, m1, m2); +} + +template +GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast +( + detail::tmat4x4 const & m +) +{ + return quatSIMD_cast_impl(&m[0][0], &m[1][0], &m[2][0]); +} + +template +GLM_FUNC_QUALIFIER detail::fquatSIMD quatSIMD_cast +( + detail::tmat3x3 const & m +) +{ + return quatSIMD_cast_impl(&m[0][0], &m[1][0], &m[2][0]); +} + + +GLM_FUNC_QUALIFIER detail::fmat4x4SIMD mat4SIMD_cast +( + detail::fquatSIMD const & q +) +{ + detail::fmat4x4SIMD result; + + __m128 _wwww = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 _xyzw = q.Data; + __m128 _zxyw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 1, 0, 2)); + __m128 _yzxw = _mm_shuffle_ps(q.Data, q.Data, _MM_SHUFFLE(3, 0, 2, 1)); + + __m128 _xyzw2 = _mm_add_ps(_xyzw, _xyzw); + __m128 _zxyw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 1, 0, 2)); + __m128 _yzxw2 = _mm_shuffle_ps(_xyzw2, _xyzw2, _MM_SHUFFLE(3, 0, 2, 1)); + + __m128 _tmp0 = _mm_sub_ps(_mm_set1_ps(1.0f), _mm_mul_ps(_yzxw2, _yzxw)); + _tmp0 = _mm_sub_ps(_tmp0, _mm_mul_ps(_zxyw2, _zxyw)); + + __m128 _tmp1 = _mm_mul_ps(_yzxw2, _xyzw); + _tmp1 = _mm_add_ps(_tmp1, _mm_mul_ps(_zxyw2, _wwww)); + + __m128 _tmp2 = _mm_mul_ps(_zxyw2, _xyzw); + _tmp2 = _mm_sub_ps(_tmp2, _mm_mul_ps(_yzxw2, _wwww)); + + + // There's probably a better, more politically correct way of doing this... + result[0].Data = _mm_set_ps( + 0.0f, + reinterpret_cast(&_tmp2)[0], + reinterpret_cast(&_tmp1)[0], + reinterpret_cast(&_tmp0)[0]); + + result[1].Data = _mm_set_ps( + 0.0f, + reinterpret_cast(&_tmp1)[1], + reinterpret_cast(&_tmp0)[1], + reinterpret_cast(&_tmp2)[1]); + + result[2].Data = _mm_set_ps( + 0.0f, + reinterpret_cast(&_tmp0)[2], + reinterpret_cast(&_tmp2)[2], + reinterpret_cast(&_tmp1)[2]); + + result[3].Data = _mm_set_ps( + 1.0f, + 0.0f, + 0.0f, + 0.0f); + + + return result; +} + +GLM_FUNC_QUALIFIER mat4 mat4_cast +( + detail::fquatSIMD const & q +) +{ + return mat4_cast(mat4SIMD_cast(q)); +} + + + +GLM_FUNC_QUALIFIER float length +( + detail::fquatSIMD const & q +) +{ + return glm::sqrt(dot(q, q)); +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD normalize +( + detail::fquatSIMD const & q +) +{ + return _mm_mul_ps(q.Data, _mm_set1_ps(1.0f / length(q))); +} + +GLM_FUNC_QUALIFIER float dot +( + detail::fquatSIMD const & q1, + detail::fquatSIMD const & q2 +) +{ + float result; + _mm_store_ss(&result, detail::sse_dot_ps(q1.Data, q2.Data)); + + return result; +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD mix +( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a +) +{ + float cosTheta = dot(x, y); + + if (cosTheta > 1.0f - glm::epsilon()) + { + return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); + } + else + { + float angle = glm::acos(cosTheta); + + + float s0 = glm::sin((1.0f - a) * angle); + float s1 = glm::sin(a * angle); + float d = 1.0f / glm::sin(angle); + + return (s0 * x + s1 * y) * d; + } +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD lerp +( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a +) +{ + // Lerp is only defined in [0, 1] + assert(a >= 0.0f); + assert(a <= 1.0f); + + return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD slerp +( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a +) +{ + detail::fquatSIMD z = y; + + float cosTheta = dot(x, y); + + // If cosTheta < 0, the interpolation will take the long way around the sphere. + // To fix this, one quat must be negated. + if (cosTheta < 0.0f) + { + z = -y; + cosTheta = -cosTheta; + } + + // Perform a linear interpolation when cosTheta is close to 1 to avoid side effect of sin(angle) becoming a zero denominator + if(cosTheta > 1.0f - epsilon()) + { + return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); + } + else + { + float angle = glm::acos(cosTheta); + + + float s0 = glm::sin((1.0f - a) * angle); + float s1 = glm::sin(a * angle); + float d = 1.0f / glm::sin(angle); + + return (s0 * x + s1 * y) * d; + } +} + + +GLM_FUNC_QUALIFIER detail::fquatSIMD fastMix +( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a +) +{ + float cosTheta = dot(x, y); + + if (cosTheta > 1.0f - glm::epsilon()) + { + return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); + } + else + { + float angle = glm::fastAcos(cosTheta); + + + __m128 s = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f)); + + __m128 s0 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 s1 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 d = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1))); + + return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d); + } +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD fastSlerp +( + detail::fquatSIMD const & x, + detail::fquatSIMD const & y, + float const & a +) +{ + detail::fquatSIMD z = y; + + float cosTheta = dot(x, y); + if (cosTheta < 0.0f) + { + z = -y; + cosTheta = -cosTheta; + } + + + if(cosTheta > 1.0f - epsilon()) + { + return _mm_add_ps(x.Data, _mm_mul_ps(_mm_set1_ps(a), _mm_sub_ps(y.Data, x.Data))); + } + else + { + float angle = glm::fastAcos(cosTheta); + + + __m128 s = glm::fastSin(_mm_set_ps((1.0f - a) * angle, a * angle, angle, 0.0f)); + + __m128 s0 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(3, 3, 3, 3)); + __m128 s1 = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 2, 2, 2)); + __m128 d = _mm_div_ps(_mm_set1_ps(1.0f), _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 1, 1, 1))); + + return _mm_mul_ps(_mm_add_ps(_mm_mul_ps(s0, x.Data), _mm_mul_ps(s1, y.Data)), d); + } +} + + + +GLM_FUNC_QUALIFIER detail::fquatSIMD conjugate +( + detail::fquatSIMD const & q +) +{ + return detail::fquatSIMD(_mm_mul_ps(q.Data, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f))); +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD inverse +( + detail::fquatSIMD const & q +) +{ + return conjugate(q) / dot(q, q); +} + + +GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD +( + float const & angle, + vec3 const & v +) +{ +#ifdef GLM_FORCE_RADIANS + float a(angle); +#else + float a(glm::radians(angle)); +#endif + float s = glm::sin(a * 0.5f); + + return _mm_set_ps( + glm::cos(a * 0.5f), + v.z * s, + v.y * s, + v.x * s); +} + +GLM_FUNC_QUALIFIER detail::fquatSIMD angleAxisSIMD +( + float const & angle, + float const & x, + float const & y, + float const & z +) +{ + return angleAxisSIMD(angle, vec3(x, y, z)); +} + + +GLM_FUNC_QUALIFIER __m128 fastSin(__m128 x) +{ + static const __m128 c0 = _mm_set1_ps(0.16666666666666666666666666666667f); + static const __m128 c1 = _mm_set1_ps(0.00833333333333333333333333333333f); + static const __m128 c2 = _mm_set1_ps(0.00019841269841269841269841269841f); + + __m128 x3 = _mm_mul_ps(x, _mm_mul_ps(x, x)); + __m128 x5 = _mm_mul_ps(x3, _mm_mul_ps(x, x)); + __m128 x7 = _mm_mul_ps(x5, _mm_mul_ps(x, x)); + + __m128 y0 = _mm_mul_ps(x3, c0); + __m128 y1 = _mm_mul_ps(x5, c1); + __m128 y2 = _mm_mul_ps(x7, c2); + + return _mm_sub_ps(_mm_add_ps(_mm_sub_ps(x, y0), y1), y2); +} + + +}//namespace glm