mirror of
https://github.com/g-truc/glm.git
synced 2024-11-26 02:04:35 +00:00
Added SSE optimization 'platforms'
This commit is contained in:
parent
d7755485c8
commit
178f736fb0
@ -32,6 +32,8 @@ namespace detail{
|
|||||||
//roundEven
|
//roundEven
|
||||||
__m128 sse_rde_ps(__m128 v);
|
__m128 sse_rde_ps(__m128 v);
|
||||||
|
|
||||||
|
__m128 sse_rnd_ps(__m128 x);
|
||||||
|
|
||||||
__m128 sse_ceil_ps(__m128 v);
|
__m128 sse_ceil_ps(__m128 v);
|
||||||
|
|
||||||
__m128 sse_frc_ps(__m128 x);
|
__m128 sse_frc_ps(__m128 x);
|
||||||
|
@ -153,7 +153,7 @@ inline __m128 sse_sgn_ps(__m128 x)
|
|||||||
//floor
|
//floor
|
||||||
inline __m128 sse_flr_ps(__m128 x)
|
inline __m128 sse_flr_ps(__m128 x)
|
||||||
{
|
{
|
||||||
__m128 rnd0 = _mm_rnd_ps(x);
|
__m128 rnd0 = sse_rnd_ps(x);
|
||||||
__m128 cmp0 = _mm_cmplt_ps(x, rnd0);
|
__m128 cmp0 = _mm_cmplt_ps(x, rnd0);
|
||||||
__m128 and0 = _mm_and_ps(cmp0, glm::detail::_ps_1);
|
__m128 and0 = _mm_and_ps(cmp0, glm::detail::_ps_1);
|
||||||
__m128 sub0 = _mm_sub_ps(rnd0, and0);
|
__m128 sub0 = _mm_sub_ps(rnd0, and0);
|
||||||
@ -185,7 +185,7 @@ inline __m128 sse_rde_ps(__m128 v)
|
|||||||
|
|
||||||
inline __m128 sse_ceil_ps(__m128 x)
|
inline __m128 sse_ceil_ps(__m128 x)
|
||||||
{
|
{
|
||||||
__m128 rnd0 = _mm_rnd_ps(x);
|
__m128 rnd0 = sse_rnd_ps(x);
|
||||||
__m128 cmp0 = _mm_cmpgt_ps(x, rnd0);
|
__m128 cmp0 = _mm_cmpgt_ps(x, rnd0);
|
||||||
__m128 and0 = _mm_and_ps(cmp0, glm::detail::_ps_1);
|
__m128 and0 = _mm_and_ps(cmp0, glm::detail::_ps_1);
|
||||||
__m128 add0 = _mm_add_ps(rnd0, and0);
|
__m128 add0 = _mm_add_ps(rnd0, and0);
|
||||||
@ -194,7 +194,7 @@ inline __m128 sse_ceil_ps(__m128 x)
|
|||||||
|
|
||||||
inline __m128 sse_frc_ps(__m128 x)
|
inline __m128 sse_frc_ps(__m128 x)
|
||||||
{
|
{
|
||||||
__m128 flr0 = _mm_flr_ps(x);
|
__m128 flr0 = sse_flr_ps(x);
|
||||||
__m128 sub0 = _mm_sub_ps(x, flr0);
|
__m128 sub0 = _mm_sub_ps(x, flr0);
|
||||||
return sub0;
|
return sub0;
|
||||||
}
|
}
|
||||||
@ -202,7 +202,7 @@ inline __m128 sse_frc_ps(__m128 x)
|
|||||||
inline __m128 sse_mod_ps(__m128 x, __m128 y)
|
inline __m128 sse_mod_ps(__m128 x, __m128 y)
|
||||||
{
|
{
|
||||||
__m128 div0 = _mm_div_ps(x, y);
|
__m128 div0 = _mm_div_ps(x, y);
|
||||||
__m128 flr0 = _mm_flr_ps(div0);
|
__m128 flr0 = sse_flr_ps(div0);
|
||||||
__m128 mul0 = _mm_mul_ps(y, flr0);
|
__m128 mul0 = _mm_mul_ps(y, flr0);
|
||||||
__m128 sub0 = _mm_sub_ps(x, mul0);
|
__m128 sub0 = _mm_sub_ps(x, mul0);
|
||||||
return sub0;
|
return sub0;
|
||||||
@ -247,7 +247,7 @@ inline __m128 sse_ssp_ps(__m128 edge0, __m128 edge1, __m128 x)
|
|||||||
__m128 sub0 = _mm_sub_ps(x, edge0);
|
__m128 sub0 = _mm_sub_ps(x, edge0);
|
||||||
__m128 sub1 = _mm_sub_ps(edge1, edge0);
|
__m128 sub1 = _mm_sub_ps(edge1, edge0);
|
||||||
__m128 div0 = _mm_sub_ps(sub0, sub1);
|
__m128 div0 = _mm_sub_ps(sub0, sub1);
|
||||||
__m128 clp0 = _mm_clp_ps(div0, glm::detail::zero, glm::detail::one);
|
__m128 clp0 = sse_clp_ps(div0, glm::detail::zero, glm::detail::one);
|
||||||
__m128 mul0 = _mm_mul_ps(glm::detail::two, clp0);
|
__m128 mul0 = _mm_mul_ps(glm::detail::two, clp0);
|
||||||
__m128 sub2 = _mm_sub_ps(glm::detail::three, mul0);
|
__m128 sub2 = _mm_sub_ps(glm::detail::three, mul0);
|
||||||
__m128 mul1 = _mm_mul_ps(clp0, clp0);
|
__m128 mul1 = _mm_mul_ps(clp0, clp0);
|
||||||
|
@ -11,23 +11,23 @@ namespace glm{
|
|||||||
namespace detail{
|
namespace detail{
|
||||||
|
|
||||||
//length
|
//length
|
||||||
inline __m128 _mm_len_ps(__m128 x)
|
inline __m128 sse_len_ps(__m128 x)
|
||||||
{
|
{
|
||||||
__m128 dot0 = _mm_dot_ps(x, x);
|
__m128 dot0 = sse_dot_ps(x, x);
|
||||||
__m128 sqt0 = _mm_sqrt_ps(dot0);
|
__m128 sqt0 = _mm_sqrt_ps(dot0);
|
||||||
return sqt0;
|
return sqt0;
|
||||||
}
|
}
|
||||||
|
|
||||||
//distance
|
//distance
|
||||||
inline __m128 _mm_dst_ps(__m128 p0, __m128 p1)
|
inline __m128 sse_dst_ps(__m128 p0, __m128 p1)
|
||||||
{
|
{
|
||||||
__m128 sub0 = _mm_sub_ps(p0, p1);
|
__m128 sub0 = _mm_sub_ps(p0, p1);
|
||||||
__m128 len0 = _mm_len_ps(sub0);
|
__m128 len0 = sse_len_ps(sub0);
|
||||||
return len0;
|
return len0;
|
||||||
}
|
}
|
||||||
|
|
||||||
//dot
|
//dot
|
||||||
inline __m128 _mm_dot_ps(__m128 v1, __m128 v2)
|
inline __m128 sse_dot_ps(__m128 v1, __m128 v2)
|
||||||
{
|
{
|
||||||
__m128 mul0 = _mm_mul_ps(v1, v2);
|
__m128 mul0 = _mm_mul_ps(v1, v2);
|
||||||
__m128 swp0 = _mm_shuffle_ps(mul0, mul0, _MM_SHUFFLE(2, 3, 0, 1));
|
__m128 swp0 = _mm_shuffle_ps(mul0, mul0, _MM_SHUFFLE(2, 3, 0, 1));
|
||||||
@ -38,7 +38,7 @@ inline __m128 _mm_dot_ps(__m128 v1, __m128 v2)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// SSE1
|
// SSE1
|
||||||
inline __m128 _mm_dot_ss(__m128 v1, __m128 v2)
|
inline __m128 sse_dot_ss(__m128 v1, __m128 v2)
|
||||||
{
|
{
|
||||||
__m128 mul0 = _mm_mul_ps(v1, v2);
|
__m128 mul0 = _mm_mul_ps(v1, v2);
|
||||||
__m128 mov0 = _mm_movehl_ps(mul0, mul0);
|
__m128 mov0 = _mm_movehl_ps(mul0, mul0);
|
||||||
@ -49,7 +49,7 @@ inline __m128 _mm_dot_ss(__m128 v1, __m128 v2)
|
|||||||
}
|
}
|
||||||
|
|
||||||
//cross
|
//cross
|
||||||
inline __m128 _mm_xpd_ps(__m128 v1, __m128 v2)
|
inline __m128 sse_xpd_ps(__m128 v1, __m128 v2)
|
||||||
{
|
{
|
||||||
__m128 swp0 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1));
|
__m128 swp0 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 0, 2, 1));
|
||||||
__m128 swp1 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 1, 0, 2));
|
__m128 swp1 = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3, 1, 0, 2));
|
||||||
@ -64,7 +64,7 @@ inline __m128 _mm_xpd_ps(__m128 v1, __m128 v2)
|
|||||||
//normalize
|
//normalize
|
||||||
inline __m128 _mm_nrm_ps(__m128 v)
|
inline __m128 _mm_nrm_ps(__m128 v)
|
||||||
{
|
{
|
||||||
__m128 dot0 = _mm_dot_ps(v, v);
|
__m128 dot0 = sse_dot_ps(v, v);
|
||||||
__m128 isr0 = _mm_rsqrt_ps(dot0);
|
__m128 isr0 = _mm_rsqrt_ps(dot0);
|
||||||
__m128 mul0 = _mm_mul_ps(v, isr0);
|
__m128 mul0 = _mm_mul_ps(v, isr0);
|
||||||
return mul0;
|
return mul0;
|
||||||
@ -79,8 +79,8 @@ inline __m128 _mm_ffd_ps(__m128 N, __m128 I, __m128 Nref)
|
|||||||
//__m128 mix0 = _mm_mix_ps(N, neg0, sgn0);
|
//__m128 mix0 = _mm_mix_ps(N, neg0, sgn0);
|
||||||
//return mix0;
|
//return mix0;
|
||||||
|
|
||||||
__m128 dot0 = _mm_dot_ps(Nref, I);
|
__m128 dot0 = sse_dot_ps(Nref, I);
|
||||||
__m128 sgn0 = _mm_sgn_ps(dot0);
|
__m128 sgn0 = sse_sgn_ps(dot0);
|
||||||
__m128 mul0 = _mm_mul_ps(sgn0, glm::detail::minus_one);
|
__m128 mul0 = _mm_mul_ps(sgn0, glm::detail::minus_one);
|
||||||
__m128 mul1 = _mm_mul_ps(N, mul0);
|
__m128 mul1 = _mm_mul_ps(N, mul0);
|
||||||
return mul1;
|
return mul1;
|
||||||
@ -89,7 +89,7 @@ inline __m128 _mm_ffd_ps(__m128 N, __m128 I, __m128 Nref)
|
|||||||
//reflect
|
//reflect
|
||||||
inline __m128 _mm_rfe_ps(__m128 I, __m128 N)
|
inline __m128 _mm_rfe_ps(__m128 I, __m128 N)
|
||||||
{
|
{
|
||||||
__m128 dot0 = _mm_dot_ps(N, I);
|
__m128 dot0 = sse_dot_ps(N, I);
|
||||||
__m128 mul0 = _mm_mul_ps(N, dot0);
|
__m128 mul0 = _mm_mul_ps(N, dot0);
|
||||||
__m128 mul1 = _mm_mul_ps(mul0, glm::detail::two);
|
__m128 mul1 = _mm_mul_ps(mul0, glm::detail::two);
|
||||||
__m128 sub0 = _mm_sub_ps(I, mul1);
|
__m128 sub0 = _mm_sub_ps(I, mul1);
|
||||||
@ -99,7 +99,7 @@ inline __m128 _mm_rfe_ps(__m128 I, __m128 N)
|
|||||||
//refract
|
//refract
|
||||||
inline __m128 _mm_rfa_ps(__m128 I, __m128 N, __m128 eta)
|
inline __m128 _mm_rfa_ps(__m128 I, __m128 N, __m128 eta)
|
||||||
{
|
{
|
||||||
__m128 dot0 = _mm_dot_ps(N, I);
|
__m128 dot0 = sse_dot_ps(N, I);
|
||||||
__m128 mul0 = _mm_mul_ps(eta, eta);
|
__m128 mul0 = _mm_mul_ps(eta, eta);
|
||||||
__m128 mul1 = _mm_mul_ps(dot0, dot0);
|
__m128 mul1 = _mm_mul_ps(dot0, dot0);
|
||||||
__m128 sub0 = _mm_sub_ps(glm::detail::one, mul0);
|
__m128 sub0 = _mm_sub_ps(glm::detail::one, mul0);
|
||||||
|
@ -31,6 +31,10 @@ namespace detail
|
|||||||
|
|
||||||
void sse_rotate_ps(__m128 const in[4], float Angle, float const v[3], __m128 out[4]);
|
void sse_rotate_ps(__m128 const in[4], float Angle, float const v[3], __m128 out[4]);
|
||||||
|
|
||||||
|
__m128 sse_det_ps(__m128 const m[4]);
|
||||||
|
|
||||||
|
__m128 sse_slow_det_ps(__m128 const m[4]);
|
||||||
|
|
||||||
}//namespace detail
|
}//namespace detail
|
||||||
}//namespace glm
|
}//namespace glm
|
||||||
|
|
||||||
|
@ -10,13 +10,11 @@
|
|||||||
namespace glm{
|
namespace glm{
|
||||||
namespace detail{
|
namespace detail{
|
||||||
|
|
||||||
static const __m128 one = _mm_set_ps1(1.0f);
|
|
||||||
static const __m128 pi = _mm_set_ps1(3.141592653589793238462643383279f);
|
|
||||||
static const __m128 _m128_rad_ps = _mm_set_ps1(3.141592653589793238462643383279f / 180.f);
|
static const __m128 _m128_rad_ps = _mm_set_ps1(3.141592653589793238462643383279f / 180.f);
|
||||||
static const __m128 _m128_deg_ps = _mm_set_ps1(180.f / 3.141592653589793238462643383279f);
|
static const __m128 _m128_deg_ps = _mm_set_ps1(180.f / 3.141592653589793238462643383279f);
|
||||||
|
|
||||||
template <typename matType>
|
template <typename matType>
|
||||||
inline matType _mm_comp_mul_ps
|
inline matType sse_comp_mul_ps
|
||||||
(
|
(
|
||||||
__m128 const in1[4],
|
__m128 const in1[4],
|
||||||
__m128 const in2[4],
|
__m128 const in2[4],
|
||||||
@ -29,7 +27,7 @@ inline matType _mm_comp_mul_ps
|
|||||||
out[3] = _mm_mul_ps(in1[3], in2[3]);
|
out[3] = _mm_mul_ps(in1[3], in2[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void _mm_add_ps(__m128 in1[4], __m128 in2[4], __m128 out[4])
|
inline void sse_add_ps(__m128 in1[4], __m128 in2[4], __m128 out[4])
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
out[0] = _mm_add_ps(in1[0], in2[0]);
|
out[0] = _mm_add_ps(in1[0], in2[0]);
|
||||||
@ -39,7 +37,7 @@ inline void _mm_add_ps(__m128 in1[4], __m128 in2[4], __m128 out[4])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void _mm_sub_ps(__m128 in1[4], __m128 in2[4], __m128 out[4])
|
inline void sse_sub_ps(__m128 in1[4], __m128 in2[4], __m128 out[4])
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
out[0] = _mm_sub_ps(in1[0], in2[0]);
|
out[0] = _mm_sub_ps(in1[0], in2[0]);
|
||||||
@ -49,7 +47,7 @@ inline void _mm_sub_ps(__m128 in1[4], __m128 in2[4], __m128 out[4])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline __m128 _mm_mul_ps(__m128 m[4], __m128 v)
|
inline __m128 sse_mul_ps(__m128 m[4], __m128 v)
|
||||||
{
|
{
|
||||||
__m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
|
__m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
|
||||||
__m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
|
__m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
|
||||||
@ -68,7 +66,7 @@ inline __m128 _mm_mul_ps(__m128 m[4], __m128 v)
|
|||||||
return a2;
|
return a2;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline __m128 _mm_mul_ps(__m128 v, __m128 m[4])
|
inline __m128 sse_mul_ps(__m128 v, __m128 m[4])
|
||||||
{
|
{
|
||||||
__m128 i0 = m[0];
|
__m128 i0 = m[0];
|
||||||
__m128 i1 = m[1];
|
__m128 i1 = m[1];
|
||||||
@ -95,7 +93,7 @@ inline __m128 _mm_mul_ps(__m128 v, __m128 m[4])
|
|||||||
return f2;
|
return f2;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void _mm_mul_ps(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
|
inline void sse_mul_ps(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
__m128 e0 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(0, 0, 0, 0));
|
__m128 e0 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(0, 0, 0, 0));
|
||||||
@ -171,7 +169,7 @@ inline void _mm_mul_ps(__m128 const in1[4], __m128 const in2[4], __m128 out[4])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void _mm_transpose_ps(__m128 const in[4], __m128 out[4])
|
inline void sse_transpose_ps(__m128 const in[4], __m128 out[4])
|
||||||
{
|
{
|
||||||
__m128 tmp0 = _mm_shuffle_ps(in[0], in[1], 0x44);
|
__m128 tmp0 = _mm_shuffle_ps(in[0], in[1], 0x44);
|
||||||
__m128 tmp2 = _mm_shuffle_ps(in[0], in[1], 0xEE);
|
__m128 tmp2 = _mm_shuffle_ps(in[0], in[1], 0xEE);
|
||||||
@ -184,7 +182,7 @@ inline void _mm_transpose_ps(__m128 const in[4], __m128 out[4])
|
|||||||
out[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
|
out[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline __m128 _mm_det_ps(__m128 const in[4])
|
inline __m128 sse_slow_det_ps(__m128 const in[4])
|
||||||
{
|
{
|
||||||
__m128 Fac0;
|
__m128 Fac0;
|
||||||
{
|
{
|
||||||
@ -406,18 +404,15 @@ inline __m128 _mm_det_ps(__m128 const in[4])
|
|||||||
// + m[0][1] * Inverse[1][0]
|
// + m[0][1] * Inverse[1][0]
|
||||||
// + m[0][2] * Inverse[2][0]
|
// + m[0][2] * Inverse[2][0]
|
||||||
// + m[0][3] * Inverse[3][0];
|
// + m[0][3] * Inverse[3][0];
|
||||||
__m128 Det0 = _mm_dot_ps(in[0], Row2);
|
__m128 Det0 = sse_dot_ps(in[0], Row2);
|
||||||
return Det0;
|
return Det0;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
inline __m128 sse_det_ps
|
||||||
inline typename detail::tmat4x4<T>::value_type _mm_det2_ps
|
|
||||||
(
|
(
|
||||||
__m128 const & m[4]
|
__m128 const m[4]
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
GLM_STATIC_ASSERT(detail::type<T>::is_float, "'determinant' only accept floating-point inputs");
|
|
||||||
|
|
||||||
//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
|
//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
|
||||||
//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
|
//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
|
||||||
//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
|
//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
|
||||||
@ -428,20 +423,20 @@ inline typename detail::tmat4x4<T>::value_type _mm_det2_ps
|
|||||||
// First 2 columns
|
// First 2 columns
|
||||||
__m128 Swp2A = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 1, 1, 2));
|
__m128 Swp2A = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 1, 1, 2));
|
||||||
__m128 Swp3A = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(3, 2, 3, 3));
|
__m128 Swp3A = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(3, 2, 3, 3));
|
||||||
__m128 MulA = __mm_mul_ps(Swp2A, Swp3A);
|
__m128 MulA = _mm_mul_ps(Swp2A, Swp3A);
|
||||||
|
|
||||||
// Second 2 columns
|
// Second 2 columns
|
||||||
__m128 Swp2B = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 1, 1, 2));
|
__m128 Swp2B = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 1, 1, 2));
|
||||||
__m128 Swp3B = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(3, 2, 3, 3));
|
__m128 Swp3B = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(3, 2, 3, 3));
|
||||||
__m128 MulB = __mm_mul_ps(Swp2A, Swp3A);
|
__m128 MulB = _mm_mul_ps(Swp2A, Swp3A);
|
||||||
|
|
||||||
// Columns subtraction
|
// Columns subtraction
|
||||||
__m128 SubAB = __mm_sub_ps(MulA, MulB);
|
__m128 SubAB = _mm_sub_ps(MulA, MulB);
|
||||||
|
|
||||||
// Last 2 rows
|
// Last 2 rows
|
||||||
__m128 Swp2C = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(1, 2, 0, 0));
|
__m128 Swp2C = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(1, 2, 0, 0));
|
||||||
__m128 Swp3C = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(0, 0, 1, 2));
|
__m128 Swp3C = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(0, 0, 1, 2));
|
||||||
__m128 MulC = __mm_mul_ps(Swp2C, Swp3C);
|
__m128 MulC = _mm_mul_ps(Swp2C, Swp3C);
|
||||||
__m128 SwpD = __mm_hl_ps(MulC);
|
__m128 SwpD = __mm_hl_ps(MulC);
|
||||||
|
|
||||||
//detail::tvec4<T> DetCof(
|
//detail::tvec4<T> DetCof(
|
||||||
@ -472,10 +467,10 @@ inline typename detail::tmat4x4<T>::value_type _mm_det2_ps
|
|||||||
// + m[0][2] * DetCof[2]
|
// + m[0][2] * DetCof[2]
|
||||||
// + m[0][3] * DetCof[3];
|
// + m[0][3] * DetCof[3];
|
||||||
|
|
||||||
return _mm_dot_ps(m[0], Signed);
|
return sse_dot_ps(m[0], Signed);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void _mm_inverse_ps(__m128 const in[4], __m128 out[4])
|
inline void sse_inverse_ps(__m128 const in[4], __m128 out[4])
|
||||||
{
|
{
|
||||||
__m128 Fac0;
|
__m128 Fac0;
|
||||||
{
|
{
|
||||||
@ -708,7 +703,7 @@ inline void _mm_inverse_ps(__m128 const in[4], __m128 out[4])
|
|||||||
out[3] = _mm_mul_ps(Inv3, Rcp0);
|
out[3] = _mm_mul_ps(Inv3, Rcp0);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void _mm_inverse_fast_ps(__m128 const in[4], __m128 out[4])
|
inline void sse_inverse_fast_ps(__m128 const in[4], __m128 out[4])
|
||||||
{
|
{
|
||||||
__m128 Fac0;
|
__m128 Fac0;
|
||||||
{
|
{
|
||||||
@ -930,7 +925,7 @@ inline void _mm_inverse_fast_ps(__m128 const in[4], __m128 out[4])
|
|||||||
// + m[0][1] * Inverse[1][0]
|
// + m[0][1] * Inverse[1][0]
|
||||||
// + m[0][2] * Inverse[2][0]
|
// + m[0][2] * Inverse[2][0]
|
||||||
// + m[0][3] * Inverse[3][0];
|
// + m[0][3] * Inverse[3][0];
|
||||||
__m128 Det0 = _mm_dot_ps(in[0], Row2);
|
__m128 Det0 = sse_dot_ps(in[0], Row2);
|
||||||
__m128 Rcp0 = _mm_rcp_ps(Det0);
|
__m128 Rcp0 = _mm_rcp_ps(Det0);
|
||||||
//__m128 Rcp0 = _mm_div_ps(one, Det0);
|
//__m128 Rcp0 = _mm_div_ps(one, Det0);
|
||||||
// Inverse /= Determinant;
|
// Inverse /= Determinant;
|
||||||
@ -941,7 +936,7 @@ inline void _mm_inverse_fast_ps(__m128 const in[4], __m128 out[4])
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void _mm_rotate_ps(__m128 const in[4], float Angle, float const v[3], __m128 out[4])
|
void sse_rotate_ps(__m128 const in[4], float Angle, float const v[3], __m128 out[4])
|
||||||
{
|
{
|
||||||
float a = glm::radians(Angle);
|
float a = glm::radians(Angle);
|
||||||
float c = cos(a);
|
float c = cos(a);
|
||||||
@ -1008,10 +1003,10 @@ void _mm_rotate_ps(__m128 const in[4], float Angle, float const v[3], __m128 out
|
|||||||
//Result[2] = m[0] * Rotate[2][0] + m[1] * Rotate[2][1] + m[2] * Rotate[2][2];
|
//Result[2] = m[0] * Rotate[2][0] + m[1] * Rotate[2][1] + m[2] * Rotate[2][2];
|
||||||
//Result[3] = m[3];
|
//Result[3] = m[3];
|
||||||
//return Result;
|
//return Result;
|
||||||
_mm_mul_ps(in, Result, out);
|
sse_mul_ps(in, Result, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
void _mm_outer_ps(__m128 const & c, __m128 const & r, __m128 out[4])
|
void sse_outer_ps(__m128 const & c, __m128 const & r, __m128 out[4])
|
||||||
{
|
{
|
||||||
out[0] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 0, 0, 0));
|
out[0] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 0, 0, 0));
|
||||||
out[1] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 1, 1, 1));
|
out[1] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 1, 1, 1));
|
||||||
|
@ -234,7 +234,7 @@ namespace simd_mat4
|
|||||||
inline detail::fmat4x4SIMD simd_transpose(detail::fmat4x4SIMD const & m)
|
inline detail::fmat4x4SIMD simd_transpose(detail::fmat4x4SIMD const & m)
|
||||||
{
|
{
|
||||||
detail::fmat4x4SIMD result;
|
detail::fmat4x4SIMD result;
|
||||||
_mm_transpose_ps(&m[0].Data, &result[0].Data);
|
sse_transpose_ps(&m[0].Data, &result[0].Data);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -246,7 +246,7 @@ namespace simd_mat4
|
|||||||
inline detail::fmat4x4SIMD simd_inverse(detail::fmat4x4SIMD const & m)
|
inline detail::fmat4x4SIMD simd_inverse(detail::fmat4x4SIMD const & m)
|
||||||
{
|
{
|
||||||
detail::fmat4x4SIMD result;
|
detail::fmat4x4SIMD result;
|
||||||
_mm_inverse_ps(&m[0].Data, &result[0].Data);
|
sse_inverse_ps(&m[0].Data, &result[0].Data);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
}//namespace simd_mat4
|
}//namespace simd_mat4
|
||||||
|
102
glm/setup.hpp
102
glm/setup.hpp
@ -108,7 +108,7 @@
|
|||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
|
|
||||||
#if defined(_WIN64)
|
#if defined(_M_X64)
|
||||||
#define GLM_MODEL GLM_MODEL_64
|
#define GLM_MODEL GLM_MODEL_64
|
||||||
#else
|
#else
|
||||||
#define GLM_MODEL GLM_MODEL_32
|
#define GLM_MODEL GLM_MODEL_32
|
||||||
@ -222,42 +222,80 @@
|
|||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Compiler instruction set
|
// Compiler instruction set
|
||||||
|
|
||||||
#define GLM_INSTRUCTION_SET_NULL 0x00000000 //
|
//#define GLM_INSTRUCTION_SET_NULL 0x00000000 //
|
||||||
#define GLM_INSTRUCTION_SET_PURE 0x00000001 // x86intrin.h
|
#define GLM_INSTRUCTION_SET_PURE 0x00000001 // x86intrin.h
|
||||||
#define GLM_INSTRUCTION_SET_MMX 0x00000002 // mmintrin.h (MMX)
|
//#define GLM_INSTRUCTION_SET_MMX 0x00000002 // mmintrin.h (MMX)
|
||||||
#define GLM_INSTRUCTION_SET_3DNOW 0x00000004 // mm3dnow.h (3DNOW!)
|
//#define GLM_INSTRUCTION_SET_3DNOW 0x00000004 // mm3dnow.h (3DNOW!)
|
||||||
#define GLM_INSTRUCTION_SET_SSE 0x00000008 // xmmintrin.h (SSE + MMX)
|
//#define GLM_INSTRUCTION_SET_SSE 0x00000008 // xmmintrin.h (SSE + MMX)
|
||||||
#define GLM_INSTRUCTION_SET_SSE2 0x00000010 // emmintrin.h (SSE2 + SSE)
|
#define GLM_INSTRUCTION_SET_SSE2 0x00000010 // emmintrin.h (SSE2 + SSE)
|
||||||
#define GLM_INSTRUCTION_SET_SSE3 0x00000020 // pmmintrin.h (SSE3 + SSE2 + SSE1)
|
#define GLM_INSTRUCTION_SET_SSE3 0x00000020 | GLM_INSTRUCTION_SET_SSE2 // pmmintrin.h (SSE3 + SSE2 + SSE1)
|
||||||
#define GLM_INSTRUCTION_SET_SSSE3 0x00000040 // tmmintrin.h (SSSE3 + SSE3 + SSE2 + SSE1)
|
//#define GLM_INSTRUCTION_SET_SSSE3 0x00000040 // tmmintrin.h (SSSE3 + SSE3 + SSE2 + SSE1)
|
||||||
#define GLM_INSTRUCTION_SET_POPCNT 0x00000080 // popcntintrin.h
|
//#define GLM_INSTRUCTION_SET_POPCNT 0x00000080 // popcntintrin.h
|
||||||
#define GLM_INSTRUCTION_SET_SSE4A 0x00000100 // ammintrin.h (SSE4A + POPCNT + SSE3 + SSE2 + SSE)
|
//#define GLM_INSTRUCTION_SET_SSE4A 0x00000100 // ammintrin.h (SSE4A + POPCNT + SSE3 + SSE2 + SSE)
|
||||||
#define GLM_INSTRUCTION_SET_SSE4_1 0x00000200 // smmintrin.h (SSE4_1 + SSSE3 + SSE3 + SSE2 + SSE)
|
//#define GLM_INSTRUCTION_SET_SSE4_1 0x00000200 // smmintrin.h (SSE4_1 + SSSE3 + SSE3 + SSE2 + SSE)
|
||||||
#define GLM_INSTRUCTION_SET_SSE4_2 0x00000400 // nmmintrin.h (SSE4_2 + SSE4_1 + SSSE3 + SSE3 + SSE2 + SSE)
|
//#define GLM_INSTRUCTION_SET_SSE4_2 0x00000400 // nmmintrin.h (SSE4_2 + SSE4_1 + SSSE3 + SSE3 + SSE2 + SSE)
|
||||||
#define GLM_INSTRUCTION_SET_AES 0x00000800 // wmmintrin.h (AES + PCLMUL + SSE2 + SSE1)
|
//#define GLM_INSTRUCTION_SET_AES 0x00000800 // wmmintrin.h (AES + PCLMUL + SSE2 + SSE1)
|
||||||
#define GLM_INSTRUCTION_SET_PCLMUL 0x00001000 // wmmintrin.h (AES + PCLMUL + SSE2 + SSE1)
|
//#define GLM_INSTRUCTION_SET_PCLMUL 0x00001000 // wmmintrin.h (AES + PCLMUL + SSE2 + SSE1)
|
||||||
#define GLM_INSTRUCTION_SET_AVX 0x00002000 // immintrin.h (AES + PCLMUL + SSE4_2 + SSE4_1 + SSSE3 + SSE3 + SSE2 + SSE)
|
#define GLM_INSTRUCTION_SET_AVX 0x00002000 | GLM_INSTRUCTION_SET_SSE3 // immintrin.h (AES + PCLMUL + SSE4_2 + SSE4_1 + SSSE3 + SSE3 + SSE2 + SSE)
|
||||||
|
|
||||||
#if(defined(GLM_COMPILER) && (GLM_COMPILER & GLM_COMPILER_GCC))
|
/////////////////
|
||||||
# define GLM_INSTRUCTION_SET GLM_INSTRUCTION_SET_NULL
|
// Platform
|
||||||
#elif(defined(GLM_COMPILER) && (GLM_COMPILER & GLM_COMPILER_VC))
|
|
||||||
# if(GLM_MODEL == GLM_MODEL_64)
|
#define GLM_SUPPORT_PURE 0
|
||||||
# ifdef _M_CEE_PURE
|
#define GLM_SUPPORT_SSE2 1
|
||||||
# define GLM_INSTRUCTION_SET GLM_INSTRUCTION_SET_PURE
|
#define GLM_SUPPORT_SSE3 2
|
||||||
# else
|
#define GLM_SUPPORT_AVX 3
|
||||||
# define GLM_INSTRUCTION_SET GLM_INSTRUCTION_SET_MMX | GLM_INSTRUCTION_SET_SSE
|
|
||||||
# endif
|
#if(GLM_COMPILER & GLM_COMPILER_VC)
|
||||||
|
# if(GLM_COMPILER >= GLM_COMPILER_VC2010)
|
||||||
|
# define GLM_SUPPORT GLM_SUPPORT_SSE3 //GLM_SUPPORT_AVX (Require SP1)
|
||||||
|
# elif(GLM_COMPILER >= GLM_COMPILER_VC2008)
|
||||||
|
# define GLM_SUPPORT GLM_SUPPORT_SSE3
|
||||||
|
# elif(GLM_COMPILER >= GLM_COMPILER_VC2005)
|
||||||
|
# define GLM_SUPPORT GLM_SUPPORT_SSE2
|
||||||
# else
|
# else
|
||||||
# ifdef _M_CEE_PURE
|
# define GLM_SUPPORT GLM_SUPPORT_PURE
|
||||||
# define GLM_INSTRUCTION_SET GLM_INSTRUCTION_SET_PURE
|
# endif
|
||||||
# else
|
#elif(GLM_COMPILER & GLM_COMPILER_GCC)
|
||||||
# define GLM_INSTRUCTION_SET GLM_INSTRUCTION_SET_NULL
|
# if(GLM_COMPILER >= GLM_COMPILER_GCC44)
|
||||||
# endif
|
# define GLM_SUPPORT GLM_SUPPORT_AVX
|
||||||
|
# elif(GLM_COMPILER >= GLM_COMPILER_GCC40)
|
||||||
|
# define GLM_SUPPORT GLM_SUPPORT_SSE3
|
||||||
|
# else
|
||||||
|
# define GLM_SUPPORT GLM_SUPPORT_PURE
|
||||||
# endif
|
# endif
|
||||||
#else
|
#else
|
||||||
# define GLM_INSTRUCTION_SET GLM_INSTRUCTION_SET_PURE
|
# define GLM_SUPPORT GLM_SUPPORT_PURE
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#define GLM_PLATFORM_PURE 0
|
||||||
|
#define GLM_PLATFORM_SSE2 1
|
||||||
|
#define GLM_PLATFORM_SSE3 2
|
||||||
|
#define GLM_PLATFORM_AVX 3
|
||||||
|
|
||||||
|
#ifdef GLM_INSTRUCTION_SET
|
||||||
|
# if((GLM_INSTRUCTION_SET & GLM_INSTRUCTION_SET_AVX) && GLM_SUPPORT >= GLM_SUPPORT_AVX)
|
||||||
|
# include <immintrin.h>
|
||||||
|
# define GLM_PLATFORM GLM_PLATFORM_AVX
|
||||||
|
# elif((GLM_INSTRUCTION_SET & GLM_INSTRUCTION_SET_SSE3) && GLM_SUPPORT >= GLM_SUPPORT_SSE3)
|
||||||
|
# include <pmmintrin.h>
|
||||||
|
# define GLM_PLATFORM GLM_PLATFORM_SSE3
|
||||||
|
# elif((GLM_INSTRUCTION_SET & GLM_INSTRUCTION_SET_SSE2) && GLM_SUPPORT >= GLM_SUPPORT_SSE2)
|
||||||
|
# include <emmintrin.h>
|
||||||
|
# define GLM_PLATFORM GLM_PLATFORM_SSE2
|
||||||
|
# else
|
||||||
|
# define GLM_PLATFORM GLM_PLATFORM_PURE
|
||||||
|
# endif
|
||||||
|
#else
|
||||||
|
# if(GLM_MODEL == GLM_MODEL_64)
|
||||||
|
# include <emmintrin.h>
|
||||||
|
# define GLM_PLATFORM GLM_PLATFORM_SSE2
|
||||||
|
# else
|
||||||
|
# define GLM_PLATFORM GLM_PLATFORM_PURE
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
#if(GLM_INSTRUCTION_SET != GLM_INSTRUCTION_SET_NULL)
|
#if(GLM_INSTRUCTION_SET != GLM_INSTRUCTION_SET_NULL)
|
||||||
# if(GLM_INSTRUCTION_SET & GLM_INSTRUCTION_SET_MMX)
|
# if(GLM_INSTRUCTION_SET & GLM_INSTRUCTION_SET_MMX)
|
||||||
# include <mmintrin.h>
|
# include <mmintrin.h>
|
||||||
@ -278,10 +316,10 @@
|
|||||||
# include <tmmintrin.h>
|
# include <tmmintrin.h>
|
||||||
# endif
|
# endif
|
||||||
# if(GLM_INSTRUCTION_SET & GLM_INSTRUCTION_SET_POPCNT)
|
# if(GLM_INSTRUCTION_SET & GLM_INSTRUCTION_SET_POPCNT)
|
||||||
# include <popcntintrin.h>
|
//# include <popcntintrin.h>
|
||||||
# endif
|
# endif
|
||||||
# if(GLM_INSTRUCTION_SET & GLM_INSTRUCTION_SET_SSE4A)
|
# if(GLM_INSTRUCTION_SET & GLM_INSTRUCTION_SET_SSE4A)
|
||||||
# include <ammintrin.h>
|
//# include <ammintrin.h>
|
||||||
# endif
|
# endif
|
||||||
# if(GLM_INSTRUCTION_SET & GLM_INSTRUCTION_SET_SSE4_1)
|
# if(GLM_INSTRUCTION_SET & GLM_INSTRUCTION_SET_SSE4_1)
|
||||||
# include <smmintrin.h>
|
# include <smmintrin.h>
|
||||||
@ -299,7 +337,7 @@
|
|||||||
# include <immintrin.h>
|
# include <immintrin.h>
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
*/
|
||||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
// Swizzle operators
|
// Swizzle operators
|
||||||
|
|
||||||
|
@ -30,8 +30,8 @@ int main(int argc, void* argv[])
|
|||||||
glm::simd_vec4(0.5f, 3.0f, 0.6f, 0.02f),
|
glm::simd_vec4(0.5f, 3.0f, 0.6f, 0.02f),
|
||||||
glm::simd_vec4(0.2f, 0.4f, 2.0f, 0.03f),
|
glm::simd_vec4(0.2f, 0.4f, 2.0f, 0.03f),
|
||||||
glm::simd_vec4(4.0f, 3.0f, 2.0f, 1.00f));
|
glm::simd_vec4(4.0f, 3.0f, 2.0f, 1.00f));
|
||||||
__m128 DetB = _mm_slow_det_ps(&IdentityB.Data[0].Data);
|
__m128 DetB = sse_slow_det_ps(&IdentityB.Data[0].Data);
|
||||||
__m128 DetC = _mm_det_ps(&IdentityB.Data[0].Data);
|
__m128 DetC = sse_det_ps(&IdentityB.Data[0].Data);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user