From 197b6c96d8a068fcd1002b0731def6bcbb2af5fc Mon Sep 17 00:00:00 2001
From: Dave Reid
Date: Wed, 24 Apr 2013 07:58:41 +1000
Subject: [PATCH] Improve efficiency of operator*(fquatSIMD, fquatSIMD) in
 SSE4 mode.

Now only requires 3 shuffle, 4 mul and 4 dpps.
---
 glm/gtx/simd_quat.inl | 57 ++++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/glm/gtx/simd_quat.inl b/glm/gtx/simd_quat.inl
index f9df7a94..25848a57 100644
--- a/glm/gtx/simd_quat.inl
+++ b/glm/gtx/simd_quat.inl
@@ -122,7 +122,7 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
 
     // SSE4 STATS:
     //    3 shuffle
-    //    8 mul
+    //    4 mul
     //    4 dpps
 
     __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data);
@@ -130,35 +130,36 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
     __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
     __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
 
-    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
-    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
-    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
-    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
-
 #   if((GLM_ARCH & GLM_ARCH_SSE4))
-    __m128 add0 = _mm_dp_ps(mul0, _mm_set1_ps(1.0f), 0xff);
-    __m128 add1 = _mm_dp_ps(mul1, _mm_set1_ps(1.0f), 0xff);
-    __m128 add2 = _mm_dp_ps(mul2, _mm_set1_ps(1.0f), 0xff);
-    __m128 add3 = _mm_dp_ps(mul3, _mm_set1_ps(1.0f), 0xff);
-#   elif((GLM_ARCH & GLM_ARCH_SSE3))
-    __m128 add0 = _mm_hadd_ps(mul0, mul0);
-           add0 = _mm_hadd_ps(add0, add0);
-    __m128 add1 = _mm_hadd_ps(mul1, mul1);
-           add1 = _mm_hadd_ps(add1, add1);
-    __m128 add2 = _mm_hadd_ps(mul2, mul2);
-           add2 = _mm_hadd_ps(add2, add2);
-    __m128 add3 = _mm_hadd_ps(mul3, mul3);
-           add3 = _mm_hadd_ps(add3, add3);
+    __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
+    __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
+    __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
+    __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
 #   else
-    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
-           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
-    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
-           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
-    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
-           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
-    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
-           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+    mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
+    mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
+    mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
+    mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
+
+#   if((GLM_ARCH & GLM_ARCH_SSE3))
+    __m128 add0 = _mm_hadd_ps(mul0, mul0);
+           add0 = _mm_hadd_ps(add0, add0);
+    __m128 add1 = _mm_hadd_ps(mul1, mul1);
+           add1 = _mm_hadd_ps(add1, add1);
+    __m128 add2 = _mm_hadd_ps(mul2, mul2);
+           add2 = _mm_hadd_ps(add2, add2);
+    __m128 add3 = _mm_hadd_ps(mul3, mul3);
+           add3 = _mm_hadd_ps(add3, add3);
+#   else
+    __m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
+           add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
+    __m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
+           add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
+    __m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
+           add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
+    __m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
+           add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
+#   endif
 #   endif
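
Note (editor): the SSE4 saving rests on the identity dot(x * s, (1,1,1,1)) == dot(x, s),
so the per-lane sign flip can be folded into the second operand of _mm_dp_ps and each of
the four dot products drops its separate _mm_mul_ps. Below is a minimal standalone sketch
of that equivalence, not GLM code; the names x, signs, old_way and new_way are
illustrative, and it assumes a compiler with SSE4.1 enabled (e.g. -msse4.1).

    #include <smmintrin.h> // SSE4.1 intrinsics: _mm_dp_ps
    #include <cstdio>

    int main()
    {
        // Same sign pattern the patch uses for add0 (w term positive).
        __m128 x     = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);   // lanes: x=1, y=2, z=3, w=4
        __m128 signs = _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f);

        // Old SSE4 path: flip signs first, then dot against all ones (mul + dpps).
        __m128 old_way = _mm_dp_ps(_mm_mul_ps(x, signs), _mm_set1_ps(1.0f), 0xff);

        // New SSE4 path: fold the sign vector into the dot product itself (dpps only).
        __m128 new_way = _mm_dp_ps(x, signs, 0xff);

        // Both print -2.0: -1 - 2 - 3 + 4.
        std::printf("old = %f, new = %f\n", _mm_cvtss_f32(old_way), _mm_cvtss_f32(new_way));
        return 0;
    }

The SSE3 and plain-SSE fallback paths have no dpps instruction to absorb the sign vector,
which is why the patch moves the four sign-flip _mm_mul_ps calls into the #else branch
rather than deleting them.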