Improve efficiency of operator*(fquatSIMD, fquatSIMD) in SSE4 mode.

It now requires only 3 shuffles, 4 multiplies, and 4 dpps instructions.
This commit is contained in:
Dave Reid 2013-04-24 07:58:41 +10:00
parent d23da666c5
commit 197b6c96d8

View File

@ -122,7 +122,7 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
// SSE4 STATS: // SSE4 STATS:
// 3 shuffle // 3 shuffle
// 8 mul // 4 mul
// 4 dpps // 4 dpps
__m128 mul0 = _mm_mul_ps(q1.Data, q2.Data); __m128 mul0 = _mm_mul_ps(q1.Data, q2.Data);
@ -130,35 +130,36 @@ GLM_FUNC_QUALIFIER fquatSIMD operator* (fquatSIMD const & q1, fquatSIMD const &
__m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2))); __m128 mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
__m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1))); __m128 mul3 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
# if((GLM_ARCH & GLM_ARCH_SSE4)) # if((GLM_ARCH & GLM_ARCH_SSE4))
__m128 add0 = _mm_dp_ps(mul0, _mm_set1_ps(1.0f), 0xff); __m128 add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
__m128 add1 = _mm_dp_ps(mul1, _mm_set1_ps(1.0f), 0xff); __m128 add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
__m128 add2 = _mm_dp_ps(mul2, _mm_set1_ps(1.0f), 0xff); __m128 add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
__m128 add3 = _mm_dp_ps(mul3, _mm_set1_ps(1.0f), 0xff); __m128 add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
# elif((GLM_ARCH & GLM_ARCH_SSE3))
__m128 add0 = _mm_hadd_ps(mul0, mul0);
add0 = _mm_hadd_ps(add0, add0);
__m128 add1 = _mm_hadd_ps(mul1, mul1);
add1 = _mm_hadd_ps(add1, add1);
__m128 add2 = _mm_hadd_ps(mul2, mul2);
add2 = _mm_hadd_ps(add2, add2);
__m128 add3 = _mm_hadd_ps(mul3, mul3);
add3 = _mm_hadd_ps(add3, add3);
# else # else
__m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0)); mul0 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1)); mul1 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
__m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1)); mul2 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1)); mul3 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
__m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1)); # if((GLM_ARCH & GLM_ARCH_SSE3))
__m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3)); __m128 add0 = _mm_hadd_ps(mul0, mul0);
add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1)); add0 = _mm_hadd_ps(add0, add0);
__m128 add1 = _mm_hadd_ps(mul1, mul1);
add1 = _mm_hadd_ps(add1, add1);
__m128 add2 = _mm_hadd_ps(mul2, mul2);
add2 = _mm_hadd_ps(add2, add2);
__m128 add3 = _mm_hadd_ps(mul3, mul3);
add3 = _mm_hadd_ps(add3, add3);
# else
__m128 add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul0, mul0));
add0 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
__m128 add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul1, mul1));
add1 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
__m128 add2 = _mm_add_ps(mul2, _mm_movehl_ps(mul2, mul2));
add2 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
__m128 add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul3, mul3));
add3 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
# endif
#endif #endif