From 857257745d1ca7fdb24d6683150fd7438e56a1b9 Mon Sep 17 00:00:00 2001 From: Christophe Date: Wed, 20 Dec 2023 21:55:32 +0100 Subject: [PATCH] Revert broken vec4 SIMD --- glm/detail/type_vec4_simd.inl | 872 +++++++++++++++++----------------- 1 file changed, 425 insertions(+), 447 deletions(-) diff --git a/glm/detail/type_vec4_simd.inl b/glm/detail/type_vec4_simd.inl index 0b73b8cc..816ef45b 100644 --- a/glm/detail/type_vec4_simd.inl +++ b/glm/detail/type_vec4_simd.inl @@ -1,370 +1,348 @@ #if GLM_ARCH & GLM_ARCH_SSE2_BIT -namespace glm{ -namespace detail -{ -# if GLM_CONFIG_SWIZZLE == GLM_SWIZZLE_OPERATOR - template - struct _swizzle_base1<4, float, Q, E0,E1,E2,E3, true> : public _swizzle_base0 +namespace glm { + namespace detail { - GLM_FUNC_QUALIFIER vec<4, float, Q> operator ()() const +# if GLM_CONFIG_SWIZZLE == GLM_SWIZZLE_OPERATOR + template + struct _swizzle_base1<4, float, Q, E0, E1, E2, E3, true> : public _swizzle_base0 { - __m128 data = *reinterpret_cast<__m128 const*>(&this->_buffer); + GLM_FUNC_QUALIFIER vec<4, float, Q> operator ()() const + { + __m128 data = *reinterpret_cast<__m128 const*>(&this->_buffer); - vec<4, float, Q> Result; + vec<4, float, Q> Result; # if GLM_ARCH & GLM_ARCH_AVX_BIT Result.data = _mm_permute_ps(data, _MM_SHUFFLE(E3, E2, E1, E0)); # else Result.data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(E3, E2, E1, E0)); # endif - return Result; - } - }; + return Result; + } + }; - template - struct _swizzle_base1<4, int, Q, E0,E1,E2,E3, true> : public _swizzle_base0 - { - GLM_FUNC_QUALIFIER vec<4, int, Q> operator ()() const + template + struct _swizzle_base1<4, int, Q, E0, E1, E2, E3, true> : public _swizzle_base0 { - __m128i data = *reinterpret_cast<__m128i const*>(&this->_buffer); + GLM_FUNC_QUALIFIER vec<4, int, Q> operator ()() const + { + __m128i data = *reinterpret_cast<__m128i const*>(&this->_buffer); - vec<4, int, Q> Result; - Result.data = _mm_shuffle_epi32(data, _MM_SHUFFLE(E3, E2, E1, E0)); - return Result; - } - }; + vec<4, int, Q> Result; + Result.data = _mm_shuffle_epi32(data, _MM_SHUFFLE(E3, E2, E1, E0)); + return Result; + } + }; - template - struct _swizzle_base1<4, uint, Q, E0,E1,E2,E3, true> : public _swizzle_base0 - { - GLM_FUNC_QUALIFIER vec<4, uint, Q> operator ()() const + template + struct _swizzle_base1<4, uint, Q, E0, E1, E2, E3, true> : public _swizzle_base0 { - __m128i data = *reinterpret_cast<__m128i const*>(&this->_buffer); + GLM_FUNC_QUALIFIER vec<4, uint, Q> operator ()() const + { + __m128i data = *reinterpret_cast<__m128i const*>(&this->_buffer); - vec<4, uint, Q> Result; - Result.data = _mm_shuffle_epi32(data, _MM_SHUFFLE(E3, E2, E1, E0)); - return Result; - } - }; + vec<4, uint, Q> Result; + Result.data = _mm_shuffle_epi32(data, _MM_SHUFFLE(E3, E2, E1, E0)); + return Result; + } + }; # endif// GLM_CONFIG_SWIZZLE == GLM_SWIZZLE_OPERATOR - template - struct compute_vec4_add - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + template + struct compute_vec4_add { - vec<4, float, Q> Result; - Result.data = _mm_add_ps(a.data, b.data); - return Result; - } - }; + static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + { + vec<4, float, Q> Result; + Result.data = _mm_add_ps(a.data, b.data); + return Result; + } + }; # if GLM_ARCH & GLM_ARCH_AVX_BIT - template - struct compute_vec4_add - { - static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) + template + struct compute_vec4_add { - vec<4, double, Q> Result; - Result.data = 
_mm256_add_pd(a.data, b.data); - return Result; - } - }; + static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) + { + vec<4, double, Q> Result; + Result.data = _mm256_add_pd(a.data, b.data); + return Result; + } + }; # endif - template - struct compute_vec4_sub - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + template + struct compute_vec4_sub { - vec<4, float, Q> Result; - Result.data = _mm_sub_ps(a.data, b.data); - return Result; - } - }; + static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + { + vec<4, float, Q> Result; + Result.data = _mm_sub_ps(a.data, b.data); + return Result; + } + }; # if GLM_ARCH & GLM_ARCH_AVX_BIT - template - struct compute_vec4_sub - { - static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) + template + struct compute_vec4_sub { - vec<4, double, Q> Result; - Result.data = _mm256_sub_pd(a.data, b.data); - return Result; - } - }; + static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) + { + vec<4, double, Q> Result; + Result.data = _mm256_sub_pd(a.data, b.data); + return Result; + } + }; # endif - template - struct compute_vec4_mul - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + template + struct compute_vec4_mul { - vec<4, float, Q> Result; - Result.data = _mm_mul_ps(a.data, b.data); - return Result; - } - }; + static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + { + vec<4, float, Q> Result; + Result.data = _mm_mul_ps(a.data, b.data); + return Result; + } + }; # if GLM_ARCH & GLM_ARCH_AVX_BIT - template - struct compute_vec4_mul - { - static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) + template + struct compute_vec4_mul { - vec<4, double, Q> Result; - Result.data = _mm256_mul_pd(a.data, b.data); - return Result; - } - }; + static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) + { + vec<4, double, Q> Result; + Result.data = _mm256_mul_pd(a.data, b.data); + return Result; + } + }; # endif - template - struct compute_vec4_div - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + template + struct compute_vec4_div { - vec<4, float, Q> Result; - Result.data = _mm_div_ps(a.data, b.data); - return Result; - } - }; + static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + { + vec<4, float, Q> Result; + Result.data = _mm_div_ps(a.data, b.data); + return Result; + } + }; # if GLM_ARCH & GLM_ARCH_AVX_BIT - template - struct compute_vec4_div - { - static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) + template + struct compute_vec4_div { - vec<4, double, Q> Result; - Result.data = _mm256_div_pd(a.data, b.data); - return Result; - } - }; + static vec<4, double, Q> call(vec<4, double, Q> const& a, vec<4, double, Q> const& b) + { + vec<4, double, Q> Result; + Result.data = _mm256_div_pd(a.data, b.data); + return Result; + } + }; # endif - template<> - struct compute_vec4_div - { - static vec<4, float, aligned_lowp> call(vec<4, float, aligned_lowp> const& a, vec<4, float, aligned_lowp> const& b) + template<> + struct compute_vec4_div { - vec<4, float, aligned_lowp> Result; - Result.data = _mm_mul_ps(a.data, _mm_rcp_ps(b.data)); - return Result; - } - }; + static vec<4, float, aligned_lowp> call(vec<4, float, aligned_lowp> const& a, vec<4, float, aligned_lowp> const& b) + { 
+ vec<4, float, aligned_lowp> Result; + Result.data = _mm_mul_ps(a.data, _mm_rcp_ps(b.data)); + return Result; + } + }; - template - struct compute_vec4_and - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + template + struct compute_vec4_and { - vec<4, T, Q> Result; - Result.data = _mm_and_si128(a.data, b.data); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + { + vec<4, T, Q> Result; + Result.data = _mm_and_si128(a.data, b.data); + return Result; + } + }; # if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_and - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + template + struct compute_vec4_and { - vec<4, T, Q> Result; - Result.data = _mm256_and_si256(a.data, b.data); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + { + vec<4, T, Q> Result; + Result.data = _mm256_and_si256(a.data, b.data); + return Result; + } + }; # endif - template - struct compute_vec4_or - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + template + struct compute_vec4_or { - vec<4, T, Q> Result; - Result.data = _mm_or_si128(a.data, b.data); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + { + vec<4, T, Q> Result; + Result.data = _mm_or_si128(a.data, b.data); + return Result; + } + }; # if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_or - { - GLM_FUNC_QUALIFIER static vec<4, int64, Q> call(vec<4, int64, Q> const& a, vec<4, int64, Q> const& b) + template + struct compute_vec4_or { - vec<4, int64, Q> Result; - Result.data = _mm256_or_si256(a.data, b.data); - return Result; - } - }; - - template - struct compute_vec4_or - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm256_or_si256(a.data, b.data); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + { + vec<4, T, Q> Result; + Result.data = _mm256_or_si256(a.data, b.data); + return Result; + } + }; # endif - template - struct compute_vec4_xor - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + template + struct compute_vec4_xor { - vec<4, T, Q> Result; - Result.data = _mm_xor_si128(a.data, b.data); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + { + vec<4, T, Q> Result; + Result.data = _mm_xor_si128(a.data, b.data); + return Result; + } + }; # if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_xor - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + template + struct compute_vec4_xor { - vec<4, T, Q> Result; - Result.data = _mm256_xor_si256(a.data, b.data); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + { + vec<4, T, Q> Result; + Result.data = _mm256_xor_si256(a.data, b.data); + return Result; + } + }; # endif - template - struct compute_vec4_shift_left - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + template + struct compute_vec4_shift_left { - vec<4, T, Q> Result; - Result.data = _mm_sll_epi32(a.data, b.data); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + { + vec<4, T, Q> Result; + Result.data = _mm_sll_epi32(a.data, b.data); + return Result; + } + }; # if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_shift_left 
- { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + template + struct compute_vec4_shift_left { - vec<4, T, Q> Result; - Result.data = _mm256_sll_epi64(a.data, b.data); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + { + vec<4, T, Q> Result; + Result.data = _mm256_sll_epi64(a.data, b.data); + return Result; + } + }; # endif - template - struct compute_vec4_shift_right - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + template + struct compute_vec4_shift_right { - vec<4, T, Q> Result; - Result.data = _mm_srl_epi32(a.data, b.data); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + { + vec<4, T, Q> Result; + Result.data = _mm_srl_epi32(a.data, b.data); + return Result; + } + }; # if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_shift_right - { - GLM_FUNC_QUALIFIER static vec<4, int64, Q> call(vec<4, int64, Q> const& a, vec<4, int64, Q> const& b) + template + struct compute_vec4_shift_right { - vec<4, int64, Q> Result; - Result.data = _mm256_srl_epi64(a.data, b.data); - return Result; - } - }; - - template - struct compute_vec4_shift_right - { - static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) - { - vec<4, T, Q> Result; - Result.data = _mm256_srl_epi64(a.data, b.data); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& a, vec<4, T, Q> const& b) + { + vec<4, T, Q> Result; + Result.data = _mm256_srl_epi64(a.data, b.data); + return Result; + } + }; # endif - template - struct compute_vec4_bitwise_not - { - static vec<4, T, Q> call(vec<4, T, Q> const& v) + template + struct compute_vec4_bitwise_not { - vec<4, T, Q> Result; - Result.data = _mm_xor_si128(v.data, _mm_set1_epi32(-1)); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& v) + { + vec<4, T, Q> Result; + Result.data = _mm_xor_si128(v.data, _mm_set1_epi32(-1)); + return Result; + } + }; # if GLM_ARCH & GLM_ARCH_AVX2_BIT - template - struct compute_vec4_bitwise_not - { - static vec<4, T, Q> call(vec<4, T, Q> const& v) + template + struct compute_vec4_bitwise_not { - vec<4, T, Q> Result; - Result.data = _mm256_xor_si256(v.data, _mm_set1_epi32(-1)); - return Result; - } - }; + static vec<4, T, Q> call(vec<4, T, Q> const& v) + { + vec<4, T, Q> Result; + Result.data = _mm256_xor_si256(v.data, _mm_set1_epi32(-1)); + return Result; + } + }; # endif - template - struct compute_vec4_equal - { - static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2) + template + struct compute_vec4_equal { - return _mm_movemask_ps(_mm_cmpneq_ps(v1.data, v2.data)) == 0; - } - }; + static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2) + { + return _mm_movemask_ps(_mm_cmpneq_ps(v1.data, v2.data)) == 0; + } + }; # if GLM_ARCH & GLM_ARCH_SSE41_BIT - template - struct compute_vec4_equal - { - static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2) + template + struct compute_vec4_equal { - //return _mm_movemask_epi8(_mm_cmpeq_epi32(v1.data, v2.data)) != 0; - __m128i neq = _mm_xor_si128(v1.data, v2.data); - return _mm_test_all_zeros(neq, neq) == 0; - } - }; + static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2) + { + //return _mm_movemask_epi8(_mm_cmpeq_epi32(v1.data, v2.data)) != 0; + __m128i neq = _mm_xor_si128(v1.data, v2.data); + return _mm_test_all_zeros(neq, neq) == 0; + } + }; # endif - template - struct compute_vec4_nequal - { - static bool call(vec<4, float, Q> 
const& v1, vec<4, float, Q> const& v2) + template + struct compute_vec4_nequal { - return _mm_movemask_ps(_mm_cmpneq_ps(v1.data, v2.data)) != 0; - } - }; + static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2) + { + return _mm_movemask_ps(_mm_cmpneq_ps(v1.data, v2.data)) != 0; + } + }; # if GLM_ARCH & GLM_ARCH_SSE41_BIT - template - struct compute_vec4_nequal - { - static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2) + template + struct compute_vec4_nequal { - //return _mm_movemask_epi8(_mm_cmpneq_epi32(v1.data, v2.data)) != 0; - __m128i neq = _mm_xor_si128(v1.data, v2.data); - return _mm_test_all_zeros(neq, neq) != 0; - } - }; + static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2) + { + //return _mm_movemask_epi8(_mm_cmpneq_epi32(v1.data, v2.data)) != 0; + __m128i neq = _mm_xor_si128(v1.data, v2.data); + return _mm_test_all_zeros(neq, neq) != 0; + } + }; # endif -}//namespace detail + }//namespace detail template<> GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(float _s) : @@ -486,222 +464,222 @@ namespace detail #if GLM_ARCH & GLM_ARCH_NEON_BIT namespace glm { -namespace detail { + namespace detail { - template - struct compute_vec4_add - { - static - vec<4, float, Q> - call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + template + struct compute_vec4_add { - vec<4, float, Q> Result; - Result.data = vaddq_f32(a.data, b.data); - return Result; - } - }; + static + vec<4, float, Q> + call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + { + vec<4, float, Q> Result; + Result.data = vaddq_f32(a.data, b.data); + return Result; + } + }; - template - struct compute_vec4_add - { - static - vec<4, uint, Q> - call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b) + template + struct compute_vec4_add { - vec<4, uint, Q> Result; - Result.data = vaddq_u32(a.data, b.data); - return Result; - } - }; + static + vec<4, uint, Q> + call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b) + { + vec<4, uint, Q> Result; + Result.data = vaddq_u32(a.data, b.data); + return Result; + } + }; - template - struct compute_vec4_add - { - static - vec<4, int, Q> - call(vec<4, int, Q> const& a, vec<4, int, Q> const& b) + template + struct compute_vec4_add { - vec<4, int, Q> Result; - Result.data = vaddq_s32(a.data, b.data); - return Result; - } - }; + static + vec<4, int, Q> + call(vec<4, int, Q> const& a, vec<4, int, Q> const& b) + { + vec<4, int, Q> Result; + Result.data = vaddq_s32(a.data, b.data); + return Result; + } + }; - template - struct compute_vec4_sub - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + template + struct compute_vec4_sub { - vec<4, float, Q> Result; - Result.data = vsubq_f32(a.data, b.data); - return Result; - } - }; + static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + { + vec<4, float, Q> Result; + Result.data = vsubq_f32(a.data, b.data); + return Result; + } + }; - template - struct compute_vec4_sub - { - static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b) + template + struct compute_vec4_sub { - vec<4, uint, Q> Result; - Result.data = vsubq_u32(a.data, b.data); - return Result; - } - }; + static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b) + { + vec<4, uint, Q> Result; + Result.data = vsubq_u32(a.data, b.data); + return Result; + } + }; - template - struct compute_vec4_sub - { - static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b) + template + struct 
compute_vec4_sub { - vec<4, int, Q> Result; - Result.data = vsubq_s32(a.data, b.data); - return Result; - } - }; + static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b) + { + vec<4, int, Q> Result; + Result.data = vsubq_s32(a.data, b.data); + return Result; + } + }; - template - struct compute_vec4_mul - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + template + struct compute_vec4_mul { - vec<4, float, Q> Result; - Result.data = vmulq_f32(a.data, b.data); - return Result; - } - }; + static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + { + vec<4, float, Q> Result; + Result.data = vmulq_f32(a.data, b.data); + return Result; + } + }; - template - struct compute_vec4_mul - { - static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b) + template + struct compute_vec4_mul { - vec<4, uint, Q> Result; - Result.data = vmulq_u32(a.data, b.data); - return Result; - } - }; + static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b) + { + vec<4, uint, Q> Result; + Result.data = vmulq_u32(a.data, b.data); + return Result; + } + }; - template - struct compute_vec4_mul - { - static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b) + template + struct compute_vec4_mul { - vec<4, int, Q> Result; - Result.data = vmulq_s32(a.data, b.data); - return Result; - } - }; + static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b) + { + vec<4, int, Q> Result; + Result.data = vmulq_s32(a.data, b.data); + return Result; + } + }; - template - struct compute_vec4_div - { - static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + template + struct compute_vec4_div { - vec<4, float, Q> Result; + static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b) + { + vec<4, float, Q> Result; #if GLM_ARCH & GLM_ARCH_ARMV8_BIT - Result.data = vdivq_f32(a.data, b.data); + Result.data = vdivq_f32(a.data, b.data); #else - /* Arm assembler reference: - * - * The Newton-Raphson iteration: x[n+1] = x[n] * (2 - d * x[n]) - * converges to (1/d) if x0 is the result of VRECPE applied to d. - * - * Note: The precision usually improves with two interactions, but more than two iterations are not helpful. */ - float32x4_t x = vrecpeq_f32(b.data); - x = vmulq_f32(vrecpsq_f32(b.data, x), x); - x = vmulq_f32(vrecpsq_f32(b.data, x), x); - Result.data = vmulq_f32(a.data, x); + /* Arm assembler reference: + * + * The Newton-Raphson iteration: x[n+1] = x[n] * (2 - d * x[n]) + * converges to (1/d) if x0 is the result of VRECPE applied to d. + * + * Note: The precision usually improves with two interactions, but more than two iterations are not helpful. 
*/ + float32x4_t x = vrecpeq_f32(b.data); + x = vmulq_f32(vrecpsq_f32(b.data, x), x); + x = vmulq_f32(vrecpsq_f32(b.data, x), x); + Result.data = vmulq_f32(a.data, x); #endif - return Result; - } - }; + return Result; + } + }; - template - struct compute_vec4_equal - { - static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2) + template + struct compute_vec4_equal { - uint32x4_t cmp = vceqq_f32(v1.data, v2.data); + static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2) + { + uint32x4_t cmp = vceqq_f32(v1.data, v2.data); #if GLM_ARCH & GLM_ARCH_ARMV8_BIT - cmp = vpminq_u32(cmp, cmp); - cmp = vpminq_u32(cmp, cmp); - uint32_t r = cmp[0]; + cmp = vpminq_u32(cmp, cmp); + cmp = vpminq_u32(cmp, cmp); + uint32_t r = cmp[0]; #else - uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); - cmpx2 = vpmin_u32(cmpx2, cmpx2); - uint32_t r = cmpx2[0]; + uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); + cmpx2 = vpmin_u32(cmpx2, cmpx2); + uint32_t r = cmpx2[0]; #endif - return r == ~0u; - } - }; + return r == ~0u; + } + }; - template - struct compute_vec4_equal - { - static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2) + template + struct compute_vec4_equal { - uint32x4_t cmp = vceqq_u32(v1.data, v2.data); + static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2) + { + uint32x4_t cmp = vceqq_u32(v1.data, v2.data); #if GLM_ARCH & GLM_ARCH_ARMV8_BIT - cmp = vpminq_u32(cmp, cmp); - cmp = vpminq_u32(cmp, cmp); - uint32_t r = cmp[0]; + cmp = vpminq_u32(cmp, cmp); + cmp = vpminq_u32(cmp, cmp); + uint32_t r = cmp[0]; #else - uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); - cmpx2 = vpmin_u32(cmpx2, cmpx2); - uint32_t r = cmpx2[0]; + uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); + cmpx2 = vpmin_u32(cmpx2, cmpx2); + uint32_t r = cmpx2[0]; #endif - return r == ~0u; - } - }; + return r == ~0u; + } + }; - template - struct compute_vec4_equal - { - static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2) + template + struct compute_vec4_equal { - uint32x4_t cmp = vceqq_s32(v1.data, v2.data); + static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2) + { + uint32x4_t cmp = vceqq_s32(v1.data, v2.data); #if GLM_ARCH & GLM_ARCH_ARMV8_BIT - cmp = vpminq_u32(cmp, cmp); - cmp = vpminq_u32(cmp, cmp); - uint32_t r = cmp[0]; + cmp = vpminq_u32(cmp, cmp); + cmp = vpminq_u32(cmp, cmp); + uint32_t r = cmp[0]; #else - uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); - cmpx2 = vpmin_u32(cmpx2, cmpx2); - uint32_t r = cmpx2[0]; + uint32x2_t cmpx2 = vpmin_u32(vget_low_u32(cmp), vget_high_u32(cmp)); + cmpx2 = vpmin_u32(cmpx2, cmpx2); + uint32_t r = cmpx2[0]; #endif - return r == ~0u; - } - }; + return r == ~0u; + } + }; - template - struct compute_vec4_nequal - { - static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2) + template + struct compute_vec4_nequal { - return !compute_vec4_equal::call(v1, v2); - } - }; + static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2) + { + return !compute_vec4_equal::call(v1, v2); + } + }; - template - struct compute_vec4_nequal - { - static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2) + template + struct compute_vec4_nequal { - return !compute_vec4_equal::call(v1, v2); - } - }; + static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2) + { + return !compute_vec4_equal::call(v1, v2); + } + }; - template - struct compute_vec4_nequal - { - static 
bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
+ template
+ struct compute_vec4_nequal
{
- return !compute_vec4_equal::call(v1, v2);
- }
- };
+ static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
+ {
+ return !compute_vec4_equal::call(v1, v2);
+ }
+ };

-}//namespace detail
+ }//namespace detail

#if !GLM_CONFIG_XYZW_ONLY
template<>