Add Neon to glm

A few simple functions that use Neon as compiler does not use the full potential of Neon For now, -DGLM_FORCE_NEON is required until it's the default
2024-11-23 01:14:34 +00:00 · 2019-08-27 11:42:48 -07:00 · 2019-08-27 11:42:48 -07:00 · cd3cc166b4
commit cd3cc166b4
parent ca52121e1b
3 changed files with 378 additions and 5 deletions
--- a/glm/detail/qualifier.hpp
+++ b/glm/detail/qualifier.hpp
@ -167,6 +167,26 @@ namespace detail
 	};
 #	endif
 #	if GLM_ARCH & GLM_ARCH_NEON_BIT
 	template<>
 	struct storage<4, float, true>
 	{
 		typedef glm_f32vec4 type;
 	};
 	template<>
 	struct storage<4, int, true>
 	{
 		typedef glm_i32vec4 type;
 	};
 	template<>
 	struct storage<4, unsigned int, true>
 	{
 		typedef glm_u32vec4 type;
 	};
 #	endif
 	enum genTypeEnum
 	{
 		GENTYPE_VEC,
--- a/glm/detail/type_vec4_simd.inl
+++ b/glm/detail/type_vec4_simd.inl
@ -461,3 +461,337 @@ namespace detail
 }//namespace glm
 #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
 #if GLM_ARCH & GLM_ARCH_NEON_BIT
 namespace glm {
 namespace detail {
 	template<qualifier Q>
 	struct compute_vec4_add<float, Q, true>
 	{
 		static
 		vec<4, float, Q>
 		call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
 		{
 			vec<4, float, Q> Result;
 			Result.data = vaddq_f32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_add<uint, Q, true>
 	{
 		static
 		vec<4, uint, Q>
 		call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
 		{
 			vec<4, uint, Q> Result;
 			Result.data = vaddq_u32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_add<int, Q, true>
 	{
 		static
 		vec<4, int, Q>
 		call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
 		{
 			vec<4, uint, Q> Result;
 			Result.data = vaddq_s32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_sub<float, Q, true>
 	{
 		static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
 		{
 			vec<4, float, Q> Result;
 			Result.data = vsubq_f32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_sub<uint, Q, true>
 	{
 		static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
 		{
 			vec<4, uint, Q> Result;
 			Result.data = vsubq_u32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_sub<int, Q, true>
 	{
 		static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
 		{
 			vec<4, int, Q> Result;
 			Result.data = vsubq_s32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_mul<float, Q, true>
 	{
 		static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
 		{
 			vec<4, float, Q> Result;
 			Result.data = vmulq_f32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_mul<uint, Q, true>
 	{
 		static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
 		{
 			vec<4, uint, Q> Result;
 			Result.data = vmulq_u32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_mul<int, Q, true>
 	{
 		static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
 		{
 			vec<4, int, Q> Result;
 			Result.data = vmulq_s32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_div<float, Q, true>
 	{
 		static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
 		{
 			vec<4, float, Q> Result;
 			Result.data = vdivq_f32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_div<uint, Q, true>
 	{
 		static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
 		{
 			vec<4, uint, Q> Result;
 			Result.data = vdivq_u32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_div<int, Q, true>
 	{
 		static vec<4, int, Q> call(vec<4, float, Q> const& a, vec<4, int, Q> const& b)
 		{
 			vec<4, int, Q> Result;
 			Result.data = vdivq_s32(a.data, b.data);
 			return Result;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_equal<float, Q, false, 32, true>
 	{
 		static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2)
 		{
 			uint32x4_t cmp = vceqq_f32(v1.data, v2.data);
 #if GLM_ARCH & GLM_ARCH_ARMV8_BIT
 			cmp = vpminq_u32(cmp, cmp);
 			cmp = vpminq_u32(cmp, cmp);
 			uint32_t r = cmp[0];
 #else
 			uint32x2_t cmpx2 = vpmin_u32(vget_low_f32(cmp), vget_high_f32(cmp));
 			cmpx2 = vpmin_u32(cmpx2, cmpx2);
 			uint32_t r = cmpx2[0];
 #endif
 			return r == ~0u;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_equal<uint, Q, false, 32, true>
 	{
 		static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2)
 		{
 			uint32x4_t cmp = vceqq_u32(v1.data, v2.data);
 #if GLM_ARCH & GLM_ARCH_ARMV8_BIT
 			cmp = vpminq_u32(cmp, cmp);
 			cmp = vpminq_u32(cmp, cmp);
 			uint32_t r = cmp[0];
 #else
 			uint32x2_t cmpx2 = vpmin_u32(vget_low_f32(cmp), vget_high_f32(cmp));
 			cmpx2 = vpmin_u32(cmpx2, cmpx2);
 			uint32_t r = cmpx2[0];
 #endif
 			return r == ~0u;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_equal<int, Q, false, 32, true>
 	{
 		static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
 		{
 			uint32x4_t cmp = vceqq_s32(v1.data, v2.data);
 #if GLM_ARCH & GLM_ARCH_ARMV8_BIT
 			cmp = vpminq_u32(cmp, cmp);
 			cmp = vpminq_u32(cmp, cmp);
 			uint32_t r = cmp[0];
 #else
 			uint32x2_t cmpx2 = vpmin_u32(vget_low_f32(cmp), vget_high_f32(cmp));
 			cmpx2 = vpmin_u32(cmpx2, cmpx2);
 			uint32_t r = cmpx2[0];
 #endif
 			return r == ~0u;
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_nequal<float, Q, false, 32, true>
 	{
 		static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2)
 		{
 			return !compute_vec4_equal<float, Q, false, 32, true>::call(v1, v2);
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_nequal<uint, Q, false, 32, true>
 	{
 		static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2)
 		{
 			return !compute_vec4_equal<uint, Q, false, 32, true>::call(v1, v2);
 		}
 	};
 	template<qualifier Q>
 	struct compute_vec4_nequal<int, Q, false, 32, true>
 	{
 		static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
 		{
 			return !compute_vec4_equal<int, Q, false, 32, true>::call(v1, v2);
 		}
 	};
 }//namespace detail
 #if !GLM_CONFIG_XYZW_ONLY
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(float _s) :
 		data(vdupq_n_f32(_s))
 	{}
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(float _s) :
 		data(vdupq_n_f32(_s))
 	{}
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(float _s) :
 		data(vdupq_n_f32(_s))
 	{}
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_lowp>::vec(int _s) :
 		data(vdupq_n_s32(_s))
 	{}
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_mediump>::vec(int _s) :
 		data(vdupq_n_s32(_s))
 	{}
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_highp>::vec(int _s) :
 		data(vdupq_n_s32(_s))
 	{}
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_lowp>::vec(uint _s) :
 		data(vdupq_n_u32(_s))
 	{}
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_mediump>::vec(uint _s) :
 		data(vdupq_n_u32(_s))
 	{}
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_highp>::vec(uint _s) :
 		data(vdupq_n_u32(_s))
 	{}
 	template<>
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, float, aligned_highp>& rhs) :
 		data(rhs.data)
 	{}
 	template<>
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, int, aligned_highp>& rhs) :
 		data(vcvtq_f32_s32(rhs.data))
 	{}
 	template<>
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, uint, aligned_highp>& rhs) :
 		data(vcvtq_f32_u32(rhs.data))
 	{}
 	template<>
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(int _x, int _y, int _z, int _w) :
 		data(vcvtq_f32_s32(vec<4, int, aligned_lowp>(_x, _y, _z, _w).data))
 	{}
 	template<>
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(int _x, int _y, int _z, int _w) :
 		data(vcvtq_f32_s32(vec<4, int, aligned_mediump>(_x, _y, _z, _w).data))
 	{}
 	template<>
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(int _x, int _y, int _z, int _w) :
 		data(vcvtq_f32_s32(vec<4, int, aligned_highp>(_x, _y, _z, _w).data))
 	{}
 	template<>
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(uint _x, uint _y, uint _z, uint _w) :
 		data(vcvtq_f32_u32(vec<4, uint, aligned_lowp>(_x, _y, _z, _w).data))
 	{}
 	template<>
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(uint _x, uint _y, uint _z, uint _w) :
 		data(vcvtq_f32_u32(vec<4, uint, aligned_mediump>(_x, _y, _z, _w).data))
 	{}
 	template<>
 	template<>
 	GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(uint _x, uint _y, uint _z, uint _w) :
 		data(vcvtq_f32_u32(vec<4, uint, aligned_highp>(_x, _y, _z, _w).data))
 	{}
 #endif
 }//namespace glm
 #endif
--- a/glm/simd/platform.h
+++ b/glm/simd/platform.h
@ -235,10 +235,11 @@
 // User defines: GLM_FORCE_PURE GLM_FORCE_INTRINSICS GLM_FORCE_SSE2 GLM_FORCE_SSE3 GLM_FORCE_AVX GLM_FORCE_AVX2 GLM_FORCE_AVX2
-#define GLM_ARCH_MIPS_BIT	(0x10000000)
+#define GLM_ARCH_MIPS_BIT	  (0x10000000)
-#define GLM_ARCH_PPC_BIT	(0x20000000)
+#define GLM_ARCH_PPC_BIT	  (0x20000000)
-#define GLM_ARCH_ARM_BIT	(0x40000000)
+#define GLM_ARCH_ARM_BIT	  (0x40000000)
-#define GLM_ARCH_X86_BIT	(0x80000000)
+#define GLM_ARCH_ARMV8_BIT  (0x01000000)
 #define GLM_ARCH_X86_BIT	  (0x80000000)
 #define GLM_ARCH_SIMD_BIT	(0x00001000)
@ -263,6 +264,7 @@
 #define GLM_ARCH_AVX		(GLM_ARCH_AVX_BIT | GLM_ARCH_SSE42)
 #define GLM_ARCH_AVX2		(GLM_ARCH_AVX2_BIT | GLM_ARCH_AVX)
 #define GLM_ARCH_ARM		(GLM_ARCH_ARM_BIT)
 #define GLM_ARCH_ARMV8		(GLM_ARCH_NEON_BIT | GLM_ARCH_SIMD_BIT | GLM_ARCH_ARM | GLM_ARCH_ARMV8_BIT)
 #define GLM_ARCH_NEON		(GLM_ARCH_NEON_BIT | GLM_ARCH_SIMD_BIT | GLM_ARCH_ARM)
 #define GLM_ARCH_MIPS		(GLM_ARCH_MIPS_BIT)
 #define GLM_ARCH_PPC		(GLM_ARCH_PPC_BIT)
@ -270,7 +272,11 @@
 #if defined(GLM_FORCE_ARCH_UNKNOWN) || defined(GLM_FORCE_PURE)
 #	define GLM_ARCH GLM_ARCH_UNKNOWN
 #elif defined(GLM_FORCE_NEON)
-#	define GLM_ARCH (GLM_ARCH_NEON)
+#	if __ARM_ARCH >= 8
 #		define GLM_ARCH (GLM_ARCH_ARMV8)
 #	else
 #		define GLM_ARCH (GLM_ARCH_NEON)
 #	endif
 #	define GLM_FORCE_INTRINSICS
 #elif defined(GLM_FORCE_AVX2)
 #	define GLM_ARCH (GLM_ARCH_AVX2)
@ -313,9 +319,14 @@
 #		define GLM_ARCH (GLM_ARCH_SSE2)
 #	elif defined(__i386__)
 #		define GLM_ARCH (GLM_ARCH_X86)
 #	elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8)
 #		define GLM_ARCH (GLM_ARCH_ARMV8)
 #warning "ARM v8"
 #	elif defined(__ARM_NEON)
 #warning "ARM NEON"
 #		define GLM_ARCH (GLM_ARCH_ARM | GLM_ARCH_NEON)
 #	elif defined(__arm__ ) || defined(_M_ARM)
 #warning "ARM v6"
 #		define GLM_ARCH (GLM_ARCH_ARM)
 #	elif defined(__mips__ )
 #		define GLM_ARCH (GLM_ARCH_MIPS)
@ -355,6 +366,8 @@
 #	include <pmmintrin.h>
 #elif GLM_ARCH & GLM_ARCH_SSE2_BIT
 #	include <emmintrin.h>
 #elif GLM_ARCH & GLM_ARCH_NEON_BIT
 #	include <arm_neon.h>
 #endif//GLM_ARCH
 #if GLM_ARCH & GLM_ARCH_SSE2_BIT
@ -380,3 +393,9 @@
 	typedef __m256i			glm_i64vec4;
 	typedef __m256i			glm_u64vec4;
 #endif
 #if GLM_ARCH & GLM_ARCH_NEON_BIT
 	typedef float32x4_t			glm_f32vec4;
 	typedef int32x4_t			glm_i32vec4;
 	typedef uint32x4_t			glm_u32vec4;
 #endif