mirror of
https://github.com/g-truc/glm.git
synced 2024-11-10 04:31:47 +00:00
Add Neon to glm
A few simple functions that use Neon as compiler does not use the full potential of Neon For now, -DGLM_FORCE_NEON is required until it's the default
This commit is contained in:
parent
ca52121e1b
commit
cd3cc166b4
@ -167,6 +167,26 @@ namespace detail
|
||||
};
|
||||
# endif
|
||||
|
||||
# if GLM_ARCH & GLM_ARCH_NEON_BIT
|
||||
template<>
|
||||
struct storage<4, float, true>
|
||||
{
|
||||
typedef glm_f32vec4 type;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct storage<4, int, true>
|
||||
{
|
||||
typedef glm_i32vec4 type;
|
||||
};
|
||||
|
||||
template<>
|
||||
struct storage<4, unsigned int, true>
|
||||
{
|
||||
typedef glm_u32vec4 type;
|
||||
};
|
||||
# endif
|
||||
|
||||
enum genTypeEnum
|
||||
{
|
||||
GENTYPE_VEC,
|
||||
|
@ -461,3 +461,337 @@ namespace detail
|
||||
}//namespace glm
|
||||
|
||||
#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
|
||||
|
||||
#if GLM_ARCH & GLM_ARCH_NEON_BIT
|
||||
namespace glm {
|
||||
namespace detail {
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_add<float, Q, true>
|
||||
{
|
||||
static
|
||||
vec<4, float, Q>
|
||||
call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
|
||||
{
|
||||
vec<4, float, Q> Result;
|
||||
Result.data = vaddq_f32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_add<uint, Q, true>
|
||||
{
|
||||
static
|
||||
vec<4, uint, Q>
|
||||
call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
|
||||
{
|
||||
vec<4, uint, Q> Result;
|
||||
Result.data = vaddq_u32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_add<int, Q, true>
|
||||
{
|
||||
static
|
||||
vec<4, int, Q>
|
||||
call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
|
||||
{
|
||||
vec<4, uint, Q> Result;
|
||||
Result.data = vaddq_s32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_sub<float, Q, true>
|
||||
{
|
||||
static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
|
||||
{
|
||||
vec<4, float, Q> Result;
|
||||
Result.data = vsubq_f32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_sub<uint, Q, true>
|
||||
{
|
||||
static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
|
||||
{
|
||||
vec<4, uint, Q> Result;
|
||||
Result.data = vsubq_u32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_sub<int, Q, true>
|
||||
{
|
||||
static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
|
||||
{
|
||||
vec<4, int, Q> Result;
|
||||
Result.data = vsubq_s32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_mul<float, Q, true>
|
||||
{
|
||||
static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
|
||||
{
|
||||
vec<4, float, Q> Result;
|
||||
Result.data = vmulq_f32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_mul<uint, Q, true>
|
||||
{
|
||||
static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
|
||||
{
|
||||
vec<4, uint, Q> Result;
|
||||
Result.data = vmulq_u32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_mul<int, Q, true>
|
||||
{
|
||||
static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
|
||||
{
|
||||
vec<4, int, Q> Result;
|
||||
Result.data = vmulq_s32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_div<float, Q, true>
|
||||
{
|
||||
static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
|
||||
{
|
||||
vec<4, float, Q> Result;
|
||||
Result.data = vdivq_f32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_div<uint, Q, true>
|
||||
{
|
||||
static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
|
||||
{
|
||||
vec<4, uint, Q> Result;
|
||||
Result.data = vdivq_u32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_div<int, Q, true>
|
||||
{
|
||||
static vec<4, int, Q> call(vec<4, float, Q> const& a, vec<4, int, Q> const& b)
|
||||
{
|
||||
vec<4, int, Q> Result;
|
||||
Result.data = vdivq_s32(a.data, b.data);
|
||||
return Result;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_equal<float, Q, false, 32, true>
|
||||
{
|
||||
static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2)
|
||||
{
|
||||
uint32x4_t cmp = vceqq_f32(v1.data, v2.data);
|
||||
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
|
||||
cmp = vpminq_u32(cmp, cmp);
|
||||
cmp = vpminq_u32(cmp, cmp);
|
||||
uint32_t r = cmp[0];
|
||||
#else
|
||||
uint32x2_t cmpx2 = vpmin_u32(vget_low_f32(cmp), vget_high_f32(cmp));
|
||||
cmpx2 = vpmin_u32(cmpx2, cmpx2);
|
||||
uint32_t r = cmpx2[0];
|
||||
#endif
|
||||
return r == ~0u;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_equal<uint, Q, false, 32, true>
|
||||
{
|
||||
static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2)
|
||||
{
|
||||
uint32x4_t cmp = vceqq_u32(v1.data, v2.data);
|
||||
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
|
||||
cmp = vpminq_u32(cmp, cmp);
|
||||
cmp = vpminq_u32(cmp, cmp);
|
||||
uint32_t r = cmp[0];
|
||||
#else
|
||||
uint32x2_t cmpx2 = vpmin_u32(vget_low_f32(cmp), vget_high_f32(cmp));
|
||||
cmpx2 = vpmin_u32(cmpx2, cmpx2);
|
||||
uint32_t r = cmpx2[0];
|
||||
#endif
|
||||
return r == ~0u;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_equal<int, Q, false, 32, true>
|
||||
{
|
||||
static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
|
||||
{
|
||||
uint32x4_t cmp = vceqq_s32(v1.data, v2.data);
|
||||
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
|
||||
cmp = vpminq_u32(cmp, cmp);
|
||||
cmp = vpminq_u32(cmp, cmp);
|
||||
uint32_t r = cmp[0];
|
||||
#else
|
||||
uint32x2_t cmpx2 = vpmin_u32(vget_low_f32(cmp), vget_high_f32(cmp));
|
||||
cmpx2 = vpmin_u32(cmpx2, cmpx2);
|
||||
uint32_t r = cmpx2[0];
|
||||
#endif
|
||||
return r == ~0u;
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_nequal<float, Q, false, 32, true>
|
||||
{
|
||||
static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2)
|
||||
{
|
||||
return !compute_vec4_equal<float, Q, false, 32, true>::call(v1, v2);
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_nequal<uint, Q, false, 32, true>
|
||||
{
|
||||
static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2)
|
||||
{
|
||||
return !compute_vec4_equal<uint, Q, false, 32, true>::call(v1, v2);
|
||||
}
|
||||
};
|
||||
|
||||
template<qualifier Q>
|
||||
struct compute_vec4_nequal<int, Q, false, 32, true>
|
||||
{
|
||||
static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
|
||||
{
|
||||
return !compute_vec4_equal<int, Q, false, 32, true>::call(v1, v2);
|
||||
}
|
||||
};
|
||||
|
||||
}//namespace detail
|
||||
|
||||
#if !GLM_CONFIG_XYZW_ONLY
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(float _s) :
|
||||
data(vdupq_n_f32(_s))
|
||||
{}
|
||||
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(float _s) :
|
||||
data(vdupq_n_f32(_s))
|
||||
{}
|
||||
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(float _s) :
|
||||
data(vdupq_n_f32(_s))
|
||||
{}
|
||||
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_lowp>::vec(int _s) :
|
||||
data(vdupq_n_s32(_s))
|
||||
{}
|
||||
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_mediump>::vec(int _s) :
|
||||
data(vdupq_n_s32(_s))
|
||||
{}
|
||||
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_highp>::vec(int _s) :
|
||||
data(vdupq_n_s32(_s))
|
||||
{}
|
||||
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_lowp>::vec(uint _s) :
|
||||
data(vdupq_n_u32(_s))
|
||||
{}
|
||||
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_mediump>::vec(uint _s) :
|
||||
data(vdupq_n_u32(_s))
|
||||
{}
|
||||
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_highp>::vec(uint _s) :
|
||||
data(vdupq_n_u32(_s))
|
||||
{}
|
||||
|
||||
template<>
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, float, aligned_highp>& rhs) :
|
||||
data(rhs.data)
|
||||
{}
|
||||
|
||||
template<>
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, int, aligned_highp>& rhs) :
|
||||
data(vcvtq_f32_s32(rhs.data))
|
||||
{}
|
||||
|
||||
template<>
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, uint, aligned_highp>& rhs) :
|
||||
data(vcvtq_f32_u32(rhs.data))
|
||||
{}
|
||||
|
||||
template<>
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(int _x, int _y, int _z, int _w) :
|
||||
data(vcvtq_f32_s32(vec<4, int, aligned_lowp>(_x, _y, _z, _w).data))
|
||||
{}
|
||||
|
||||
template<>
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(int _x, int _y, int _z, int _w) :
|
||||
data(vcvtq_f32_s32(vec<4, int, aligned_mediump>(_x, _y, _z, _w).data))
|
||||
{}
|
||||
|
||||
template<>
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(int _x, int _y, int _z, int _w) :
|
||||
data(vcvtq_f32_s32(vec<4, int, aligned_highp>(_x, _y, _z, _w).data))
|
||||
{}
|
||||
|
||||
template<>
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(uint _x, uint _y, uint _z, uint _w) :
|
||||
data(vcvtq_f32_u32(vec<4, uint, aligned_lowp>(_x, _y, _z, _w).data))
|
||||
{}
|
||||
|
||||
template<>
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(uint _x, uint _y, uint _z, uint _w) :
|
||||
data(vcvtq_f32_u32(vec<4, uint, aligned_mediump>(_x, _y, _z, _w).data))
|
||||
{}
|
||||
|
||||
|
||||
template<>
|
||||
template<>
|
||||
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(uint _x, uint _y, uint _z, uint _w) :
|
||||
data(vcvtq_f32_u32(vec<4, uint, aligned_highp>(_x, _y, _z, _w).data))
|
||||
{}
|
||||
|
||||
#endif
|
||||
}//namespace glm
|
||||
|
||||
#endif
|
||||
|
@ -235,10 +235,11 @@
|
||||
|
||||
// User defines: GLM_FORCE_PURE GLM_FORCE_INTRINSICS GLM_FORCE_SSE2 GLM_FORCE_SSE3 GLM_FORCE_AVX GLM_FORCE_AVX2 GLM_FORCE_AVX2
|
||||
|
||||
#define GLM_ARCH_MIPS_BIT (0x10000000)
|
||||
#define GLM_ARCH_PPC_BIT (0x20000000)
|
||||
#define GLM_ARCH_ARM_BIT (0x40000000)
|
||||
#define GLM_ARCH_X86_BIT (0x80000000)
|
||||
#define GLM_ARCH_MIPS_BIT (0x10000000)
|
||||
#define GLM_ARCH_PPC_BIT (0x20000000)
|
||||
#define GLM_ARCH_ARM_BIT (0x40000000)
|
||||
#define GLM_ARCH_ARMV8_BIT (0x01000000)
|
||||
#define GLM_ARCH_X86_BIT (0x80000000)
|
||||
|
||||
#define GLM_ARCH_SIMD_BIT (0x00001000)
|
||||
|
||||
@ -263,6 +264,7 @@
|
||||
#define GLM_ARCH_AVX (GLM_ARCH_AVX_BIT | GLM_ARCH_SSE42)
|
||||
#define GLM_ARCH_AVX2 (GLM_ARCH_AVX2_BIT | GLM_ARCH_AVX)
|
||||
#define GLM_ARCH_ARM (GLM_ARCH_ARM_BIT)
|
||||
#define GLM_ARCH_ARMV8 (GLM_ARCH_NEON_BIT | GLM_ARCH_SIMD_BIT | GLM_ARCH_ARM | GLM_ARCH_ARMV8_BIT)
|
||||
#define GLM_ARCH_NEON (GLM_ARCH_NEON_BIT | GLM_ARCH_SIMD_BIT | GLM_ARCH_ARM)
|
||||
#define GLM_ARCH_MIPS (GLM_ARCH_MIPS_BIT)
|
||||
#define GLM_ARCH_PPC (GLM_ARCH_PPC_BIT)
|
||||
@ -270,7 +272,11 @@
|
||||
#if defined(GLM_FORCE_ARCH_UNKNOWN) || defined(GLM_FORCE_PURE)
|
||||
# define GLM_ARCH GLM_ARCH_UNKNOWN
|
||||
#elif defined(GLM_FORCE_NEON)
|
||||
# define GLM_ARCH (GLM_ARCH_NEON)
|
||||
# if __ARM_ARCH >= 8
|
||||
# define GLM_ARCH (GLM_ARCH_ARMV8)
|
||||
# else
|
||||
# define GLM_ARCH (GLM_ARCH_NEON)
|
||||
# endif
|
||||
# define GLM_FORCE_INTRINSICS
|
||||
#elif defined(GLM_FORCE_AVX2)
|
||||
# define GLM_ARCH (GLM_ARCH_AVX2)
|
||||
@ -313,9 +319,14 @@
|
||||
# define GLM_ARCH (GLM_ARCH_SSE2)
|
||||
# elif defined(__i386__)
|
||||
# define GLM_ARCH (GLM_ARCH_X86)
|
||||
# elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8)
|
||||
# define GLM_ARCH (GLM_ARCH_ARMV8)
|
||||
#warning "ARM v8"
|
||||
# elif defined(__ARM_NEON)
|
||||
#warning "ARM NEON"
|
||||
# define GLM_ARCH (GLM_ARCH_ARM | GLM_ARCH_NEON)
|
||||
# elif defined(__arm__ ) || defined(_M_ARM)
|
||||
#warning "ARM v6"
|
||||
# define GLM_ARCH (GLM_ARCH_ARM)
|
||||
# elif defined(__mips__ )
|
||||
# define GLM_ARCH (GLM_ARCH_MIPS)
|
||||
@ -355,6 +366,8 @@
|
||||
# include <pmmintrin.h>
|
||||
#elif GLM_ARCH & GLM_ARCH_SSE2_BIT
|
||||
# include <emmintrin.h>
|
||||
#elif GLM_ARCH & GLM_ARCH_NEON_BIT
|
||||
# include <arm_neon.h>
|
||||
#endif//GLM_ARCH
|
||||
|
||||
#if GLM_ARCH & GLM_ARCH_SSE2_BIT
|
||||
@ -380,3 +393,9 @@
|
||||
typedef __m256i glm_i64vec4;
|
||||
typedef __m256i glm_u64vec4;
|
||||
#endif
|
||||
|
||||
#if GLM_ARCH & GLM_ARCH_NEON_BIT
|
||||
typedef float32x4_t glm_f32vec4;
|
||||
typedef int32x4_t glm_i32vec4;
|
||||
typedef uint32x4_t glm_u32vec4;
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user