Add Neon to glm

A few simple functions that use Neon as compiler does not
use the full potential of Neon
For now, -DGLM_FORCE_NEON is required until it's the default
This commit is contained in:
Amaury Le Leyzour 2019-08-27 11:42:48 -07:00
parent ca52121e1b
commit cd3cc166b4
3 changed files with 378 additions and 5 deletions

View File

@ -167,6 +167,26 @@ namespace detail
};
# endif
# if GLM_ARCH & GLM_ARCH_NEON_BIT
template<>
struct storage<4, float, true>
{
typedef glm_f32vec4 type;
};
template<>
struct storage<4, int, true>
{
typedef glm_i32vec4 type;
};
template<>
struct storage<4, unsigned int, true>
{
typedef glm_u32vec4 type;
};
# endif
enum genTypeEnum
{
GENTYPE_VEC,

View File

@ -461,3 +461,337 @@ namespace detail
}//namespace glm
#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
#if GLM_ARCH & GLM_ARCH_NEON_BIT
namespace glm {
namespace detail {
template<qualifier Q>
struct compute_vec4_add<float, Q, true>
{
static
vec<4, float, Q>
call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
{
vec<4, float, Q> Result;
Result.data = vaddq_f32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_add<uint, Q, true>
{
static
vec<4, uint, Q>
call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
{
vec<4, uint, Q> Result;
Result.data = vaddq_u32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_add<int, Q, true>
{
static
vec<4, int, Q>
call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
{
vec<4, uint, Q> Result;
Result.data = vaddq_s32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_sub<float, Q, true>
{
static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
{
vec<4, float, Q> Result;
Result.data = vsubq_f32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_sub<uint, Q, true>
{
static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
{
vec<4, uint, Q> Result;
Result.data = vsubq_u32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_sub<int, Q, true>
{
static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
{
vec<4, int, Q> Result;
Result.data = vsubq_s32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_mul<float, Q, true>
{
static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
{
vec<4, float, Q> Result;
Result.data = vmulq_f32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_mul<uint, Q, true>
{
static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
{
vec<4, uint, Q> Result;
Result.data = vmulq_u32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_mul<int, Q, true>
{
static vec<4, int, Q> call(vec<4, int, Q> const& a, vec<4, int, Q> const& b)
{
vec<4, int, Q> Result;
Result.data = vmulq_s32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_div<float, Q, true>
{
static vec<4, float, Q> call(vec<4, float, Q> const& a, vec<4, float, Q> const& b)
{
vec<4, float, Q> Result;
Result.data = vdivq_f32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_div<uint, Q, true>
{
static vec<4, uint, Q> call(vec<4, uint, Q> const& a, vec<4, uint, Q> const& b)
{
vec<4, uint, Q> Result;
Result.data = vdivq_u32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_div<int, Q, true>
{
static vec<4, int, Q> call(vec<4, float, Q> const& a, vec<4, int, Q> const& b)
{
vec<4, int, Q> Result;
Result.data = vdivq_s32(a.data, b.data);
return Result;
}
};
template<qualifier Q>
struct compute_vec4_equal<float, Q, false, 32, true>
{
static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2)
{
uint32x4_t cmp = vceqq_f32(v1.data, v2.data);
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
cmp = vpminq_u32(cmp, cmp);
cmp = vpminq_u32(cmp, cmp);
uint32_t r = cmp[0];
#else
uint32x2_t cmpx2 = vpmin_u32(vget_low_f32(cmp), vget_high_f32(cmp));
cmpx2 = vpmin_u32(cmpx2, cmpx2);
uint32_t r = cmpx2[0];
#endif
return r == ~0u;
}
};
template<qualifier Q>
struct compute_vec4_equal<uint, Q, false, 32, true>
{
static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2)
{
uint32x4_t cmp = vceqq_u32(v1.data, v2.data);
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
cmp = vpminq_u32(cmp, cmp);
cmp = vpminq_u32(cmp, cmp);
uint32_t r = cmp[0];
#else
uint32x2_t cmpx2 = vpmin_u32(vget_low_f32(cmp), vget_high_f32(cmp));
cmpx2 = vpmin_u32(cmpx2, cmpx2);
uint32_t r = cmpx2[0];
#endif
return r == ~0u;
}
};
template<qualifier Q>
struct compute_vec4_equal<int, Q, false, 32, true>
{
static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
{
uint32x4_t cmp = vceqq_s32(v1.data, v2.data);
#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
cmp = vpminq_u32(cmp, cmp);
cmp = vpminq_u32(cmp, cmp);
uint32_t r = cmp[0];
#else
uint32x2_t cmpx2 = vpmin_u32(vget_low_f32(cmp), vget_high_f32(cmp));
cmpx2 = vpmin_u32(cmpx2, cmpx2);
uint32_t r = cmpx2[0];
#endif
return r == ~0u;
}
};
template<qualifier Q>
struct compute_vec4_nequal<float, Q, false, 32, true>
{
static bool call(vec<4, float, Q> const& v1, vec<4, float, Q> const& v2)
{
return !compute_vec4_equal<float, Q, false, 32, true>::call(v1, v2);
}
};
template<qualifier Q>
struct compute_vec4_nequal<uint, Q, false, 32, true>
{
static bool call(vec<4, uint, Q> const& v1, vec<4, uint, Q> const& v2)
{
return !compute_vec4_equal<uint, Q, false, 32, true>::call(v1, v2);
}
};
template<qualifier Q>
struct compute_vec4_nequal<int, Q, false, 32, true>
{
static bool call(vec<4, int, Q> const& v1, vec<4, int, Q> const& v2)
{
return !compute_vec4_equal<int, Q, false, 32, true>::call(v1, v2);
}
};
}//namespace detail
#if !GLM_CONFIG_XYZW_ONLY
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(float _s) :
data(vdupq_n_f32(_s))
{}
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(float _s) :
data(vdupq_n_f32(_s))
{}
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(float _s) :
data(vdupq_n_f32(_s))
{}
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_lowp>::vec(int _s) :
data(vdupq_n_s32(_s))
{}
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_mediump>::vec(int _s) :
data(vdupq_n_s32(_s))
{}
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, int, aligned_highp>::vec(int _s) :
data(vdupq_n_s32(_s))
{}
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_lowp>::vec(uint _s) :
data(vdupq_n_u32(_s))
{}
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_mediump>::vec(uint _s) :
data(vdupq_n_u32(_s))
{}
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, uint, aligned_highp>::vec(uint _s) :
data(vdupq_n_u32(_s))
{}
template<>
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, float, aligned_highp>& rhs) :
data(rhs.data)
{}
template<>
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, int, aligned_highp>& rhs) :
data(vcvtq_f32_s32(rhs.data))
{}
template<>
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(const vec<4, uint, aligned_highp>& rhs) :
data(vcvtq_f32_u32(rhs.data))
{}
template<>
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(int _x, int _y, int _z, int _w) :
data(vcvtq_f32_s32(vec<4, int, aligned_lowp>(_x, _y, _z, _w).data))
{}
template<>
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(int _x, int _y, int _z, int _w) :
data(vcvtq_f32_s32(vec<4, int, aligned_mediump>(_x, _y, _z, _w).data))
{}
template<>
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(int _x, int _y, int _z, int _w) :
data(vcvtq_f32_s32(vec<4, int, aligned_highp>(_x, _y, _z, _w).data))
{}
template<>
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_lowp>::vec(uint _x, uint _y, uint _z, uint _w) :
data(vcvtq_f32_u32(vec<4, uint, aligned_lowp>(_x, _y, _z, _w).data))
{}
template<>
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_mediump>::vec(uint _x, uint _y, uint _z, uint _w) :
data(vcvtq_f32_u32(vec<4, uint, aligned_mediump>(_x, _y, _z, _w).data))
{}
template<>
template<>
GLM_FUNC_QUALIFIER GLM_CONSTEXPR vec<4, float, aligned_highp>::vec(uint _x, uint _y, uint _z, uint _w) :
data(vcvtq_f32_u32(vec<4, uint, aligned_highp>(_x, _y, _z, _w).data))
{}
#endif
}//namespace glm
#endif

View File

@ -235,10 +235,11 @@
// User defines: GLM_FORCE_PURE GLM_FORCE_INTRINSICS GLM_FORCE_SSE2 GLM_FORCE_SSE3 GLM_FORCE_AVX GLM_FORCE_AVX2 GLM_FORCE_AVX2
#define GLM_ARCH_MIPS_BIT (0x10000000)
#define GLM_ARCH_PPC_BIT (0x20000000)
#define GLM_ARCH_ARM_BIT (0x40000000)
#define GLM_ARCH_X86_BIT (0x80000000)
#define GLM_ARCH_MIPS_BIT (0x10000000)
#define GLM_ARCH_PPC_BIT (0x20000000)
#define GLM_ARCH_ARM_BIT (0x40000000)
#define GLM_ARCH_ARMV8_BIT (0x01000000)
#define GLM_ARCH_X86_BIT (0x80000000)
#define GLM_ARCH_SIMD_BIT (0x00001000)
@ -263,6 +264,7 @@
#define GLM_ARCH_AVX (GLM_ARCH_AVX_BIT | GLM_ARCH_SSE42)
#define GLM_ARCH_AVX2 (GLM_ARCH_AVX2_BIT | GLM_ARCH_AVX)
#define GLM_ARCH_ARM (GLM_ARCH_ARM_BIT)
#define GLM_ARCH_ARMV8 (GLM_ARCH_NEON_BIT | GLM_ARCH_SIMD_BIT | GLM_ARCH_ARM | GLM_ARCH_ARMV8_BIT)
#define GLM_ARCH_NEON (GLM_ARCH_NEON_BIT | GLM_ARCH_SIMD_BIT | GLM_ARCH_ARM)
#define GLM_ARCH_MIPS (GLM_ARCH_MIPS_BIT)
#define GLM_ARCH_PPC (GLM_ARCH_PPC_BIT)
@ -270,7 +272,11 @@
#if defined(GLM_FORCE_ARCH_UNKNOWN) || defined(GLM_FORCE_PURE)
# define GLM_ARCH GLM_ARCH_UNKNOWN
#elif defined(GLM_FORCE_NEON)
# define GLM_ARCH (GLM_ARCH_NEON)
# if __ARM_ARCH >= 8
# define GLM_ARCH (GLM_ARCH_ARMV8)
# else
# define GLM_ARCH (GLM_ARCH_NEON)
# endif
# define GLM_FORCE_INTRINSICS
#elif defined(GLM_FORCE_AVX2)
# define GLM_ARCH (GLM_ARCH_AVX2)
@ -313,9 +319,14 @@
# define GLM_ARCH (GLM_ARCH_SSE2)
# elif defined(__i386__)
# define GLM_ARCH (GLM_ARCH_X86)
# elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8)
# define GLM_ARCH (GLM_ARCH_ARMV8)
#warning "ARM v8"
# elif defined(__ARM_NEON)
#warning "ARM NEON"
# define GLM_ARCH (GLM_ARCH_ARM | GLM_ARCH_NEON)
# elif defined(__arm__ ) || defined(_M_ARM)
#warning "ARM v6"
# define GLM_ARCH (GLM_ARCH_ARM)
# elif defined(__mips__ )
# define GLM_ARCH (GLM_ARCH_MIPS)
@ -355,6 +366,8 @@
# include <pmmintrin.h>
#elif GLM_ARCH & GLM_ARCH_SSE2_BIT
# include <emmintrin.h>
#elif GLM_ARCH & GLM_ARCH_NEON_BIT
# include <arm_neon.h>
#endif//GLM_ARCH
#if GLM_ARCH & GLM_ARCH_SSE2_BIT
@ -380,3 +393,9 @@
typedef __m256i glm_i64vec4;
typedef __m256i glm_u64vec4;
#endif
#if GLM_ARCH & GLM_ARCH_NEON_BIT
typedef float32x4_t glm_f32vec4;
typedef int32x4_t glm_i32vec4;
typedef uint32x4_t glm_u32vec4;
#endif