From 3e20cc66544de4f8f872f581339167fa63b938f8 Mon Sep 17 00:00:00 2001 From: sharkautarch <128002472+sharkautarch@users.noreply.github.com> Date: Sun, 15 Sep 2024 11:50:07 -0400 Subject: [PATCH] constexpr simd vec: perf tuning for packed vec3 --- glm/detail/qualifier.hpp | 14 +++-- glm/detail/simd_constexpr/simd_helpers.inl | 60 ++++++++++++---------- glm/detail/simd_constexpr/vec.hpp | 27 ++++++---- 3 files changed, 62 insertions(+), 39 deletions(-) diff --git a/glm/detail/qualifier.hpp b/glm/detail/qualifier.hpp index 1964ad1a..117c5759 100644 --- a/glm/detail/qualifier.hpp +++ b/glm/detail/qualifier.hpp @@ -109,15 +109,20 @@ namespace detail # if GLM_ARCH & GLM_ARCH_SSE2_BIT #if defined(__clang__) || defined(__GNUC__) +#if __x86_64__ +#define ATTR(size) __attribute__((packed,aligned(size))) +#else +#define ATTR(size) +#endif template - struct storage<2, T, false> + struct ATTR(sizeof(T)/2) storage<2, T, false> { - typedef T type __attribute__((aligned(sizeof(T)),vector_size(2*sizeof(T)))); + typedef T type __attribute__((aligned(sizeof(T)/2),vector_size(2*sizeof(T)))); }; template - struct storage<1, T, false> + struct ATTR(1) storage<1, T, false> { - typedef T type __attribute__((aligned(1),vector_size(sizeof(T)))); + typedef T type; }; template struct storage<2, T, true> @@ -129,6 +134,7 @@ namespace detail { typedef T type __attribute__((aligned(sizeof(T)),vector_size(sizeof(T)))); }; +#undef ATTR #endif template<> struct storage<4, float, true> diff --git a/glm/detail/simd_constexpr/simd_helpers.inl b/glm/detail/simd_constexpr/simd_helpers.inl index 4ebc3434..3652cc50 100644 --- a/glm/detail/simd_constexpr/simd_helpers.inl +++ b/glm/detail/simd_constexpr/simd_helpers.inl @@ -9,47 +9,53 @@ namespace glm::detail using FirstTx = Tx0; }; template - using PaddedVec = PaddedGccVec()>; - using gcc_vec_t = PaddedVec::GccV; + using GccVec = typename detail::GccVExt::GccV; + using gcc_vec_t = GccVec; using data_t = typename detail::storage::value>::type; - static inline auto __attribute__((always_inline)) gcc_vec_to_data(PaddedVec v) { - if constexpr (L == 3 && !BIsAlignedQ()) { - data_t d; - std::memcpy(&d, &v, sizeof(d)); - return d; - } else { + static inline auto __attribute__((always_inline)) gcc_vec_to_data(auto v) { + static constexpr auto size = std::min(sizeof(v), sizeof(data_t)); + static constexpr auto biggerSize = std::max(sizeof(v), sizeof(data_t)); + if constexpr (size == biggerSize) { return std::bit_cast(v); + } else { + data_t d; + std::memcpy(&d, &v, size); + return d; } } static inline auto __attribute__((always_inline)) simd_ctor_scalar(arithmetic auto scalar) { - PaddedVec v = {}; - v.gcc_vec = v.gcc_vec + ( (T)scalar ); + gcc_vec_t v = gcc_vec_t{} + ( (T)scalar ); + using Tx = decltype(scalar); + scalar.Tx::~Tx(); return gcc_vec_to_data(v); } template requires (Lx == L) static inline auto __attribute__((always_inline)) simd_ctor(::glm::vec v) { - using OtherPaddedVec = PaddedVec; - OtherPaddedVec o = std::bit_cast(v.data); - PaddedVec converted = {.gcc_vec=__builtin_convertvector(o.gcc_vec, gcc_vec_t)}; + using OtherVec = GccVec; + OtherVec o; + static constexpr auto size = std::min(sizeof(v.data), sizeof(o)); + std::memcpy(&o, &(v.data), size); + using o_vec_t = decltype(v); + v.o_vec_t::~o_vec_t(); + gcc_vec_t converted = __builtin_convertvector(o, gcc_vec_t); return gcc_vec_to_data(converted); } template requires (Lx != L && Lx < L) static inline auto __attribute__((always_inline)) simd_ctor(::glm::vec v) { - using OtherPaddedVec = PaddedVec; - using OurSizeTheirType = PaddedVec; - OtherPaddedVec o = std::bit_cast(v.data); - OurSizeTheirType oExpanded = {}; - for (length_t i = 0; i < Lx; i++) { - oExpanded.gcc_vec[i] = o.gcc_vec[i]; - } + using OurSizeTheirType = GccVec; + static constexpr auto size = std::min(sizeof(OurSizeTheirType), sizeof(v.data)); + OurSizeTheirType oExpanded; + std::memcpy(&oExpanded, &(v.data), size); + using o_vec_t = decltype(v); + v.o_vec_t::~o_vec_t(); - PaddedVec converted = {.gcc_vec=__builtin_convertvector(oExpanded.gcc_vec, gcc_vec_t)}; + gcc_vec_t converted = __builtin_convertvector(oExpanded, gcc_vec_t); return gcc_vec_to_data(converted); } @@ -62,11 +68,13 @@ namespace glm::detail static inline auto __attribute__((always_inline)) simd_ctor_multi_scalars(A... scalars) requires ( isLengthOfVector() && SameArithmeticTypes()) { //assuming that number of scalars is always the same as the length of the to-be-constructed vector - using Tx = typename GetFirstType::FirstTx; - using OtherPaddedVec = PaddedVec; - typename OtherPaddedVec::GccV o = {Tx(scalars)...}; - PaddedVec converted = {.gcc_vec=__builtin_convertvector(o, gcc_vec_t)}; - return gcc_vec_to_data(converted); + gcc_vec_t v; + std::array pack{scalars...}; + for (int i = 0; i != sizeof...(scalars); i++ ) { + v[i] = pack[i]; + pack[i].T::~T(); + } + return gcc_vec_to_data(v); } }; } \ No newline at end of file diff --git a/glm/detail/simd_constexpr/vec.hpp b/glm/detail/simd_constexpr/vec.hpp index a730dae1..5532b2e8 100644 --- a/glm/detail/simd_constexpr/vec.hpp +++ b/glm/detail/simd_constexpr/vec.hpp @@ -94,9 +94,10 @@ namespace glm }; } + /*template + using PaddedGccVec = detail::PaddedGccVec()>;*/ template - using PaddedGccVec = detail::PaddedGccVec()>; - + using GccVec = typename detail::GccVExt::GccV; template using VecDataArray = detail::VecDataArray()>; @@ -161,6 +162,7 @@ namespace glm switch (i) { default: + __builtin_unreachable(); case 0: return x; case 1: { @@ -192,6 +194,7 @@ namespace glm switch (i) { default: + __builtin_unreachable(); case 0: return x; case 1: { @@ -296,7 +299,7 @@ namespace glm { using VTX = decltype(vs0); if constexpr ( std::is_integral_v || std::is_floating_point_v ) { - return RetArr<1>{vs0}; + return RetArr<1>{(T)vs0}; } else if constexpr ( ( requires { VTX::k_len; }) ) { using Tx = VTX::value_type; using ArrX = VecDataArray; @@ -323,7 +326,7 @@ namespace glm : EC {.data= [scalar...]() -> data_t { - if (std::is_constant_evaluated()) { + if (std::is_constant_evaluated() || (L == 3 && !BIsAlignedQ())) { DataArray a = {.p={ T(scalar)... }}; return std::bit_cast(a); } else { @@ -454,7 +457,7 @@ namespace glm } template - inline GLM_CONSTEXPR vec & operator*=(vec v) + inline GLM_CONSTEXPR vec & operator*=(vec const& __restrict__ v) __restrict__ { if constexpr (L < 3) { this->data *= v.data; @@ -788,9 +791,12 @@ namespace glm } - friend inline GLM_CONSTEXPR vec operator*(vec v1, vec v2) + friend inline GLM_CONSTEXPR vec __attribute__((const, always_inline, nothrow, no_stack_protector)) operator*(vec v1, vec const& __restrict__ v2) { - return vec(v1) *= v2; + if constexpr (L == 3 && !BIsAlignedQ()) + return *(new (&v1) vec(v1.x*v2.x, v1.y*v2.y, v1.z*v2.z)); + else + return v1 *= v2; } @@ -813,9 +819,12 @@ namespace glm } template - friend inline GLM_CONSTEXPR vec operator/(vec v1, vec v2) requires (!NotVec1 && NotVec1) + friend inline GLM_CONSTEXPR vec operator/(vec v1, vec && __restrict__ v2) requires (!NotVec1 && NotVec1) { - return vec(v1.x) /= v2; + if constexpr (L == 3 && !BIsAlignedQ()) + return *(new (&v2) vec( v1.data / v2.x, v1.data/v2.y, v1.data/v2.z )); + else + return vec(v1.x) /= v2; }