Optimizations of log2 for ivec4

2024-11-23 01:14:34 +00:00 · 2014-11-24 01:56:36 +01:00 · 2014-11-24 01:56:36 +01:00 · e8fbcf76dd
commit e8fbcf76dd
parent 117634c7ea
5 changed files with 171 additions and 40 deletions
--- a/glm/detail/func_exponential.inl
+++ b/glm/detail/func_exponential.inl
@ -35,20 +35,22 @@
 namespace glm{
 namespace detail
 {
-	template <bool isFloat>
-	struct compute_log2{};
-
-	template <>
-	struct compute_log2<true>
-	{
-		template <typename T>
-		GLM_FUNC_QUALIFIER T operator() (T Value) const
-		{
 #	if GLM_LANG & GLM_LANG_CXX11_FLAG
-				return std::log2(Value);
+		using std::log2;
 #	else
-				return std::log(Value) * static_cast<T>(1.4426950408889634073599246810019);
+		template <typename genType>
+		genType log2(genType Value)
+		{
+			return std::log(Value) * static_cast<genType>(1.4426950408889634073599246810019);
+		}
 #	endif
+
+	template <typename T, precision P, template <class, precision> class vecType, bool isFloat = true>
+	struct compute_log2
+	{
+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & vec)
+		{
+			return detail::functor1<T, T, P, vecType>::call(log2, vec);
 		}
 	};

@ -121,17 +123,13 @@ namespace detail
 	template <typename genType>
 	GLM_FUNC_QUALIFIER genType log2(genType x)
 	{
-		GLM_STATIC_ASSERT(std::numeric_limits<genType>::is_iec559 || std::numeric_limits<genType>::is_integer,
-			"GLM core 'log2' only accept floating-point inputs. Include <glm/gtx/integer.hpp> for additional integer support.");
-
-		assert(x > genType(0)); // log2 is only defined on the range (0, inf]
-		return detail::compute_log2<std::numeric_limits<genType>::is_iec559>()(x);
+		return log2(tvec1<genType>(x)).x;
 	}

 	template <typename T, precision P, template <typename, precision> class vecType>
 	GLM_FUNC_QUALIFIER vecType<T, P> log2(vecType<T, P> const & x)
 	{
-		return detail::functor1<T, T, P, vecType>::call(log2, x);
+		return detail::compute_log2<T, P, vecType, std::numeric_limits<T>::is_iec559>::call(x);
 	}

 	// sqrt
--- a/glm/detail/setup.hpp
+++ b/glm/detail/setup.hpp
@ -564,11 +564,12 @@
 // User defines: GLM_FORCE_PURE GLM_FORCE_SSE2 GLM_FORCE_SSE3 GLM_FORCE_AVX GLM_FORCE_AVX2

 #define GLM_ARCH_PURE		0x0000
-#define GLM_ARCH_SSE2		0x0001
-#define GLM_ARCH_SSE3		0x0002
-#define GLM_ARCH_SSE4		0x0004
-#define GLM_ARCH_AVX		0x0008
-#define GLM_ARCH_AVX2		0x0010
+#define GLM_ARCH_X86		0x0001
+#define GLM_ARCH_SSE2		0x0002
+#define GLM_ARCH_SSE3		0x0004
+#define GLM_ARCH_SSE4		0x0008
+#define GLM_ARCH_AVX		0x0010
+#define GLM_ARCH_AVX2		0x0020

 #if defined(GLM_FORCE_PURE)
 #	define GLM_ARCH GLM_ARCH_PURE
--- a/glm/gtc/integer.inl
+++ b/glm/gtc/integer.inl
@ -29,19 +29,35 @@
 namespace glm{
 namespace detail
 {
-	GLM_FUNC_QUALIFIER unsigned int nlz(unsigned int x) 
+	template <typename T, precision P, template <class, precision> class vecType>
+	struct compute_log2<T, P, vecType, false>
 	{
-		return 31u - findMSB(x);
-	}
-
-	template <>
-	struct compute_log2<false>
+		GLM_FUNC_QUALIFIER static vecType<T, P> call(vecType<T, P> const & vec)
 		{
-		template <typename T>
-		GLM_FUNC_QUALIFIER T operator() (T const & Value) const
-		{
-			return Value <= static_cast<T>(1) ? T(0) : T(32) - nlz(Value - T(1));
+			//Equivalent to return findMSB(vec); but save one function call in ASM with VC
+			//return findMSB(vec);
+			return detail::compute_findMSB_vec<T, P, vecType, sizeof(T) * 8>::call(vec);
 		}
 	};
+
+#	if(GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & (GLM_COMPILER_VC | GLM_COMPILER_APPLE_CLANG | GLM_COMPILER_LLVM))
+
+	template <precision P>
+	struct compute_log2<int, P, tvec4, false>
+	{
+		GLM_FUNC_QUALIFIER static tvec4<int, P> call(tvec4<int, P> const & vec)
+		{
+			tvec4<int, P> Result(glm::uninitialize);
+
+			_BitScanReverse(reinterpret_cast<unsigned long*>(&Result.x), vec.x);
+			_BitScanReverse(reinterpret_cast<unsigned long*>(&Result.y), vec.y);
+			_BitScanReverse(reinterpret_cast<unsigned long*>(&Result.z), vec.z);
+			_BitScanReverse(reinterpret_cast<unsigned long*>(&Result.w), vec.w);
+
+			return Result;
+		}
+	};
+
+#	endif//GLM_ARCH != GLM_ARCH_PURE
 }//namespace detail
 }//namespace glm
--- a/test/core/core_func_common.cpp
+++ b/test/core/core_func_common.cpp
@ -887,12 +887,6 @@ namespace sign
 			Error += Data[i].Return == Result ? 0 : 1;
 		}

-		for(std::size_t i = 0; i < sizeof(Data) / sizeof(type<glm::int32>); ++i)
-		{
-			glm::int32 Result = sign_sub(Data[i].Value);
-			Error += Data[i].Return == Result ? 0 : 1;
-		}
-
 		return Error;
 	}

--- a/test/gtc/gtc_integer.cpp
+++ b/test/gtc/gtc_integer.cpp
@ -7,9 +7,11 @@
 // File    : test/gtc/integer.cpp
 ///////////////////////////////////////////////////////////////////////////////////////////////////

+#define GLM_FORCE_INLINE
 #include <glm/gtc/integer.hpp>
 #include <glm/gtc/type_precision.hpp>
 #include <glm/gtc/vec1.hpp>
+#include <glm/gtx/type_aligned.hpp>
 #include <glm/vector_relational.hpp>
 #include <glm/vec2.hpp>
 #include <glm/vec3.hpp>
@ -48,6 +50,126 @@ namespace log2_
 	int perf()
 	{
 		int Error = 0;
+		std::size_t const Count(100000000);
+
+		{
+			std::vector<int> Result;
+			Result.resize(Count);
+
+			std::clock_t Begin = clock();
+
+			for(std::size_t i = 0; i < Count; ++i)
+				Result[i] = glm::log2(static_cast<int>(i));
+
+			std::clock_t End = clock();
+
+			printf("glm::log2<int>: %d clocks\n", End - Begin);
+		}
+
+		{
+			std::vector<glm::ivec4> Result;
+			Result.resize(Count);
+
+			std::clock_t Begin = clock();
+
+			for(std::size_t i = 0; i < Count; ++i)
+				Result[i] = glm::log2(glm::ivec4(i));
+
+			std::clock_t End = clock();
+
+			printf("glm::log2<ivec4>: %d clocks\n", End - Begin);
+		}
+
+#		if(GLM_ARCH != GLM_ARCH_PURE) && (GLM_COMPILER & (GLM_COMPILER_VC | GLM_COMPILER_APPLE_CLANG | GLM_COMPILER_LLVM))
+		{
+			std::vector<glm::ivec4> Result;
+			Result.resize(Count);
+
+			std::clock_t Begin = clock();
+
+			for(std::size_t i = 0; i < Count; ++i)
+			{
+				glm::tvec4<unsigned long, glm::defaultp> Tmp(glm::uninitialize);
+				_BitScanReverse(&Tmp.x, i);
+				_BitScanReverse(&Tmp.y, i);
+				_BitScanReverse(&Tmp.z, i);
+				_BitScanReverse(&Tmp.w, i);
+				Result[i] = glm::ivec4(Tmp);
+			}
+
+			std::clock_t End = clock();
+
+			printf("glm::log2<ivec4> inlined: %d clocks\n", End - Begin);
+		}
+
+
+		{
+			std::vector<glm::tvec4<unsigned long, glm::defaultp> > Result;
+			Result.resize(Count);
+
+			std::clock_t Begin = clock();
+
+			for(std::size_t i = 0; i < Count; ++i)
+			{
+				_BitScanReverse(&Result[i].x, i);
+				_BitScanReverse(&Result[i].y, i);
+				_BitScanReverse(&Result[i].z, i);
+				_BitScanReverse(&Result[i].w, i);
+			}
+
+			std::clock_t End = clock();
+
+			printf("glm::log2<ivec4> inlined no cast: %d clocks\n", End - Begin);
+		}
+
+
+		{
+			std::vector<glm::ivec4> Result;
+			Result.resize(Count);
+
+			std::clock_t Begin = clock();
+
+			for(std::size_t i = 0; i < Count; ++i)
+			{
+				_BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].x), i);
+				_BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].y), i);
+				_BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].z), i);
+				_BitScanReverse(reinterpret_cast<unsigned long*>(&Result[i].w), i);
+			}
+
+			std::clock_t End = clock();
+
+			printf("glm::log2<ivec4> reinterpret: %d clocks\n", End - Begin);
+		}
+#		endif//GLM_ARCH != GLM_ARCH_PURE
+
+		{
+			std::vector<float> Result;
+			Result.resize(Count);
+
+			std::clock_t Begin = clock();
+
+			for(std::size_t i = 0; i < Count; ++i)
+				Result[i] = glm::log2(static_cast<float>(i));
+
+			std::clock_t End = clock();
+
+			printf("glm::log2<float>: %d clocks\n", End - Begin);
+		}
+
+		{
+			std::vector<glm::vec4> Result;
+			Result.resize(Count);
+
+			std::clock_t Begin = clock();
+
+			for(std::size_t i = 0; i < Count; ++i)
+				Result[i] = glm::log2(glm::vec4(i));
+
+			std::clock_t End = clock();
+
+			printf("glm::log2<vec4>: %d clocks\n", End - Begin);
+		}

 		return Error;
 	}