[libc] Fix GPU benchmarking
This commit is contained in:
parent
cfa918bec1
commit
de59e7b86c
@ -7,9 +7,9 @@
|
||||
#include "src/__support/GPU/utils.h"
|
||||
#include "src/__support/fixedvector.h"
|
||||
#include "src/__support/macros/config.h"
|
||||
#include "src/__support/time/gpu/time_utils.h"
|
||||
#include "src/stdio/printf.h"
|
||||
#include "src/stdlib/srand.h"
|
||||
#include "src/time/gpu/time_utils.h"
|
||||
|
||||
namespace LIBC_NAMESPACE_DECL {
|
||||
namespace benchmarks {
|
||||
|
@ -3,12 +3,8 @@
|
||||
#include "src/math/atan2.h"
|
||||
#include "src/stdlib/rand.h"
|
||||
|
||||
#ifdef NVPTX_MATH_FOUND
|
||||
#include "src/math/nvptx/declarations.h"
|
||||
#endif
|
||||
|
||||
#ifdef AMDGPU_MATH_FOUND
|
||||
#include "src/math/amdgpu/declarations.h"
|
||||
#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
|
||||
#include "platform.h"
|
||||
#endif
|
||||
|
||||
#define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \
|
||||
@ -33,15 +29,15 @@ BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30);
|
||||
BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000);
|
||||
|
||||
#ifdef NVPTX_MATH_FOUND
|
||||
BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023);
|
||||
BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3);
|
||||
BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30);
|
||||
BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000);
|
||||
BENCH(double, NvAtan2, __nv_atan2, -1023, 1023);
|
||||
BENCH(double, NvAtan2TwoPi, __nv_atan2, -10, 3);
|
||||
BENCH(double, NvAtan2TwoPow30, __nv_atan2, 0, 30);
|
||||
BENCH(double, NvAtan2Large, __nv_atan2, 30, 1000);
|
||||
#endif
|
||||
|
||||
#ifdef AMDGPU_MATH_FOUND
|
||||
BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023);
|
||||
BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3);
|
||||
BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30);
|
||||
BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000);
|
||||
BENCH(double, AmdAtan2, __ocml_atan2_f64, -1023, 1023);
|
||||
BENCH(double, AmdAtan2TwoPi, __ocml_atan2_f64, -10, 3);
|
||||
BENCH(double, AmdAtan2TwoPow30, __ocml_atan2_f64, 0, 30);
|
||||
BENCH(double, AmdAtan2Large, __ocml_atan2_f64, 30, 1000);
|
||||
#endif
|
||||
|
57
libc/benchmarks/gpu/src/math/platform.h
Normal file
57
libc/benchmarks/gpu/src/math/platform.h
Normal file
@ -0,0 +1,57 @@
|
||||
//===-- Platform specific definitions for math support --------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#ifndef LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H
|
||||
#define LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H
|
||||
#include "src/__support/macros/attributes.h"
|
||||
#include "src/__support/macros/config.h"
|
||||
#include <stdint.h>
|
||||
|
||||
namespace LIBC_NAMESPACE_DECL {
|
||||
|
||||
#ifdef LIBC_TARGET_ARCH_IS_AMDGPU
|
||||
// The ROCm device library uses control globals to alter codegen for the
|
||||
// different targets. To avoid needing to link them in manually we simply
|
||||
// define them here.
|
||||
// Definitions of the control constants, given C linkage so the symbol names
// are not C++-mangled. Every value is a fixed compile-time constant.
extern "C" {
|
||||
extern const LIBC_INLINE_VAR uint8_t __oclc_unsafe_math_opt = 0;
|
||||
extern const LIBC_INLINE_VAR uint8_t __oclc_daz_opt = 0;
|
||||
extern const LIBC_INLINE_VAR uint8_t __oclc_correctly_rounded_sqrt32 = 1;
|
||||
extern const LIBC_INLINE_VAR uint8_t __oclc_finite_only_opt = 0;
|
||||
// NOTE(review): the ISA version is hard-coded to 9000 rather than derived
// from the compilation target -- confirm this is intended for benchmarks.
extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000;
|
||||
}
|
||||
|
||||
// These aliases cause clang to emit the control constants with ODR linkage.
|
||||
// This allows us to link against the symbols without preventing them from being
|
||||
// optimized out or causing symbol collisions.
|
||||
// One alias per control constant; each alias is named after its target with
// a trailing double underscore appended and has the same type as the target.
[[gnu::alias("__oclc_unsafe_math_opt")]] const uint8_t __oclc_unsafe_math_opt__;
|
||||
[[gnu::alias("__oclc_daz_opt")]] const uint8_t __oclc_daz_opt__;
|
||||
[[gnu::alias("__oclc_correctly_rounded_sqrt32")]] const uint8_t
|
||||
__oclc_correctly_rounded_sqrt32__;
|
||||
[[gnu::alias("__oclc_finite_only_opt")]] const uint8_t __oclc_finite_only_opt__;
|
||||
[[gnu::alias("__oclc_ISA_version")]] const uint32_t __oclc_ISA_version__;
|
||||
#endif
|
||||
} // namespace LIBC_NAMESPACE_DECL
|
||||
|
||||
// Forward declarations for the vendor math libraries.
|
||||
// These declarations live outside LIBC_NAMESPACE_DECL and use C linkage, so
// callers refer to them by their unqualified, unmangled names. Each group is
// only declared when the matching *_MATH_FOUND macro is defined
// (presumably set by the build system -- TODO confirm).
extern "C" {
|
||||
// AMD entry points: sin and atan2 in double (f64) and float (f32) variants.
#ifdef AMDGPU_MATH_FOUND
|
||||
double __ocml_sin_f64(double);
|
||||
float __ocml_sin_f32(float);
|
||||
double __ocml_atan2_f64(double, double);
|
||||
float __ocml_atan2_f32(float, float);
|
||||
#endif
|
||||
|
||||
// NVIDIA entry points: sin and atan2 in double and float (`f` suffix)
// variants.
#ifdef NVPTX_MATH_FOUND
|
||||
double __nv_sin(double);
|
||||
float __nv_sinf(float);
|
||||
double __nv_atan2(double, double);
|
||||
float __nv_atan2f(float, float);
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H
|
@ -8,12 +8,8 @@
|
||||
#include "src/math/sinf.h"
|
||||
#include "src/stdlib/rand.h"
|
||||
|
||||
#ifdef NVPTX_MATH_FOUND
|
||||
#include "src/math/nvptx/declarations.h"
|
||||
#endif
|
||||
|
||||
#ifdef AMDGPU_MATH_FOUND
|
||||
#include "src/math/amdgpu/declarations.h"
|
||||
#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
|
||||
#include "platform.h"
|
||||
#endif
|
||||
|
||||
// BENCHMARK() expects a function with no parameters that returns a
|
||||
@ -42,17 +38,17 @@ BENCH(double, SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
|
||||
BENCH(double, SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);
|
||||
|
||||
#ifdef NVPTX_MATH_FOUND
|
||||
BENCH(double, NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
|
||||
BENCH(double, NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
|
||||
BENCH(double, NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
|
||||
BENCH(double, NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
|
||||
BENCH(double, NvSin, __nv_sin, -1023, 1023);
|
||||
BENCH(double, NvSinTwoPi, __nv_sin, -10, 3);
|
||||
BENCH(double, NvSinTwoPow30, __nv_sin, 0, 30);
|
||||
BENCH(double, NvSinVeryLarge, __nv_sin, 30, 1000);
|
||||
#endif
|
||||
|
||||
#ifdef AMDGPU_MATH_FOUND
|
||||
BENCH(double, AmdSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
|
||||
BENCH(double, AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
|
||||
BENCH(double, AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
|
||||
BENCH(double, AmdSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
|
||||
BENCH(double, AmdSin, __ocml_sin_f64, -1023, 1023);
|
||||
BENCH(double, AmdSinTwoPi, __ocml_sin_f64, -10, 3);
|
||||
BENCH(double, AmdSinTwoPow30, __ocml_sin_f64, 0, 30);
|
||||
BENCH(double, AmdSinVeryLarge, __ocml_sin_f64, 30, 1000);
|
||||
#endif
|
||||
|
||||
BENCH(float, Sinf, LIBC_NAMESPACE::sinf, -127, 128);
|
||||
@ -61,15 +57,15 @@ BENCH(float, SinfTwoPow30, LIBC_NAMESPACE::sinf, 0, 30);
|
||||
BENCH(float, SinfVeryLarge, LIBC_NAMESPACE::sinf, 30, 120);
|
||||
|
||||
#ifdef NVPTX_MATH_FOUND
|
||||
BENCH(float, NvSinf, LIBC_NAMESPACE::__nv_sinf, -127, 128);
|
||||
BENCH(float, NvSinfTwoPi, LIBC_NAMESPACE::__nv_sinf, -10, 3);
|
||||
BENCH(float, NvSinfTwoPow30, LIBC_NAMESPACE::__nv_sinf, 0, 30);
|
||||
BENCH(float, NvSinfVeryLarge, LIBC_NAMESPACE::__nv_sinf, 30, 120);
|
||||
BENCH(float, NvSinf, __nv_sinf, -127, 128);
|
||||
BENCH(float, NvSinfTwoPi, __nv_sinf, -10, 3);
|
||||
BENCH(float, NvSinfTwoPow30, __nv_sinf, 0, 30);
|
||||
BENCH(float, NvSinfVeryLarge, __nv_sinf, 30, 120);
|
||||
#endif
|
||||
|
||||
#ifdef AMDGPU_MATH_FOUND
|
||||
BENCH(float, AmdSinf, LIBC_NAMESPACE::__ocml_sin_f32, -127, 128);
|
||||
BENCH(float, AmdSinfTwoPi, LIBC_NAMESPACE::__ocml_sin_f32, -10, 3);
|
||||
BENCH(float, AmdSinfTwoPow30, LIBC_NAMESPACE::__ocml_sin_f32, 0, 30);
|
||||
BENCH(float, AmdSinfVeryLarge, LIBC_NAMESPACE::__ocml_sin_f32, 30, 120);
|
||||
BENCH(float, AmdSinf, __ocml_sin_f32, -127, 128);
|
||||
BENCH(float, AmdSinfTwoPi, __ocml_sin_f32, -10, 3);
|
||||
BENCH(float, AmdSinfTwoPow30, __ocml_sin_f32, 0, 30);
|
||||
BENCH(float, AmdSinfVeryLarge, __ocml_sin_f32, 30, 120);
|
||||
#endif
|
||||
|
@ -10,6 +10,7 @@
|
||||
#define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
|
||||
|
||||
#include "src/__support/CPP/array.h"
|
||||
#include "src/__support/CPP/atomic.h"
|
||||
#include "src/__support/CPP/type_traits.h"
|
||||
#include "src/__support/GPU/utils.h"
|
||||
#include "src/__support/common.h"
|
||||
@ -24,7 +25,7 @@ namespace LIBC_NAMESPACE_DECL {
|
||||
// allows us to subtract the constant-time overhead from the latency to
|
||||
// obtain a true result. This can vary with system load.
|
||||
[[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
uint64_t start = gpu::processor_clock();
|
||||
uint32_t result = 0.0;
|
||||
asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
|
||||
@ -44,13 +45,13 @@ template <typename F, typename T>
|
||||
T arg = storage;
|
||||
|
||||
// The AMDGPU architecture needs to wait on pending results.
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
// Get the current timestamp from the clock.
|
||||
uint64_t start = gpu::processor_clock();
|
||||
|
||||
// This forces the compiler to load the input argument and run the clock
|
||||
// cycle counter before the profiling region.
|
||||
asm("" ::"s"(start));
|
||||
asm("" : "+v"(arg) : "s"(start));
|
||||
|
||||
// Run the function under test and return its value.
|
||||
auto result = f(arg);
|
||||
@ -71,7 +72,7 @@ template <typename F, typename T>
|
||||
// ordering.
|
||||
uint64_t stop = gpu::processor_clock();
|
||||
asm("" ::"s"(stop));
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
|
||||
// Return the time elapsed.
|
||||
return stop - start;
|
||||
@ -84,7 +85,7 @@ template <typename F, typename T1, typename T2>
|
||||
T1 arg1 = storage1;
|
||||
T2 arg2 = storage2;
|
||||
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
uint64_t start = gpu::processor_clock();
|
||||
|
||||
asm("" ::"s"(start));
|
||||
@ -100,7 +101,7 @@ template <typename F, typename T1, typename T2>
|
||||
|
||||
uint64_t stop = gpu::processor_clock();
|
||||
asm("" ::"s"(stop));
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
|
||||
return stop - start;
|
||||
}
|
||||
@ -111,7 +112,7 @@ template <typename F, typename T, size_t N>
|
||||
throughput(F f, const cpp::array<T, N> &inputs) {
|
||||
asm("" ::"v"(&inputs));
|
||||
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
uint64_t start = gpu::processor_clock();
|
||||
|
||||
asm("" ::"s"(start));
|
||||
@ -124,7 +125,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {
|
||||
|
||||
uint64_t stop = gpu::processor_clock();
|
||||
asm("" ::"s"(stop));
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
|
||||
// Return the time elapsed.
|
||||
return stop - start;
|
||||
@ -136,7 +137,7 @@ template <typename F, typename T, size_t N>
|
||||
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
|
||||
asm("" ::"v"(&inputs1), "v"(&inputs2));
|
||||
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
uint64_t start = gpu::processor_clock();
|
||||
|
||||
asm("" ::"s"(start));
|
||||
@ -149,7 +150,7 @@ template <typename F, typename T, size_t N>
|
||||
|
||||
uint64_t stop = gpu::processor_clock();
|
||||
asm("" ::"s"(stop));
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
|
||||
// Return the time elapsed.
|
||||
return stop - start;
|
||||
|
@ -10,6 +10,7 @@
|
||||
#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
|
||||
|
||||
#include "src/__support/CPP/array.h"
|
||||
#include "src/__support/CPP/atomic.h"
|
||||
#include "src/__support/CPP/type_traits.h"
|
||||
#include "src/__support/GPU/utils.h"
|
||||
#include "src/__support/common.h"
|
||||
@ -46,7 +47,7 @@ template <typename F, typename T>
|
||||
T arg = storage;
|
||||
|
||||
// Get the current timestamp from the clock.
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
uint64_t start = gpu::processor_clock();
|
||||
|
||||
// This forces the compiler to load the input argument and run the clock cycle
|
||||
@ -63,7 +64,7 @@ template <typename F, typename T>
|
||||
// Obtain the current timestamp after running the calculation and force
|
||||
// ordering.
|
||||
uint64_t stop = gpu::processor_clock();
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
asm("" ::"r"(stop));
|
||||
volatile T output = result;
|
||||
|
||||
@ -78,7 +79,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
|
||||
T1 arg = storage;
|
||||
T2 arg2 = storage2;
|
||||
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
uint64_t start = gpu::processor_clock();
|
||||
|
||||
asm("" ::"llr"(start));
|
||||
@ -88,7 +89,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
|
||||
asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));
|
||||
|
||||
uint64_t stop = gpu::processor_clock();
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
asm("" ::"r"(stop));
|
||||
volatile auto output = result;
|
||||
|
||||
@ -101,7 +102,7 @@ template <typename F, typename T, size_t N>
|
||||
throughput(F f, const cpp::array<T, N> &inputs) {
|
||||
asm("" ::"r"(&inputs));
|
||||
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
uint64_t start = gpu::processor_clock();
|
||||
|
||||
asm("" ::"llr"(start));
|
||||
@ -114,7 +115,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {
|
||||
}
|
||||
|
||||
uint64_t stop = gpu::processor_clock();
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
asm("" ::"r"(stop));
|
||||
volatile auto output = result;
|
||||
|
||||
@ -128,7 +129,7 @@ template <typename F, typename T, size_t N>
|
||||
F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
|
||||
asm("" ::"r"(&inputs1), "r"(&inputs2));
|
||||
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
uint64_t start = gpu::processor_clock();
|
||||
|
||||
asm("" ::"llr"(start));
|
||||
@ -140,7 +141,7 @@ template <typename F, typename T, size_t N>
|
||||
}
|
||||
|
||||
uint64_t stop = gpu::processor_clock();
|
||||
gpu::memory_fence();
|
||||
cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
|
||||
asm("" ::"r"(stop));
|
||||
volatile auto output = result;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user