[libc] Fix GPU benchmarking

2025-07-18 14:36:09 -05:00 · 2025-07-18 14:36:09 -05:00 · de59e7b86c
commit de59e7b86c
parent cfa918bec1
6 changed files with 106 additions and 55 deletions
--- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
+++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp
@ -7,9 +7,9 @@
 #include "src/__support/GPU/utils.h"
 #include "src/__support/fixedvector.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/time/gpu/time_utils.h"
 #include "src/stdio/printf.h"
 #include "src/stdlib/srand.h"
-#include "src/time/gpu/time_utils.h"

 namespace LIBC_NAMESPACE_DECL {
 namespace benchmarks {
--- a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp
@ -3,12 +3,8 @@
 #include "src/math/atan2.h"
 #include "src/stdlib/rand.h"

-#ifdef NVPTX_MATH_FOUND
-#include "src/math/nvptx/declarations.h"
-#endif
-
-#ifdef AMDGPU_MATH_FOUND
-#include "src/math/amdgpu/declarations.h"
+#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
+#include "platform.h"
 #endif

 #define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N)                      \
@ -33,15 +29,15 @@ BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30);
 BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000);

 #ifdef NVPTX_MATH_FOUND
-BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023);
-BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3);
-BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30);
-BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000);
+BENCH(double, NvAtan2, __nv_atan2, -1023, 1023);
+BENCH(double, NvAtan2TwoPi, __nv_atan2, -10, 3);
+BENCH(double, NvAtan2TwoPow30, __nv_atan2, 0, 30);
+BENCH(double, NvAtan2Large, __nv_atan2, 30, 1000);
 #endif

 #ifdef AMDGPU_MATH_FOUND
-BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023);
-BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3);
-BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30);
-BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000);
+BENCH(double, AmdAtan2, __ocml_atan2_f64, -1023, 1023);
+BENCH(double, AmdAtan2TwoPi, __ocml_atan2_f64, -10, 3);
+BENCH(double, AmdAtan2TwoPow30, __ocml_atan2_f64, 0, 30);
+BENCH(double, AmdAtan2Large, __ocml_atan2_f64, 30, 1000);
 #endif
--- a/libc/benchmarks/gpu/src/math/platform.h
+++ b/libc/benchmarks/gpu/src/math/platform.h
@ -0,0 +1,57 @@
+//===-- GPU platform definitions for vendor math benchmarks ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_BENCHMARKS_GPU_SRC_MATH_PLATFORM_H
+#define LLVM_LIBC_BENCHMARKS_GPU_SRC_MATH_PLATFORM_H
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/config.h"
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+#ifdef LIBC_TARGET_ARCH_IS_AMDGPU
+// The ROCm device library uses control globals to alter codegen for the
+// different targets. To avoid needing to link them in manually we simply
+// define them here.
+extern "C" {
+extern const LIBC_INLINE_VAR uint8_t __oclc_unsafe_math_opt = 0;
+extern const LIBC_INLINE_VAR uint8_t __oclc_daz_opt = 0;
+extern const LIBC_INLINE_VAR uint8_t __oclc_correctly_rounded_sqrt32 = 1;
+extern const LIBC_INLINE_VAR uint8_t __oclc_finite_only_opt = 0;
+extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000;
+}
+
+// These aliases cause clang to emit the control constants with ODR linkage.
+// This allows us to link against the symbols without preventing them from being
+// optimized out or causing symbol collisions.
+[[gnu::alias("__oclc_unsafe_math_opt")]] const uint8_t __oclc_unsafe_math_opt__;
+[[gnu::alias("__oclc_daz_opt")]] const uint8_t __oclc_daz_opt__;
+[[gnu::alias("__oclc_correctly_rounded_sqrt32")]] const uint8_t
+    __oclc_correctly_rounded_sqrt32__;
+[[gnu::alias("__oclc_finite_only_opt")]] const uint8_t __oclc_finite_only_opt__;
+[[gnu::alias("__oclc_ISA_version")]] const uint32_t __oclc_ISA_version__;
+#endif
+} // namespace LIBC_NAMESPACE_DECL
+
+// Global-namespace C declarations of the vendor math entry points benchmarked.
+extern "C" {
+#ifdef AMDGPU_MATH_FOUND
+double __ocml_sin_f64(double);
+float __ocml_sin_f32(float);
+double __ocml_atan2_f64(double, double);
+float __ocml_atan2_f32(float, float);
+#endif
+
+#ifdef NVPTX_MATH_FOUND
+double __nv_sin(double);
+float __nv_sinf(float);
+double __nv_atan2(double, double);
+float __nv_atan2f(float, float);
+#endif
+}
+
+#endif // LLVM_LIBC_BENCHMARKS_GPU_SRC_MATH_PLATFORM_H
--- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
+++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp
@ -8,12 +8,8 @@
 #include "src/math/sinf.h"
 #include "src/stdlib/rand.h"

-#ifdef NVPTX_MATH_FOUND
-#include "src/math/nvptx/declarations.h"
-#endif
-
-#ifdef AMDGPU_MATH_FOUND
-#include "src/math/amdgpu/declarations.h"
+#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND)
+#include "platform.h"
 #endif

 // BENCHMARK() expects a function that with no parameters that returns a
@ -42,17 +38,17 @@ BENCH(double, SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30);
 BENCH(double, SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000);

 #ifdef NVPTX_MATH_FOUND
-BENCH(double, NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023);
-BENCH(double, NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3);
-BENCH(double, NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30);
-BENCH(double, NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000);
+BENCH(double, NvSin, __nv_sin, -1023, 1023);
+BENCH(double, NvSinTwoPi, __nv_sin, -10, 3);
+BENCH(double, NvSinTwoPow30, __nv_sin, 0, 30);
+BENCH(double, NvSinVeryLarge, __nv_sin, 30, 1000);
 #endif

 #ifdef AMDGPU_MATH_FOUND
-BENCH(double, AmdSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023);
-BENCH(double, AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3);
-BENCH(double, AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30);
-BENCH(double, AmdSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000);
+BENCH(double, AmdSin, __ocml_sin_f64, -1023, 1023);
+BENCH(double, AmdSinTwoPi, __ocml_sin_f64, -10, 3);
+BENCH(double, AmdSinTwoPow30, __ocml_sin_f64, 0, 30);
+BENCH(double, AmdSinVeryLarge, __ocml_sin_f64, 30, 1000);
 #endif

 BENCH(float, Sinf, LIBC_NAMESPACE::sinf, -127, 128);
@ -61,15 +57,15 @@ BENCH(float, SinfTwoPow30, LIBC_NAMESPACE::sinf, 0, 30);
 BENCH(float, SinfVeryLarge, LIBC_NAMESPACE::sinf, 30, 120);

 #ifdef NVPTX_MATH_FOUND
-BENCH(float, NvSinf, LIBC_NAMESPACE::__nv_sinf, -127, 128);
-BENCH(float, NvSinfTwoPi, LIBC_NAMESPACE::__nv_sinf, -10, 3);
-BENCH(float, NvSinfTwoPow30, LIBC_NAMESPACE::__nv_sinf, 0, 30);
-BENCH(float, NvSinfVeryLarge, LIBC_NAMESPACE::__nv_sinf, 30, 120);
+BENCH(float, NvSinf, __nv_sinf, -127, 128);
+BENCH(float, NvSinfTwoPi, __nv_sinf, -10, 3);
+BENCH(float, NvSinfTwoPow30, __nv_sinf, 0, 30);
+BENCH(float, NvSinfVeryLarge, __nv_sinf, 30, 120);
 #endif

 #ifdef AMDGPU_MATH_FOUND
-BENCH(float, AmdSinf, LIBC_NAMESPACE::__ocml_sin_f32, -127, 128);
-BENCH(float, AmdSinfTwoPi, LIBC_NAMESPACE::__ocml_sin_f32, -10, 3);
-BENCH(float, AmdSinfTwoPow30, LIBC_NAMESPACE::__ocml_sin_f32, 0, 30);
-BENCH(float, AmdSinfVeryLarge, LIBC_NAMESPACE::__ocml_sin_f32, 30, 120);
+BENCH(float, AmdSinf, __ocml_sin_f32, -127, 128);
+BENCH(float, AmdSinfTwoPi, __ocml_sin_f32, -10, 3);
+BENCH(float, AmdSinfTwoPow30, __ocml_sin_f32, 0, 30);
+BENCH(float, AmdSinfVeryLarge, __ocml_sin_f32, 30, 120);
 #endif
--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
+++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
@ -10,6 +10,7 @@
 #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU

 #include "src/__support/CPP/array.h"
+#include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/common.h"
@ -24,7 +25,7 @@ namespace LIBC_NAMESPACE_DECL {
 // allows us to substract the constant-time overhead from the latency to
 // obtain a true result. This can vary with system load.
 [[gnu::noinline]] static LIBC_INLINE uint64_t overhead() {
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  uint64_t start = gpu::processor_clock();
  uint32_t result = 0.0;
  asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result));
@ -44,13 +45,13 @@ template <typename F, typename T>
  T arg = storage;

  // The AMDGPU architecture needs to wait on pending results.
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  // Get the current timestamp from the clock.
  uint64_t start = gpu::processor_clock();

  // This forces the compiler to load the input argument and run the clock
  // cycle counter before the profiling region.
-  asm("" ::"s"(start));
+  asm("" : "+v"(arg) : "s"(start));

  // Run the function under test and return its value.
  auto result = f(arg);
@ -71,7 +72,7 @@ template <typename F, typename T>
  // ordering.
  uint64_t stop = gpu::processor_clock();
  asm("" ::"s"(stop));
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);

  // Return the time elapsed.
  return stop - start;
@ -84,7 +85,7 @@ template <typename F, typename T1, typename T2>
  T1 arg1 = storage1;
  T2 arg2 = storage2;

-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  uint64_t start = gpu::processor_clock();

  asm("" ::"s"(start));
@ -100,7 +101,7 @@ template <typename F, typename T1, typename T2>

  uint64_t stop = gpu::processor_clock();
  asm("" ::"s"(stop));
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);

  return stop - start;
 }
@ -111,7 +112,7 @@ template <typename F, typename T, size_t N>
 throughput(F f, const cpp::array<T, N> &inputs) {
  asm("" ::"v"(&inputs));

-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  uint64_t start = gpu::processor_clock();

  asm("" ::"s"(start));
@ -124,7 +125,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {

  uint64_t stop = gpu::processor_clock();
  asm("" ::"s"(stop));
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);

  // Return the time elapsed.
  return stop - start;
@ -136,7 +137,7 @@ template <typename F, typename T, size_t N>
    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
  asm("" ::"v"(&inputs1), "v"(&inputs2));

-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  uint64_t start = gpu::processor_clock();

  asm("" ::"s"(start));
@ -149,7 +150,7 @@ template <typename F, typename T, size_t N>

  uint64_t stop = gpu::processor_clock();
  asm("" ::"s"(stop));
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);

  // Return the time elapsed.
  return stop - start;
--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
+++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
@ -10,6 +10,7 @@
 #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX

 #include "src/__support/CPP/array.h"
+#include "src/__support/CPP/atomic.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/GPU/utils.h"
 #include "src/__support/common.h"
@ -46,7 +47,7 @@ template <typename F, typename T>
  T arg = storage;

  // Get the current timestamp from the clock.
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  uint64_t start = gpu::processor_clock();

  // This forces the compiler to load the input argument and run the clock cycle
@ -63,7 +64,7 @@ template <typename F, typename T>
  // Obtain the current timestamp after running the calculation and force
  // ordering.
  uint64_t stop = gpu::processor_clock();
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  asm("" ::"r"(stop));
  volatile T output = result;

@ -78,7 +79,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
  T1 arg = storage;
  T2 arg2 = storage2;

-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  uint64_t start = gpu::processor_clock();

  asm("" ::"llr"(start));
@ -88,7 +89,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) {
  asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result));

  uint64_t stop = gpu::processor_clock();
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  asm("" ::"r"(stop));
  volatile auto output = result;

@ -101,7 +102,7 @@ template <typename F, typename T, size_t N>
 throughput(F f, const cpp::array<T, N> &inputs) {
  asm("" ::"r"(&inputs));

-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  uint64_t start = gpu::processor_clock();

  asm("" ::"llr"(start));
@ -114,7 +115,7 @@ throughput(F f, const cpp::array<T, N> &inputs) {
  }

  uint64_t stop = gpu::processor_clock();
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  asm("" ::"r"(stop));
  volatile auto output = result;

@ -128,7 +129,7 @@ template <typename F, typename T, size_t N>
    F f, const cpp::array<T, N> &inputs1, const cpp::array<T, N> &inputs2) {
  asm("" ::"r"(&inputs1), "r"(&inputs2));

-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  uint64_t start = gpu::processor_clock();

  asm("" ::"llr"(start));
@ -140,7 +141,7 @@ template <typename F, typename T, size_t N>
  }

  uint64_t stop = gpu::processor_clock();
-  gpu::memory_fence();
+  cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL);
  asm("" ::"r"(stop));
  volatile auto output = result;