From 75bf7392089d027bb6fa78ded21acaa97b16a412 Mon Sep 17 00:00:00 2001 From: Leandro Lacerda Date: Sat, 16 Aug 2025 17:14:26 -0300 Subject: [PATCH] [libc][gpu] Disable loop unrolling in the throughput benchmark loop (#153971) This patch makes GPU throughput benchmark results more comparable across targets by disabling loop unrolling in the benchmark loops. Motivation: * PTX (post-LTO) evidence on NVPTX: for libc `sin`, the generated PTX shows the `throughput` loop unrolled 8x at `N=128` (one iteration advances the input pointer by 64 bytes = 8 doubles), interleaving eight independent chains before the back-edge. This hides latency and significantly reduces cycles/call as the batch size `N` grows. * Observed scaling (NVPTX measurements): with unrolling enabled, `sin` dropped from ~3,100 cycles/call at `N=1` to ~360 at `N=128`. After enforcing `#pragma clang loop unroll(disable)`, results stabilized (e.g., from ~3,100 cycles/call at `N=1` to ~2,700 at `N=128`). * libdevice contrast: the libdevice `sin` path did not exhibit a similar drop in our measurements, and the PTX appears as compact internal calls rather than a long FMA chain, leaving less ILP for the outer loop to extract. What this change does: * Applies `#pragma clang loop unroll(disable)` to the GPU `throughput()` and `throughput_baseline()` loops (both the single-input and two-input overloads) in both the NVPTX and AMDGPU backends. Leaving unrolling entirely to the optimizer skews comparisons that are meant to be apples-to-apples (e.g., libc vs. vendor). Disabling unrolling yields fairer, more consistent numbers. 
--- libc/benchmarks/gpu/timing/amdgpu/timing.h | 8 ++++++++ libc/benchmarks/gpu/timing/nvptx/timing.h | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h index b4a174f72981..8b92584b3923 100644 --- a/libc/benchmarks/gpu/timing/amdgpu/timing.h +++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h @@ -117,6 +117,8 @@ throughput_baseline(const cpp::array &inputs) { asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"v"(input)); result = input; @@ -146,6 +148,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs) { asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"v"(input)); result = f(input); @@ -174,6 +178,8 @@ static LIBC_INLINE uint64_t throughput_baseline( asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i]; @@ -206,6 +212,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs1, asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i]; diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h index 0c93a67129b8..944d3732eae6 100644 --- a/libc/benchmarks/gpu/timing/nvptx/timing.h +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -106,6 +106,8 @@ throughput_baseline(const cpp::array &inputs) { asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"r"(input)); result = input; @@ -135,6 +137,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs) { asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"r"(input)); result = f(input); @@ -163,6 +167,8 @@ static LIBC_INLINE uint64_t 
throughput_baseline( asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i]; @@ -195,6 +201,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs1, asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i];