From d18a784d410617fbb3f9d2bf6d30fcd84b3533db Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 19 Mar 2026 10:51:48 -0500 Subject: [PATCH] [compiler-rt] Define GPU specific handling of profiling functions (#185763) Summary: The changes in https://www.github.com/llvm/llvm-project/pull/185552 allowed us to start building the standard `libclang_rt.profile.a` for GPU targets. This PR expands this by adding an optimized GPU routine for counter increment and removing the special-case handling of these functions in the OpenMP runtime. Vast majority of these functions are boilerplate, but we should be able to do more interesting things with this in the future, like value or memory profiling. --- compiler-rt/lib/profile/CMakeLists.txt | 1 + compiler-rt/lib/profile/InstrProfiling.h | 10 +++++ .../lib/profile/InstrProfilingPlatformGPU.c | 42 +++++++++++++++++++ .../Instrumentation/InstrProfiling.cpp | 15 ++++++- offload/test/CMakeLists.txt | 6 --- offload/test/lit.cfg | 13 +++++- offload/test/lit.site.cfg.in | 1 - openmp/device/CMakeLists.txt | 1 - openmp/device/include/Profiling.h | 21 ---------- openmp/device/src/Profiling.cpp | 18 -------- 10 files changed, 78 insertions(+), 50 deletions(-) create mode 100644 compiler-rt/lib/profile/InstrProfilingPlatformGPU.c delete mode 100644 openmp/device/include/Profiling.h delete mode 100644 openmp/device/src/Profiling.cpp diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt index 4cc2610cec87..86328b4c1392 100644 --- a/compiler-rt/lib/profile/CMakeLists.txt +++ b/compiler-rt/lib/profile/CMakeLists.txt @@ -74,6 +74,7 @@ set(PROFILE_SOURCES InstrProfilingPlatformLinux.c InstrProfilingPlatformOther.c InstrProfilingPlatformWindows.c + InstrProfilingPlatformGPU.c ) if (NOT COMPILER_RT_PROFILE_BAREMETAL) diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 187ef55ef378..54013d7e6568 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ 
b/compiler-rt/lib/profile/InstrProfiling.h @@ -166,6 +166,16 @@ void __llvm_profile_instrument_target_value(uint64_t TargetValue, void *Data, uint32_t CounterIndex, uint64_t CounterValue); +/*! + * \brief Wave-cooperative counter increment for GPU targets. + * + * Reduces per-lane atomic contention by electing a single lane per wave to + * perform the counter update. \c Uniform is an optional counter tracking the + * number of increments made while every lane of the wave was active. + */ +void __llvm_profile_instrument_gpu(uint64_t *Counter, uint64_t *Uniform, + uint64_t Step); + /*! * \brief Write instrumentation data to the current file. * diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c new file mode 100644 index 000000000000..78bf512f8c44 --- /dev/null +++ b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c @@ -0,0 +1,42 @@ +/*===- InstrProfilingPlatformGPU.c - GPU profiling support ----------------===*\ +|* +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +|* See https://llvm.org/LICENSE.txt for license information. +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +|* +\*===----------------------------------------------------------------------===*/ + +// GPU-specific profiling functions for AMDGPU and NVPTX targets. This file +// provides the wave-cooperative counter increment used by instrumented code. +// +// Platform plumbing (section boundaries, binary IDs, VNodes) is handled by +// InstrProfilingPlatformLinux.c via the COMPILER_RT_PROFILE_BAREMETAL path. + +#if defined(__NVPTX__) || defined(__AMDGPU__) + +#include "InstrProfiling.h" +#include <gpuintrin.h> + +// Indicates that the current wave is fully occupied. +static int is_uniform(uint64_t mask) { + const uint64_t uniform_mask = ~0ull >> (64 - __gpu_num_lanes()); + return mask == uniform_mask; +} + +// Wave-cooperative counter increment. The instrumentation pass emits calls to +// this in place of the default non-atomic load/add/store or atomicrmw sequence. 
+// The optional uniform counter allows calculating wave uniformity if present. +COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter, + uint64_t *uniform, + uint64_t step) { + uint64_t mask = __gpu_lane_mask(); + if (__gpu_is_first_in_lane(mask)) { + __scoped_atomic_fetch_add(counter, step * __builtin_popcountg(mask), + __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); + if (uniform && is_uniform(mask)) + __scoped_atomic_fetch_add(uniform, step * __builtin_popcountg(mask), + __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); + } +} + +#endif diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 199b7357fa86..d1696f4afbe3 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1192,8 +1192,19 @@ void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) { auto *Addr = getCounterAddress(Inc); IRBuilder<> Builder(Inc); - if (Options.Atomic || AtomicCounterUpdateAll || - (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) { + if (isGPUProfTarget(M)) { + auto *I64Ty = Builder.getInt64Ty(); + auto *PtrTy = Builder.getPtrTy(); + auto *CalleeTy = FunctionType::get(Type::getVoidTy(M.getContext()), + {PtrTy, PtrTy, I64Ty}, false); + auto Callee = + M.getOrInsertFunction("__llvm_profile_instrument_gpu", CalleeTy); + Value *CastAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PtrTy); + Value *Uniform = + ConstantPointerNull::get(PointerType::getUnqual(M.getContext())); + Builder.CreateCall(Callee, {CastAddr, Uniform, Inc->getStep()}); + } else if (Options.Atomic || AtomicCounterUpdateAll || + (Inc->getIndex()->isNullValue() && AtomicFirstCounter)) { Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(), MaybeAlign(), AtomicOrdering::Monotonic); } else { diff --git a/offload/test/CMakeLists.txt b/offload/test/CMakeLists.txt index 40da2a7d573e..434f25f512a4 100644 --- 
a/offload/test/CMakeLists.txt +++ b/offload/test/CMakeLists.txt @@ -12,12 +12,6 @@ else() set(LIBOMPTARGET_DEBUG False) endif() -if ("compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES) - set(LIBOMPTARGET_TEST_GPU_PGO True) -else() - set(LIBOMPTARGET_TEST_GPU_PGO False) -endif() - # Replace the space from user's input with ";" in case that CMake add escape # char into the lit command. string(REPLACE " " ";" LIBOMPTARGET_LIT_ARG_LIST "${LIBOMPTARGET_LIT_ARGS}") diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 2d5d69167109..2226764f9aa9 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -2,6 +2,7 @@ # Configuration file for the 'lit' test runner. import os +import glob import lit.formats # Tell pylint that we know config and lit_config exist somewhere. @@ -132,7 +133,17 @@ if config.libomptarget_has_libc: config.available_features.add('libc') profdata_path = os.path.join(config.bin_llvm_tools_dir, "llvm-profdata") -if config.libomptarget_test_pgo: +target = config.libomptarget_current_target +for suffix in ['-JIT-LTO', '-LTO']: + if target.endswith(suffix): + target = target[:-len(suffix)] + break +has_profile_rt = True +if target.startswith('amdgcn') or target.startswith('nvptx'): + has_profile_rt = bool(glob.glob(os.path.join( + config.llvm_lib_directory, 'clang', '*', 'lib', target, + 'libclang_rt.profile.a'))) +if has_profile_rt: config.available_features.add('pgo') config.substitutions.append(("%profdata", profdata_path)) diff --git a/offload/test/lit.site.cfg.in b/offload/test/lit.site.cfg.in index c8ba45c9683e..47b1fbd18514 100644 --- a/offload/test/lit.site.cfg.in +++ b/offload/test/lit.site.cfg.in @@ -27,7 +27,6 @@ config.offload_device_info = "@OFFLOAD_DEVICE_INFO_EXECUTABLE@" config.libomptarget_debug = @LIBOMPTARGET_DEBUG@ config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@ config.libomptarget_has_libc = @LIBOMPTARGET_GPU_LIBC_SUPPORT@ -config.libomptarget_test_pgo = @LIBOMPTARGET_TEST_GPU_PGO@ config.offload_tblgen = 
"@OFFLOAD_TBLGEN_EXECUTABLE@" # Let the main config do the real work. lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/openmp/device/CMakeLists.txt b/openmp/device/CMakeLists.txt index 096a6fe0b6e7..ff5a64fdd2f0 100644 --- a/openmp/device/CMakeLists.txt +++ b/openmp/device/CMakeLists.txt @@ -16,7 +16,6 @@ set(src_files ${CMAKE_CURRENT_SOURCE_DIR}/src/Mapping.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/Misc.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/Parallelism.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/Profiling.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/Reduction.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/State.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/Synchronization.cpp diff --git a/openmp/device/include/Profiling.h b/openmp/device/include/Profiling.h deleted file mode 100644 index d99475225412..000000000000 --- a/openmp/device/include/Profiling.h +++ /dev/null @@ -1,21 +0,0 @@ -//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_DEVICERTL_PROFILING_H -#define OMPTARGET_DEVICERTL_PROFILING_H - -extern "C" { -void __llvm_profile_register_function(void *Ptr); -void __llvm_profile_register_names_function(void *Ptr, long int I); -void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2); -} - -#endif diff --git a/openmp/device/src/Profiling.cpp b/openmp/device/src/Profiling.cpp deleted file mode 100644 index df141af5ebee..000000000000 --- a/openmp/device/src/Profiling.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//===------- Profiling.cpp ---------------------------------------- C++ ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "Profiling.h" - -extern "C" { - -// Provides empty implementations for certain functions in compiler-rt -// that are emitted by the PGO instrumentation. -void __llvm_profile_register_function(void *Ptr) {} -void __llvm_profile_register_names_function(void *Ptr, long int I) {} -void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {} -}