[compiler-rt] Define GPU specific handling of profiling functions (#185763)

Summary:
The changes in https://www.github.com/llvm/llvm-project/pull/185552
allowed us to
start building the standard `libclang_rt.profile.a` for GPU targets.
This PR expands this by adding an optimized GPU routine for counter
increment and removing the special-case handling of these functions in
the OpenMP runtime.

The vast majority of these functions are boilerplate, but we should be able
to do more interesting things with this in the future, like value or
memory profiling.
This commit is contained in:
Joseph Huber 2026-03-19 10:51:48 -05:00 committed by GitHub
parent 923cc2d43b
commit d18a784d41
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 78 additions and 50 deletions

View File

@ -74,6 +74,7 @@ set(PROFILE_SOURCES
InstrProfilingPlatformLinux.c
InstrProfilingPlatformOther.c
InstrProfilingPlatformWindows.c
InstrProfilingPlatformGPU.c
)
if (NOT COMPILER_RT_PROFILE_BAREMETAL)

View File

@ -166,6 +166,16 @@ void __llvm_profile_instrument_target_value(uint64_t TargetValue, void *Data,
uint32_t CounterIndex,
uint64_t CounterValue);
/*!
* \brief Wave-cooperative counter increment for GPU targets.
*
* Reduces per-lane atomic contention by electing a single lane per wave to
* perform the counter update on behalf of all active lanes.
*
* \param Counter Profile counter to increment.
* \param Uniform Optional counter (may be null) that is only updated when
*        every lane of the wave is active, i.e. the wave is uniform.
* \param Step Per-lane increment; the elected lane adds \c Step multiplied by
*        the number of active lanes.
*/
void __llvm_profile_instrument_gpu(uint64_t *Counter, uint64_t *Uniform,
uint64_t Step);
/*!
* \brief Write instrumentation data to the current file.
*

View File

@ -0,0 +1,42 @@
/*===- InstrProfilingPlatformGPU.c - GPU profiling support ----------------===*\
|*
|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|* See https://llvm.org/LICENSE.txt for license information.
|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|*
\*===----------------------------------------------------------------------===*/
// GPU-specific profiling functions for AMDGPU and NVPTX targets. This file
// provides a wave-cooperative counter increment routine,
// __llvm_profile_instrument_gpu.
//
// Platform plumbing (section boundaries, binary IDs, VNodes) is handled by
// InstrProfilingPlatformLinux.c via the COMPILER_RT_PROFILE_BAREMETAL path.
#if defined(__NVPTX__) || defined(__AMDGPU__)
#include "InstrProfiling.h"
#include <gpuintrin.h>
// Returns nonzero when `mask` has a bit set for every lane of the wave, i.e.
// the wave is fully occupied.
static int is_uniform(uint64_t mask) {
  // Compare against an all-ones value truncated to the wave width.
  return mask == (~0ull >> (64 - __gpu_num_lanes()));
}
// Counter increment shared by a whole wave. The instrumentation pass calls
// this instead of emitting the default non-atomic load/add/store or atomicrmw
// sequence. When `uniform` is non-null it is also bumped, but only for waves
// in which every lane is active, so wave uniformity can be derived later.
COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter,
                                                          uint64_t *uniform,
                                                          uint64_t step) {
  const uint64_t active = __gpu_lane_mask();

  // Only the elected lane performs the update; every other lane is done.
  if (!__gpu_is_first_in_lane(active))
    return;

  // A single atomic add accounts for all currently active lanes.
  const uint64_t total = step * __builtin_popcountg(active);
  __scoped_atomic_fetch_add(counter, total, __ATOMIC_RELAXED,
                            __MEMORY_SCOPE_DEVICE);

  // The uniform counter is optional and only tracks fully occupied waves.
  if (!uniform || !is_uniform(active))
    return;
  __scoped_atomic_fetch_add(uniform, total, __ATOMIC_RELAXED,
                            __MEMORY_SCOPE_DEVICE);
}
#endif

View File

@ -1192,8 +1192,19 @@ void InstrLowerer::lowerIncrement(InstrProfIncrementInst *Inc) {
auto *Addr = getCounterAddress(Inc);
IRBuilder<> Builder(Inc);
if (Options.Atomic || AtomicCounterUpdateAll ||
(Inc->getIndex()->isNullValue() && AtomicFirstCounter)) {
if (isGPUProfTarget(M)) {
auto *I64Ty = Builder.getInt64Ty();
auto *PtrTy = Builder.getPtrTy();
auto *CalleeTy = FunctionType::get(Type::getVoidTy(M.getContext()),
{PtrTy, PtrTy, I64Ty}, false);
auto Callee =
M.getOrInsertFunction("__llvm_profile_instrument_gpu", CalleeTy);
Value *CastAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(Addr, PtrTy);
Value *Uniform =
ConstantPointerNull::get(PointerType::getUnqual(M.getContext()));
Builder.CreateCall(Callee, {CastAddr, Uniform, Inc->getStep()});
} else if (Options.Atomic || AtomicCounterUpdateAll ||
(Inc->getIndex()->isNullValue() && AtomicFirstCounter)) {
Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(),
MaybeAlign(), AtomicOrdering::Monotonic);
} else {

View File

@ -12,12 +12,6 @@ else()
set(LIBOMPTARGET_DEBUG False)
endif()
if ("compiler-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
set(LIBOMPTARGET_TEST_GPU_PGO True)
else()
set(LIBOMPTARGET_TEST_GPU_PGO False)
endif()
# Replace the space from user's input with ";" in case that CMake add escape
# char into the lit command.
string(REPLACE " " ";" LIBOMPTARGET_LIT_ARG_LIST "${LIBOMPTARGET_LIT_ARGS}")

View File

@ -2,6 +2,7 @@
# Configuration file for the 'lit' test runner.
import os
import glob
import lit.formats
# Tell pylint that we know config and lit_config exist somewhere.
@ -132,7 +133,17 @@ if config.libomptarget_has_libc:
config.available_features.add('libc')
profdata_path = os.path.join(config.bin_llvm_tools_dir, "llvm-profdata")
if config.libomptarget_test_pgo:
target = config.libomptarget_current_target
for suffix in ['-JIT-LTO', '-LTO']:
if target.endswith(suffix):
target = target[:-len(suffix)]
break
has_profile_rt = True
if target.startswith('amdgcn') or target.startswith('nvptx'):
has_profile_rt = bool(glob.glob(os.path.join(
config.llvm_lib_directory, 'clang', '*', 'lib', target,
'libclang_rt.profile.a')))
if has_profile_rt:
config.available_features.add('pgo')
config.substitutions.append(("%profdata", profdata_path))

View File

@ -27,7 +27,6 @@ config.offload_device_info = "@OFFLOAD_DEVICE_INFO_EXECUTABLE@"
config.libomptarget_debug = @LIBOMPTARGET_DEBUG@
config.has_libomptarget_ompt = @LIBOMPTARGET_OMPT_SUPPORT@
config.libomptarget_has_libc = @LIBOMPTARGET_GPU_LIBC_SUPPORT@
config.libomptarget_test_pgo = @LIBOMPTARGET_TEST_GPU_PGO@
config.offload_tblgen = "@OFFLOAD_TBLGEN_EXECUTABLE@"
# Let the main config do the real work.
lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")

View File

@ -16,7 +16,6 @@ set(src_files
${CMAKE_CURRENT_SOURCE_DIR}/src/Mapping.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/Misc.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/Parallelism.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/Profiling.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/Reduction.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/State.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/Synchronization.cpp

View File

@ -1,21 +0,0 @@
//===-------- Profiling.h - OpenMP interface ---------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_DEVICERTL_PROFILING_H
#define OMPTARGET_DEVICERTL_PROFILING_H
// C-linkage declarations of the profiling hooks that the device runtime
// defines as empty stubs in Profiling.cpp.
extern "C" {
void __llvm_profile_register_function(void *Ptr);
void __llvm_profile_register_names_function(void *Ptr, long int I);
void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2);
}
#endif

View File

@ -1,18 +0,0 @@
//===------- Profiling.cpp ---------------------------------------- C++ ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "Profiling.h"
// No-op definitions, with C linkage, of a few compiler-rt entry points that
// the PGO instrumentation emits calls to. Each one deliberately ignores its
// arguments and returns immediately.
extern "C" void __llvm_profile_register_function(void *Ptr) {}

extern "C" void __llvm_profile_register_names_function(void *Ptr, long I) {}

extern "C" void __llvm_profile_instrument_memop(long I, void *Ptr, int I2) {}