llvm-project/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c
Joseph Huber ffd6a13b5f
[compiler-rt] Rework profile data handling for GPU targets (#187136)
Summary:
Currently, the GPU iterates through all of the present symbols and
copies them by prefix. This is inefficient as it requires a lot of small
high-latency data transfers rather than a few large ones. Additionally,
we force every single profiling symbol to have protected visibility.
This means potentially hundreds of unnecessary symbols in the symbol
table.

This PR changes the interface to move towards the start / stop section
handling. AMDGPU supports this natively as an ELF target, so only a few
changes are needed there. Instead of overriding visibility, we use a single table
to define the bounds that we can obtain with one contiguous load.

Using a table interface should also work for the in-progress HIP
implementation for this, as it wraps the start / stop sections into
standard void pointers which will be inside of an already mapped region
of memory, so they should be accessible from the HIP API.

NVPTX is more difficult as it is an ELF platform without this support. I
have hooked up the 'Other' handling to work around this, but even then
it's a bit of a stretch. I could remove this support here, but I wanted
to demonstrate that we can share the ABI. However, NVPTX will only work
if we force LTO and change the backend to emit variables in the same
section next to each other (see the FIXME on the NVPTX table in the file).

TL;DR, we now do this:
```c
struct { start1, stop1, start2, stop2, start3, stop3, version; } device;
struct host = DtoH(lookup("device"));
counters = DtoH(host.stop - host.start)
version = DtoH(host.version);
```
2026-03-26 10:17:43 -05:00

87 lines
3.6 KiB
C

/*===- InstrProfilingPlatformGPU.c - GPU profiling support ----------------===*\
|*
|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|* See https://llvm.org/LICENSE.txt for license information.
|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|*
\*===----------------------------------------------------------------------===*/
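// GPU-specific profiling functions for AMDGPU and NVPTX targets. This file
// provides:
//   - __llvm_profile_instrument_gpu, the wave-cooperative counter increment.
//   - The section bounds table, which holds the start / stop addresses of the
//     profile sections and a pointer to the raw version variable.
//
// Platform plumbing (section boundaries, binary IDs, VNodes) is handled by
// InstrProfilingPlatformLinux.c via the COMPILER_RT_PROFILE_BAREMETAL path.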
#if defined(__NVPTX__) || defined(__AMDGPU__)
#include "InstrProfiling.h"
#include <gpuintrin.h>
// Symbols exported to the GPU runtime need to be visible in the .dynsym table.
// In this file the attribute is applied to the INSTR_PROF_SECT_BOUNDS_TABLE
// definitions; __llvm_profile_instrument_gpu uses COMPILER_RT_VISIBILITY
// instead.
#define COMPILER_RT_GPU_VISIBILITY __attribute__((visibility("protected")))
// Returns nonzero when `mask` has a bit set for every lane of the wave, i.e.
// when it equals the mask made of the low __gpu_num_lanes() bits.
static int is_uniform(uint64_t mask) {
  // Start from all 64 bits set and drop the high bits that have no lane.
  uint64_t all_lanes = ~0ull;
  all_lanes >>= 64 - __gpu_num_lanes();
  return all_lanes == mask;
}
// Wave-cooperative counter increment. The instrumentation pass emits calls to
// this in place of the default non-atomic load/add/store or atomicrmw sequence.
//
//   counter - counter to bump; receives step times the number of lanes set in
//             the current lane mask.
//   uniform - optional (may be null) second counter; receives the same amount,
//             but only when is_uniform() reports that every lane is in the
//             mask, which allows calculating wave uniformity.
//   step    - per-lane increment.
COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter,
                                                          uint64_t *uniform,
                                                          uint64_t step) {
  const uint64_t active = __gpu_lane_mask();

  // A single lane issues the atomics on behalf of every lane in the mask.
  if (!__gpu_is_first_in_lane(active))
    return;

  // One relaxed, device-scope add covers all lanes set in the mask.
  const uint64_t amount = step * __builtin_popcountg(active);
  __scoped_atomic_fetch_add(counter, amount, __ATOMIC_RELAXED,
                            __MEMORY_SCOPE_DEVICE);

  if (uniform && is_uniform(active))
    __scoped_atomic_fetch_add(uniform, amount, __ATOMIC_RELAXED,
                              __MEMORY_SCOPE_DEVICE);
}
#if defined(__AMDGPU__)
// Shorthand for the start / stop bound symbols of the name, counter, and data
// sections, spelled through the INSTR_PROF_SECT_START / INSTR_PROF_SECT_STOP
// helpers (defined outside this file).
#define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON)
#define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON)
#define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON)
#define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON)
#define PROF_DATA_START INSTR_PROF_SECT_START(INSTR_PROF_DATA_COMMON)
#define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON)
// Declarations of the bound symbols, all marked COMPILER_RT_VISIBILITY and
// COMPILER_RT_WEAK. The name and counter bounds are declared as char arrays;
// the data bounds are arrays of __llvm_profile_data records.
// NOTE(review): presumably weak so that linking still succeeds when a section
// is absent -- confirm against the other platform files.
extern char PROF_NAME_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_NAME_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_CNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_CNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern __llvm_profile_data PROF_DATA_START[] COMPILER_RT_VISIBILITY
COMPILER_RT_WEAK;
extern __llvm_profile_data PROF_DATA_STOP[] COMPILER_RT_VISIBILITY
COMPILER_RT_WEAK;
// AMDGPU is a proper ELF target and exports the linker-defined section bounds.
// The table is statically initialized with, in order: the name section start /
// stop, the counter section start / stop, the data section start / stop, and
// the address of the raw profile version variable.
// NOTE(review): the initializers are positional -- the order must match the
// field order of __llvm_profile_gpu_sections, which is declared outside this
// file.
COMPILER_RT_GPU_VISIBILITY
__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
PROF_NAME_START,
PROF_NAME_STOP,
PROF_CNTS_START,
PROF_CNTS_STOP,
PROF_DATA_START,
PROF_DATA_STOP,
&INSTR_PROF_RAW_VERSION_VAR};
#elif defined(__NVPTX__)
// NVPTX supports neither sections nor ELF symbols, so the six section bound
// entries start out as NULL; we rely on the handling in the
// 'InstrProfilingPlatformOther.c' file to fill this at initialization time.
// Only the pointer to the raw profile version variable is set statically.
// FIXME: This will not work until we make the NVPTX backend emit section
// globals next to each other.
COMPILER_RT_GPU_VISIBILITY
__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
NULL, NULL, NULL, NULL, NULL, NULL, &INSTR_PROF_RAW_VERSION_VAR};
#endif
#endif