[compiler-rt] Rework profile data handling for GPU targets (#187136)

Summary:
Currently, the GPU iterates through all of the present symbols and
copies them by prefix. This is inefficient as it requires a lot of small
high-latency data transfers rather than a few large ones. Additionally,
we force every single profiling symbol to have protected visibility.
This means potentially hundreds of unnecessary symbols in the symbol
table.

This PR changes the interface to move towards the start / stop section
handling. AMDGPU supports this natively as an ELF target, so few changes
are needed. Instead of overriding visibility, we use a single table
to define the bounds that we can obtain with one contiguous load.

Using a table interface should also work for the in-progress HIP
implementation for this, as it wraps the start / stop sections into
standard void pointers which will be inside of an already mapped region
of memory, so they should be accessible from the HIP API.

NVPTX is more difficult as it is an ELF platform without this support. I
have hooked up the 'Other' handling to work around this, but even then
it's a bit of a stretch. I could remove this support here, but I wanted
to demonstrate that we can share the ABI. However, NVPTX will only work
if we force LTO and change the backend to emit the variables of each
section next to each other.

TL;DR, we now do this:
```c
struct { start1, stop1, start2, stop2, start3, stop3, version; } device;
struct host = DtoH(lookup("device"));
counters = DtoH(host.stop - host.start);
version = DtoH(host.version);
```
This commit is contained in:
Joseph Huber 2026-03-26 10:17:43 -05:00 committed by GitHub
parent 76f88063b6
commit ffd6a13b5f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 274 additions and 139 deletions

View File

@ -142,6 +142,38 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \
#undef INSTR_PROF_VALUE_NODE
/* INSTR_PROF_VALUE_NODE end. */
/* INSTR_PROF_GPU_SECT start. */
/* Fields of the GPU profile section bounds structure, populated by the
* compiler runtime and read by the host to extract profiling data. */
#ifndef INSTR_PROF_GPU_SECT
#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer)
#else
#define INSTR_PROF_DATA_DEFINED
#endif
INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \
NamesStart, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \
NamesStop, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \
CountersStart, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \
CountersStop, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \
Ctx), DataStart, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \
Ctx), DataStop, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(uint64_t *, llvm::PointerType::getUnqual(Ctx), \
VersionVar, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
#undef INSTR_PROF_GPU_SECT
/* INSTR_PROF_GPU_SECT end. */
/* INSTR_PROF_RAW_HEADER start */
/* Definition of member fields of the raw profile header data structure. */
/* Please update llvm/docs/InstrProfileFormat.rst as appropriate when updating
@ -761,6 +793,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
* specified via command line. */
#define INSTR_PROF_PROFILE_NAME_VAR __llvm_profile_filename
/* GPU profiling section bounds structure, populated by the compiler runtime
* and read by the host to extract profiling data. */
#define INSTR_PROF_SECT_BOUNDS_TABLE __llvm_profile_sections
/* section name strings common to all targets other
than WIN32 */
#define INSTR_PROF_DATA_COMMON __llvm_prf_data

View File

@ -57,6 +57,11 @@ typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData {
#include "profile/InstrProfData.inc"
} VTableProfData;
/* Table of profile section bounds exported for GPU targets. One field is
 * generated per INSTR_PROF_GPU_SECT entry in InstrProfData.inc, in the order
 * the entries appear there. */
typedef struct __llvm_profile_gpu_sections {
#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer) Type Name;
#include "profile/InstrProfData.inc"
} __llvm_profile_gpu_sections;
typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT)
__llvm_gcov_init_func_struct {
#define COVINIT_FUNC(Type, LLVMType, Name, Initializer) Type Name;

View File

@ -17,6 +17,9 @@
#include "InstrProfiling.h"
#include <gpuintrin.h>
// Symbols exported to the GPU runtime need to be visible in the .dynsym table.
#define COMPILER_RT_GPU_VISIBILITY __attribute__((visibility("protected")))
// Indicates that the current wave is fully occupied.
static int is_uniform(uint64_t mask) {
const uint64_t uniform_mask = ~0ull >> (64 - __gpu_num_lanes());
@ -39,4 +42,45 @@ COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter,
}
}
#if defined(__AMDGPU__)
// Shorthand for the start / stop symbols of the names, counters, and data
// profile sections.
#define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON)
#define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON)
#define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON)
#define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON)
#define PROF_DATA_START INSTR_PROF_SECT_START(INSTR_PROF_DATA_COMMON)
#define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON)
// The bounds symbols are only declared here, as weak references.
// NOTE(review): presumably weak so that linking still succeeds when a section
// is absent from the image -- confirm.
extern char PROF_NAME_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_NAME_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_CNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern char PROF_CNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK;
extern __llvm_profile_data PROF_DATA_START[] COMPILER_RT_VISIBILITY
COMPILER_RT_WEAK;
extern __llvm_profile_data PROF_DATA_STOP[] COMPILER_RT_VISIBILITY
COMPILER_RT_WEAK;
// AMDGPU is a proper ELF target and exports the linker-defined section bounds.
// The initializer is positional, so its order must match the
// INSTR_PROF_GPU_SECT entries: names, counters, data, then the version pointer.
COMPILER_RT_GPU_VISIBILITY
__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
PROF_NAME_START,
PROF_NAME_STOP,
PROF_CNTS_START,
PROF_CNTS_STOP,
PROF_DATA_START,
PROF_DATA_STOP,
&INSTR_PROF_RAW_VERSION_VAR};
#elif defined(__NVPTX__)
// NVPTX supports neither sections nor ELF symbols, so we rely on the handling
// in the 'InstrProfilingPlatformOther.c' file to fill this at initialization
// time.
// FIXME: This will not work until we make the NVPTX backend emit section
// globals next to each other.
// All six bounds start out as NULL; only the version pointer is set statically.
COMPILER_RT_GPU_VISIBILITY
__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = {
NULL, NULL, NULL, NULL, NULL, NULL, &INSTR_PROF_RAW_VERSION_VAR};
#endif
#endif

View File

@ -23,7 +23,7 @@
#if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \
(defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) || \
defined(_AIX) || defined(__wasm__) || defined(__HAIKU__) || \
defined(COMPILER_RT_PROFILE_BAREMETAL)
(defined(COMPILER_RT_PROFILE_BAREMETAL) && !defined(__NVPTX__))
#if !defined(_AIX) && !defined(__wasm__) && \
!defined(COMPILER_RT_PROFILE_BAREMETAL)

View File

@ -13,28 +13,38 @@
// This implementation expects the compiler instrumentation pass to define a
// constructor in each file which calls into this file.
#if !defined(__APPLE__) && !defined(__linux__) && !defined(__FreeBSD__) && \
!defined(__Fuchsia__) && !(defined(__sun__) && defined(__svr4__)) && \
!defined(__NetBSD__) && !defined(_WIN32) && !defined(_AIX) && \
!defined(__wasm__) && !defined(__HAIKU__) && \
!defined(COMPILER_RT_PROFILE_BAREMETAL)
#include <stdlib.h>
#include <stdio.h>
#if (!defined(__APPLE__) && !defined(__linux__) && !defined(__FreeBSD__) && \
!defined(__Fuchsia__) && !(defined(__sun__) && defined(__svr4__)) && \
!defined(__NetBSD__) && !defined(_WIN32) && !defined(_AIX) && \
!defined(__wasm__) && !defined(__HAIKU__) && \
!defined(COMPILER_RT_PROFILE_BAREMETAL)) || \
defined(__NVPTX__)
#include "InstrProfiling.h"
#include "InstrProfilingInternal.h"
#if defined(__NVPTX__)
extern __llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE;
#define DataFirst INSTR_PROF_SECT_BOUNDS_TABLE.DataStart
#define DataLast INSTR_PROF_SECT_BOUNDS_TABLE.DataStop
#define NamesFirst INSTR_PROF_SECT_BOUNDS_TABLE.NamesStart
#define NamesLast INSTR_PROF_SECT_BOUNDS_TABLE.NamesStop
#define CountersFirst INSTR_PROF_SECT_BOUNDS_TABLE.CountersStart
#define CountersLast INSTR_PROF_SECT_BOUNDS_TABLE.CountersStop
#else
static const __llvm_profile_data *DataFirst = NULL;
static const __llvm_profile_data *DataLast = NULL;
static const VTableProfData *VTableProfDataFirst = NULL;
static const VTableProfData *VTableProfDataLast = NULL;
static const char *NamesFirst = NULL;
static const char *NamesLast = NULL;
static const char *VNamesFirst = NULL;
static const char *VNamesLast = NULL;
static char *CountersFirst = NULL;
static char *CountersLast = NULL;
#endif
static const VTableProfData *VTableProfDataFirst = NULL;
static const VTableProfData *VTableProfDataLast = NULL;
static const char *VNamesFirst = NULL;
static const char *VNamesLast = NULL;
static char *BitmapFirst = NULL;
static char *BitmapLast = NULL;
static const void *getMinAddr(const void *A1, const void *A2) {
return A1 < A2 ? A1 : A2;
@ -55,6 +65,19 @@ COMPILER_RT_VISIBILITY
void __llvm_profile_register_function(void *Data_) {
/* TODO: Only emit this function if we can't use linker magic. */
const __llvm_profile_data *Data = (__llvm_profile_data *)Data_;
#if defined(__NVPTX__)
// NVPTX stores absolute counter addresses to avoid circular dependencies in
// PTX global variable initializers. Convert to a relative offset so the
// host-side profile reader sees the standard format.
{
uintptr_t Rel = (uintptr_t)Data->CounterPtr - (uintptr_t)Data_;
__builtin_memcpy((char *)Data_ +
__builtin_offsetof(__llvm_profile_data, CounterPtr),
&Rel, sizeof(Rel));
}
#endif
if (!DataFirst) {
DataFirst = Data;
DataLast = Data + 1;

View File

@ -142,6 +142,38 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \
#undef INSTR_PROF_VALUE_NODE
/* INSTR_PROF_VALUE_NODE end. */
/* INSTR_PROF_GPU_SECT start. */
/* Fields of the GPU profile section bounds structure, populated by the
* compiler runtime and read by the host to extract profiling data. */
#ifndef INSTR_PROF_GPU_SECT
#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer)
#else
#define INSTR_PROF_DATA_DEFINED
#endif
INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \
NamesStart, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \
NamesStop, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \
CountersStart, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \
CountersStop, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \
Ctx), DataStart, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \
Ctx), DataStop, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
INSTR_PROF_GPU_SECT(uint64_t *, llvm::PointerType::getUnqual(Ctx), \
VersionVar, \
ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx)))
#undef INSTR_PROF_GPU_SECT
/* INSTR_PROF_GPU_SECT end. */
/* INSTR_PROF_RAW_HEADER start */
/* Definition of member fields of the raw profile header data structure. */
/* Please update llvm/docs/InstrProfileFormat.rst as appropriate when updating
@ -761,6 +793,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
* specified via command line. */
#define INSTR_PROF_PROFILE_NAME_VAR __llvm_profile_filename
/* GPU profiling section bounds structure, populated by the compiler runtime
* and read by the host to extract profiling data. */
#define INSTR_PROF_SECT_BOUNDS_TABLE __llvm_profile_sections
/* section name strings common to all targets other
than WIN32 */
#define INSTR_PROF_DATA_COMMON __llvm_prf_data

View File

@ -486,25 +486,18 @@ bool isGPUProfTarget(const Module &M) {
}
void setPGOFuncVisibility(Module &M, GlobalVariable *FuncNameVar) {
// If the target is a GPU, make the symbol protected so it can
// be read from the host device
if (isGPUProfTarget(M))
FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility);
// Hide the symbol so that we correctly get a copy for each executable.
else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage()))
FuncNameVar->setVisibility(GlobalValue::HiddenVisibility);
}
GlobalVariable *createPGOFuncNameVar(Module &M,
GlobalValue::LinkageTypes Linkage,
StringRef PGOFuncName) {
// Ensure profiling variables on GPU are visible to be read from host
if (isGPUProfTarget(M))
Linkage = GlobalValue::ExternalLinkage;
// We generally want to match the function's linkage, but available_externally
// and extern_weak both have the wrong semantics, and anything that doesn't
// need to link across compilation units doesn't need to be visible at all.
else if (Linkage == GlobalValue::ExternalWeakLinkage)
if (Linkage == GlobalValue::ExternalWeakLinkage)
Linkage = GlobalValue::LinkOnceAnyLinkage;
else if (Linkage == GlobalValue::AvailableExternallyLinkage)
Linkage = GlobalValue::LinkOnceODRLinkage;

View File

@ -1425,6 +1425,10 @@ static inline Constant *getFuncAddrForProfData(Function *Fn) {
}
static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) {
// NVPTX is an ELF target but PTX does not expose sections or linker symbols.
if (TT.isNVPTX())
return true;
// compiler-rt uses linker support to get data/counters/name start/end for
// ELF, COFF, Mach-O, XCOFF, and Wasm.
if (TT.isOSBinFormatELF() || TT.isOSBinFormatCOFF() ||
@ -1815,10 +1819,6 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
if (isGPUProfTarget(M)) {
Linkage = GlobalValue::ExternalLinkage;
Visibility = GlobalValue::ProtectedVisibility;
}
// If the data variable is not referenced by code (if we don't emit
// @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the
// data variable live under linker GC, the data variable can be private. This
@ -1830,12 +1830,17 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
// If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees
// that other copies must have the same CFG and cannot have value profiling.
// If no hash suffix, other profd copies may be referenced by code.
else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
(TT.isOSBinFormatELF() ||
(!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) &&
(TT.isOSBinFormatELF() ||
(!DataReferencedByCode && TT.isOSBinFormatCOFF()))) {
Linkage = GlobalValue::PrivateLinkage;
Visibility = GlobalValue::DefaultVisibility;
}
// AMDGPU objects are always ET_DYN, so non-local symbols with default
// visibility are preemptible. The CounterPtr label difference emits a REL32
// relocation that lld rejects against preemptible targets.
if (TT.isAMDGPU() && !GlobalValue::isLocalLinkage(Linkage))
Visibility = GlobalValue::ProtectedVisibility;
auto *Data =
new GlobalVariable(M, DataTy, false, Linkage, nullptr, DataVarName);
Constant *RelativeCounterPtr;
@ -1849,6 +1854,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) {
RelativeCounterPtr = ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy);
if (BitmapPtr != nullptr)
RelativeBitmapPtr = ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy);
} else if (TT.isNVPTX()) {
// The NVPTX target cannot handle self-referencing constant expressions in
// global initializers at all. Use absolute pointers and have the runtime
// registration convert them to relative offsets.
DataSectionKind = IPSK_data;
RelativeCounterPtr = ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy);
} else {
// Reference the counter variable with a label difference (link-time
// constant).
@ -1953,10 +1964,6 @@ void InstrLowerer::emitNameData() {
NamesVar = new GlobalVariable(M, NamesVal->getType(), true,
GlobalValue::PrivateLinkage, NamesVal,
getInstrProfNamesVarName());
if (isGPUProfTarget(M)) {
NamesVar->setLinkage(GlobalValue::ExternalLinkage);
NamesVar->setVisibility(GlobalValue::ProtectedVisibility);
}
NamesSize = CompressedNameStr.size();
setGlobalVariableLargeSection(TT, *NamesVar);
@ -2048,6 +2055,11 @@ void InstrLowerer::emitRegistration() {
}
bool InstrLowerer::emitRuntimeHook() {
// GPU profiling data is read directly by the host offload runtime. We do not
// need the standard runtime hook.
if (TT.isGPU())
return false;
// We expect the linker to be invoked with -u<hook_var> flag for Linux
// in which case there is no need to emit the external variable.
if (TT.isOSLinux() || TT.isOSAIX())
@ -2062,10 +2074,7 @@ bool InstrLowerer::emitRuntimeHook() {
auto *Var =
new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage,
nullptr, getInstrProfRuntimeHookVarName());
if (isGPUProfTarget(M))
Var->setVisibility(GlobalValue::ProtectedVisibility);
else
Var->setVisibility(GlobalValue::HiddenVisibility);
Var->setVisibility(GlobalValue::HiddenVisibility);
if (TT.isOSBinFormatELF() && !TT.isPS()) {
// Mark the user variable as used so that it isn't stripped out.

View File

@ -469,9 +469,6 @@ createIRLevelProfileFlagVar(Module &M,
M, IntTy64, true, GlobalValue::WeakAnyLinkage,
Constant::getIntegerValue(IntTy64, APInt(64, ProfileVersion)), VarName);
IRLevelVersionVariable->setVisibility(GlobalValue::HiddenVisibility);
if (isGPUProfTarget(M))
IRLevelVersionVariable->setVisibility(
llvm::GlobalValue::ProtectedVisibility);
Triple TT(M.getTargetTriple());
if (TT.supportsCOMDAT()) {

View File

@ -65,6 +65,12 @@ struct __llvm_profile_data {
#include "llvm/ProfileData/InstrProfData.inc"
};
/// Host-side layout of the device's profile section bounds table. One field
/// is generated per INSTR_PROF_GPU_SECT entry in InstrProfData.inc, in the
/// order the entries appear there. std::remove_const strips any top-level
/// const from the declared field type.
struct __llvm_profile_gpu_sections {
#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer) \
std::remove_const<Type>::type Name;
#include "llvm/ProfileData/InstrProfData.inc"
};
extern "C" {
extern int __attribute__((weak)) __llvm_write_custom_profile(
const char *Target, const __llvm_profile_data *DataBegin,
@ -72,11 +78,14 @@ extern int __attribute__((weak)) __llvm_write_custom_profile(
const char *CountersEnd, const char *NamesBegin, const char *NamesEnd,
const uint64_t *VersionOverride);
}
/// PGO profiling data extracted from a GPU device
/// PGO profiling data extracted from a GPU device via __llvm_profile_sections.
struct GPUProfGlobals {
SmallVector<int64_t> Counts;
SmallVector<__llvm_profile_data> Data;
SmallVector<uint8_t> NamesData;
SmallVector<char> NamesSection;
SmallVector<char> CountersSection;
SmallVector<char> DataSection;
/// Distance from __llvm_prf_data to __llvm_prf_cnts on the device. Used to
/// adjust CounterPtr label differences when remapping to the host buffer.
intptr_t DeviceCountersDelta = 0;
Triple TargetTriple;
uint64_t Version = INSTR_PROF_RAW_VERSION;

View File

@ -16,6 +16,7 @@
#include "Shared/Utils.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/InstrProfData.inc"
#include "llvm/Support/Error.h"
@ -179,67 +180,67 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
Expected<GPUProfGlobals>
GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
DeviceImageTy &Image) {
GPUProfGlobals DeviceProfileData;
const char *TableName = INSTR_PROF_QUOTE(INSTR_PROF_SECT_BOUNDS_TABLE);
if (!isSymbolInImage(Device, Image, TableName))
return GPUProfGlobals{};
GPUProfGlobals ProfData;
auto ObjFile = getELFObjectFile(Image);
if (!ObjFile)
return ObjFile.takeError();
std::unique_ptr<ELFObjectFileBase> ELFObj(
static_cast<ELFObjectFileBase *>(ObjFile->release()));
DeviceProfileData.TargetTriple = ELFObj->makeTriple();
ProfData.TargetTriple = ELFObj->makeTriple();
// Iterate through elf symbols
for (auto &Sym : ELFObj->symbols()) {
auto NameOrErr = Sym.getName();
if (!NameOrErr)
return NameOrErr.takeError();
__llvm_profile_gpu_sections Table = {};
GlobalTy TableGlobal(TableName, sizeof(Table), &Table);
if (auto Err = readGlobalFromDevice(Device, Image, TableGlobal))
return Err;
// Check if given current global is a profiling global based
// on name
if (*NameOrErr == getInstrProfNamesVarName()) {
// Read in profiled function names from ELF
auto SectionOrErr = Sym.getSection();
if (!SectionOrErr)
return SectionOrErr.takeError();
// Read the contiguous data from one of the profiling sections on the device.
// Copies the device bytes in [Start, Stop) into Out with a single
// dataRetrieve call. An empty range skips the transfer and returns success.
auto ReadSection = [&](const void *Start, const void *Stop,
SmallVector<char> &Out) -> Error {
uintptr_t Begin = reinterpret_cast<uintptr_t>(Start);
uintptr_t End = reinterpret_cast<uintptr_t>(Stop);
// NOTE(review): this is an unsigned subtraction. If Stop were ever below
// Start it wraps to a huge size before the resize below. The bounds come from
// the device table, so consider validating Begin <= End.
size_t Size = End - Begin;
// The transfer below overwrites the buffer, so the new elements need no
// initialization.
Out.resize_for_overwrite(Size);
return Size ? Device.dataRetrieve(Out.data(), Start, Size,
/*AsyncInfo=*/nullptr)
: Error::success();
};
auto ContentsOrErr = (*SectionOrErr)->getContents();
if (!ContentsOrErr)
return ContentsOrErr.takeError();
if (auto Err =
ReadSection(Table.NamesStart, Table.NamesStop, ProfData.NamesSection))
return Err;
if (auto Err = ReadSection(Table.CountersStart, Table.CountersStop,
ProfData.CountersSection))
return Err;
if (auto Err =
ReadSection(Table.DataStart, Table.DataStop, ProfData.DataSection))
return Err;
SmallVector<uint8_t> NameBytes(ContentsOrErr->bytes());
DeviceProfileData.NamesData = NameBytes;
} else if (NameOrErr->starts_with(getInstrProfCountersVarPrefix())) {
// Read global variable profiling counts
SmallVector<int64_t> Counts(Sym.getSize() / sizeof(int64_t), 0);
GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data());
if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal))
return Err;
DeviceProfileData.Counts.append(std::move(Counts));
} else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) {
// Read profiling data for this global variable
__llvm_profile_data Data{};
GlobalTy DataGlobal(NameOrErr->str(), Sym.getSize(), &Data);
if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal))
return Err;
DeviceProfileData.Data.push_back(std::move(Data));
} else if (*NameOrErr == INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)) {
uint64_t RawVersionData;
GlobalTy RawVersionGlobal(NameOrErr->str(), Sym.getSize(),
&RawVersionData);
if (auto Err = readGlobalFromDevice(Device, Image, RawVersionGlobal))
return Err;
DeviceProfileData.Version = RawVersionData;
}
}
return DeviceProfileData;
ProfData.DeviceCountersDelta =
reinterpret_cast<intptr_t>(Table.CountersStart) -
reinterpret_cast<intptr_t>(Table.DataStart);
// Get the profiling version from the device.
if (auto Err = Device.dataRetrieve(&ProfData.Version, Table.VersionVar,
sizeof(uint64_t),
/*AsyncInfo=*/nullptr))
return Err;
return ProfData;
}
void GPUProfGlobals::dump() const {
outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str()
<< "\n";
outs() << "======== Counters =========\n";
for (size_t i = 0; i < Counts.size(); i++) {
size_t NumCounters = CountersSection.size() / sizeof(int64_t);
outs() << "======== Counters (" << NumCounters << ") =========\n";
auto *Counts = reinterpret_cast<const int64_t *>(CountersSection.data());
for (size_t i = 0; i < NumCounters; i++) {
if (i > 0 && i % 10 == 0)
outs() << "\n";
else if (i != 0)
@ -248,33 +249,14 @@ void GPUProfGlobals::dump() const {
}
outs() << "\n";
outs() << "========== Data ===========\n";
for (const auto &ProfData : Data) {
outs() << "{ ";
// The ProfData.Name maybe array, eg: NumValueSites[IPVK_Last+1] .
// If we print out it directly, we are accessing out of bound data.
// Skip dumping the array for now.
#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \
if (sizeof(#Name) > 2 && #Name[sizeof(#Name) - 2] == ']') { \
outs() << "[...] "; \
} else { \
outs() << ProfData.Name << " "; \
}
#include "llvm/ProfileData/InstrProfData.inc"
outs() << "}\n";
}
size_t NumDataEntries = DataSection.size() / sizeof(__llvm_profile_data);
outs() << "========== Data (" << NumDataEntries << ") ===========\n";
outs() << "======== Functions ========\n";
std::string s;
s.reserve(NamesData.size());
for (uint8_t Name : NamesData) {
s.push_back((char)Name);
}
InstrProfSymtab Symtab;
if (Error Err = Symtab.create(StringRef(s))) {
if (Error Err =
Symtab.create(StringRef(NamesSection.data(), NamesSection.size())))
consumeError(std::move(Err));
}
Symtab.dumpNames(outs());
outs() << "===========================\n";
}
@ -286,35 +268,36 @@ Error GPUProfGlobals::write() const {
"The compiler-rt profiling library must be linked for "
"GPU PGO to work.");
size_t DataSize = Data.size() * sizeof(__llvm_profile_data),
CountsSize = Counts.size() * sizeof(int64_t);
__llvm_profile_data *DataBegin, *DataEnd;
char *CountersBegin, *CountersEnd, *NamesBegin, *NamesEnd;
// Lay out as [Data][Counters][Names] to match the raw profile format order.
// TODO: Move this interface to compiler-rt.
SmallVector<char> Buffer(DataSection.size() + CountersSection.size() +
NamesSection.size());
char *DataBegin = Buffer.data();
char *CountersBegin = DataBegin + DataSection.size();
char *NamesBegin = CountersBegin + CountersSection.size();
// Initialize array of contiguous data. We need to make sure each section is
// contiguous so that the PGO library can compute deltas properly
SmallVector<uint8_t> ContiguousData(NamesData.size() + DataSize + CountsSize);
memcpy(DataBegin, DataSection.data(), DataSection.size());
memcpy(CountersBegin, CountersSection.data(), CountersSection.size());
memcpy(NamesBegin, NamesSection.data(), NamesSection.size());
// Compute region pointers
DataBegin = (__llvm_profile_data *)(ContiguousData.data() + CountsSize);
DataEnd =
(__llvm_profile_data *)(ContiguousData.data() + CountsSize + DataSize);
CountersBegin = (char *)ContiguousData.data();
CountersEnd = (char *)(ContiguousData.data() + CountsSize);
NamesBegin = (char *)(ContiguousData.data() + CountsSize + DataSize);
NamesEnd = (char *)(ContiguousData.data() + CountsSize + DataSize +
NamesData.size());
// Adjust CounterPtr values so they are consistent with the host layout rather
// than the device layout.
intptr_t HostDelta = CountersBegin - DataBegin;
intptr_t Adjustment = HostDelta - DeviceCountersDelta;
auto *Records = reinterpret_cast<__llvm_profile_data *>(DataBegin);
size_t NumRecords = DataSection.size() / sizeof(__llvm_profile_data);
for (size_t I = 0; I < NumRecords; I++)
Records[I].CounterPtr = reinterpret_cast<void *>(
reinterpret_cast<intptr_t>(Records[I].CounterPtr) + Adjustment);
// Copy data to contiguous buffer
memcpy(DataBegin, Data.data(), DataSize);
memcpy(CountersBegin, Counts.data(), CountsSize);
memcpy(NamesBegin, NamesData.data(), NamesData.size());
// Invoke compiler-rt entrypoint
int result = __llvm_write_custom_profile(
TargetTriple.str().c_str(), DataBegin, DataEnd, CountersBegin,
CountersEnd, NamesBegin, NamesEnd, &Version);
if (result != 0)
int Result = __llvm_write_custom_profile(
TargetTriple.str().c_str(),
reinterpret_cast<const __llvm_profile_data *>(DataBegin),
reinterpret_cast<const __llvm_profile_data *>(DataBegin +
DataSection.size()),
CountersBegin, CountersBegin + CountersSection.size(), NamesBegin,
NamesBegin + NamesSection.size(), &Version);
if (Result != 0)
return Plugin::error(ErrorCode::HOST_IO,
"error writing GPU PGO data to file");
@ -322,5 +305,5 @@ Error GPUProfGlobals::write() const {
}
bool GPUProfGlobals::empty() const {
return Counts.empty() && Data.empty() && NamesData.empty();
return CountersSection.empty() && DataSection.empty() && NamesSection.empty();
}