From ffd6a13b5ffed8bb2e314db29be784381f984ae1 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 26 Mar 2026 10:17:43 -0500 Subject: [PATCH] [compiler-rt] Rework profile data handling for GPU targets (#187136) Summary: Currently, the host runtime iterates through all of the symbols present in the GPU image and copies the profiling ones by prefix. This is inefficient as it requires a lot of small high-latency data transfers rather than a few large ones. Additionally, we force every single profiling symbol to have protected visibility. This means potentially hundreds of unnecessary symbols in the symbol table. This PR changes the interface to move towards the start / stop section handling. AMDGPU supports this natively as an ELF target, so it needs few changes. Instead of overriding visibility, we use a single table to define the bounds that we can obtain with one contiguous load. Using a table interface should also work for the in-progress HIP implementation for this, as it wraps the start / stop sections into standard void pointers which will be inside of an already mapped region of memory, so they should be accessible from the HIP API. NVPTX is more difficult as it is an ELF platform without this support. I have hooked up the 'Other' handling to work around this, but even then it's a bit of a stretch. I could remove this support here, but I wanted to demonstrate that we can share the ABI. 
However, NVPTX will only work if we force LTO and change the backend to emit variables in the same section next to each other. TL;DR, we now do this: ```c struct { start1, stop1, start2, stop2, start3, stop3, version; } device; struct host = DtoH(lookup("device")); counters = DtoH(host.stop - host.start) version = DtoH(host.version); ``` --- compiler-rt/include/profile/InstrProfData.inc | 36 ++++ compiler-rt/lib/profile/InstrProfiling.h | 5 + .../lib/profile/InstrProfilingPlatformGPU.c | 44 +++++ .../lib/profile/InstrProfilingPlatformLinux.c | 2 +- .../lib/profile/InstrProfilingPlatformOther.c | 47 +++-- .../llvm/ProfileData/InstrProfData.inc | 36 ++++ llvm/lib/ProfileData/InstrProf.cpp | 11 +- .../Instrumentation/InstrProfiling.cpp | 39 ++-- .../Instrumentation/PGOInstrumentation.cpp | 3 - .../common/include/GlobalHandler.h | 17 +- .../common/src/GlobalHandler.cpp | 173 ++++++++---------- 11 files changed, 274 insertions(+), 139 deletions(-) diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index 46d6bb5bd889..7525feab8f13 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -142,6 +142,38 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \ #undef INSTR_PROF_VALUE_NODE /* INSTR_PROF_VALUE_NODE end. */ +/* INSTR_PROF_GPU_SECT start. */ +/* Fields of the GPU profile section bounds structure, populated by the + * compiler runtime and read by the host to extract profiling data. 
*/ +#ifndef INSTR_PROF_GPU_SECT +#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_DATA_DEFINED +#endif +INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \ + NamesStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \ + NamesStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \ + CountersStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \ + CountersStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \ + Ctx), DataStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \ + Ctx), DataStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(uint64_t *, llvm::PointerType::getUnqual(Ctx), \ + VersionVar, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +#undef INSTR_PROF_GPU_SECT +/* INSTR_PROF_GPU_SECT end. */ + /* INSTR_PROF_RAW_HEADER start */ /* Definition of member fields of the raw profile header data structure. */ /* Please update llvm/docs/InstrProfileFormat.rst as appropriate when updating @@ -761,6 +793,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, * specified via command line. */ #define INSTR_PROF_PROFILE_NAME_VAR __llvm_profile_filename +/* GPU profiling section bounds structure, populated by the compiler runtime + * and read by the host to extract profiling data. 
*/ +#define INSTR_PROF_SECT_BOUNDS_TABLE __llvm_profile_sections + /* section name strings common to all targets other than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 54013d7e6568..1d22934bd6ef 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -57,6 +57,11 @@ typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData { #include "profile/InstrProfData.inc" } VTableProfData; +typedef struct __llvm_profile_gpu_sections { +#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer) Type Name; +#include "profile/InstrProfData.inc" +} __llvm_profile_gpu_sections; + typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) __llvm_gcov_init_func_struct { #define COVINIT_FUNC(Type, LLVMType, Name, Initializer) Type Name; diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c index 78bf512f8c44..ab7031343c85 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformGPU.c @@ -17,6 +17,9 @@ #include "InstrProfiling.h" #include +// Symbols exported to the GPU runtime need to be visible in the .dynsym table. +#define COMPILER_RT_GPU_VISIBILITY __attribute__((visibility("protected"))) + // Indicates that the current wave is fully occupied. 
static int is_uniform(uint64_t mask) { const uint64_t uniform_mask = ~0ull >> (64 - __gpu_num_lanes()); @@ -39,4 +42,45 @@ COMPILER_RT_VISIBILITY void __llvm_profile_instrument_gpu(uint64_t *counter, } } +#if defined(__AMDGPU__) + +#define PROF_NAME_START INSTR_PROF_SECT_START(INSTR_PROF_NAME_COMMON) +#define PROF_NAME_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_NAME_COMMON) +#define PROF_CNTS_START INSTR_PROF_SECT_START(INSTR_PROF_CNTS_COMMON) +#define PROF_CNTS_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_CNTS_COMMON) +#define PROF_DATA_START INSTR_PROF_SECT_START(INSTR_PROF_DATA_COMMON) +#define PROF_DATA_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_DATA_COMMON) + +extern char PROF_NAME_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_NAME_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_CNTS_START[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern char PROF_CNTS_STOP[] COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern __llvm_profile_data PROF_DATA_START[] COMPILER_RT_VISIBILITY + COMPILER_RT_WEAK; +extern __llvm_profile_data PROF_DATA_STOP[] COMPILER_RT_VISIBILITY + COMPILER_RT_WEAK; + +// AMDGPU is a proper ELF target and exports the linker-defined section bounds. +COMPILER_RT_GPU_VISIBILITY +__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = { + PROF_NAME_START, + PROF_NAME_STOP, + PROF_CNTS_START, + PROF_CNTS_STOP, + PROF_DATA_START, + PROF_DATA_STOP, + &INSTR_PROF_RAW_VERSION_VAR}; + +#elif defined(__NVPTX__) + +// NVPTX supports neither sections nor ELF symbols, we rely on the handling in +// the 'InstrProfilingPlatformOther.c' file to fill this at initialization time. +// FIXME: This will not work until we make the NVPTX backend emit section +// globals next to each other. 
+COMPILER_RT_GPU_VISIBILITY +__llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE = { + NULL, NULL, NULL, NULL, NULL, NULL, &INSTR_PROF_RAW_VERSION_VAR}; + +#endif + #endif diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c index acdb222004fd..7a22be6bb586 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -23,7 +23,7 @@ #if defined(__linux__) || defined(__FreeBSD__) || defined(__Fuchsia__) || \ (defined(__sun__) && defined(__svr4__)) || defined(__NetBSD__) || \ defined(_AIX) || defined(__wasm__) || defined(__HAIKU__) || \ - defined(COMPILER_RT_PROFILE_BAREMETAL) + (defined(COMPILER_RT_PROFILE_BAREMETAL) && !defined(__NVPTX__)) #if !defined(_AIX) && !defined(__wasm__) && \ !defined(COMPILER_RT_PROFILE_BAREMETAL) diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c index f5d1c74f1011..205bba1060c3 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformOther.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformOther.c @@ -13,28 +13,38 @@ // This implementation expects the compiler instrumentation pass to define a // constructor in each file which calls into this file. 
-#if !defined(__APPLE__) && !defined(__linux__) && !defined(__FreeBSD__) && \ - !defined(__Fuchsia__) && !(defined(__sun__) && defined(__svr4__)) && \ - !defined(__NetBSD__) && !defined(_WIN32) && !defined(_AIX) && \ - !defined(__wasm__) && !defined(__HAIKU__) && \ - !defined(COMPILER_RT_PROFILE_BAREMETAL) - -#include -#include +#if (!defined(__APPLE__) && !defined(__linux__) && !defined(__FreeBSD__) && \ + !defined(__Fuchsia__) && !(defined(__sun__) && defined(__svr4__)) && \ + !defined(__NetBSD__) && !defined(_WIN32) && !defined(_AIX) && \ + !defined(__wasm__) && !defined(__HAIKU__) && \ + !defined(COMPILER_RT_PROFILE_BAREMETAL)) || \ + defined(__NVPTX__) #include "InstrProfiling.h" #include "InstrProfilingInternal.h" +#if defined(__NVPTX__) +extern __llvm_profile_gpu_sections INSTR_PROF_SECT_BOUNDS_TABLE; +#define DataFirst INSTR_PROF_SECT_BOUNDS_TABLE.DataStart +#define DataLast INSTR_PROF_SECT_BOUNDS_TABLE.DataStop +#define NamesFirst INSTR_PROF_SECT_BOUNDS_TABLE.NamesStart +#define NamesLast INSTR_PROF_SECT_BOUNDS_TABLE.NamesStop +#define CountersFirst INSTR_PROF_SECT_BOUNDS_TABLE.CountersStart +#define CountersLast INSTR_PROF_SECT_BOUNDS_TABLE.CountersStop +#else static const __llvm_profile_data *DataFirst = NULL; static const __llvm_profile_data *DataLast = NULL; -static const VTableProfData *VTableProfDataFirst = NULL; -static const VTableProfData *VTableProfDataLast = NULL; static const char *NamesFirst = NULL; static const char *NamesLast = NULL; -static const char *VNamesFirst = NULL; -static const char *VNamesLast = NULL; static char *CountersFirst = NULL; static char *CountersLast = NULL; +#endif +static const VTableProfData *VTableProfDataFirst = NULL; +static const VTableProfData *VTableProfDataLast = NULL; +static const char *VNamesFirst = NULL; +static const char *VNamesLast = NULL; +static char *BitmapFirst = NULL; +static char *BitmapLast = NULL; static const void *getMinAddr(const void *A1, const void *A2) { return A1 < A2 ? 
A1 : A2; @@ -55,6 +65,19 @@ COMPILER_RT_VISIBILITY void __llvm_profile_register_function(void *Data_) { /* TODO: Only emit this function if we can't use linker magic. */ const __llvm_profile_data *Data = (__llvm_profile_data *)Data_; + +#if defined(__NVPTX__) + // NVPTX stores absolute counter addresses to avoid circular dependencies in + // PTX global variable initializers. Convert to a relative offset so the + // host-side profile reader sees the standard format. + { + uintptr_t Rel = (uintptr_t)Data->CounterPtr - (uintptr_t)Data_; + __builtin_memcpy((char *)Data_ + + __builtin_offsetof(__llvm_profile_data, CounterPtr), + &Rel, sizeof(Rel)); + } +#endif + if (!DataFirst) { DataFirst = Data; DataLast = Data + 1; diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 46d6bb5bd889..7525feab8f13 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -142,6 +142,38 @@ INSTR_PROF_VALUE_NODE(PtrToNodeT, llvm::PointerType::getUnqual(Ctx), Next, \ #undef INSTR_PROF_VALUE_NODE /* INSTR_PROF_VALUE_NODE end. */ +/* INSTR_PROF_GPU_SECT start. */ +/* Fields of the GPU profile section bounds structure, populated by the + * compiler runtime and read by the host to extract profiling data. 
*/ +#ifndef INSTR_PROF_GPU_SECT +#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_DATA_DEFINED +#endif +INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \ + NamesStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const char *, llvm::PointerType::getUnqual(Ctx), \ + NamesStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \ + CountersStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(char *, llvm::PointerType::getUnqual(Ctx), \ + CountersStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \ + Ctx), DataStart, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(const __llvm_profile_data *, llvm::PointerType::getUnqual( \ + Ctx), DataStop, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +INSTR_PROF_GPU_SECT(uint64_t *, llvm::PointerType::getUnqual(Ctx), \ + VersionVar, \ + ConstantPointerNull::get(llvm::PointerType::getUnqual(Ctx))) +#undef INSTR_PROF_GPU_SECT +/* INSTR_PROF_GPU_SECT end. */ + /* INSTR_PROF_RAW_HEADER start */ /* Definition of member fields of the raw profile header data structure. */ /* Please update llvm/docs/InstrProfileFormat.rst as appropriate when updating @@ -761,6 +793,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, * specified via command line. */ #define INSTR_PROF_PROFILE_NAME_VAR __llvm_profile_filename +/* GPU profiling section bounds structure, populated by the compiler runtime + * and read by the host to extract profiling data. 
*/ +#define INSTR_PROF_SECT_BOUNDS_TABLE __llvm_profile_sections + /* section name strings common to all targets other than WIN32 */ #define INSTR_PROF_DATA_COMMON __llvm_prf_data diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 82469481881c..b96db851fa6b 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -486,25 +486,18 @@ bool isGPUProfTarget(const Module &M) { } void setPGOFuncVisibility(Module &M, GlobalVariable *FuncNameVar) { - // If the target is a GPU, make the symbol protected so it can - // be read from the host device - if (isGPUProfTarget(M)) - FuncNameVar->setVisibility(GlobalValue::ProtectedVisibility); // Hide the symbol so that we correctly get a copy for each executable. - else if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) + if (!GlobalValue::isLocalLinkage(FuncNameVar->getLinkage())) FuncNameVar->setVisibility(GlobalValue::HiddenVisibility); } GlobalVariable *createPGOFuncNameVar(Module &M, GlobalValue::LinkageTypes Linkage, StringRef PGOFuncName) { - // Ensure profiling variables on GPU are visible to be read from host - if (isGPUProfTarget(M)) - Linkage = GlobalValue::ExternalLinkage; // We generally want to match the function's linkage, but available_externally // and extern_weak both have the wrong semantics, and anything that doesn't // need to link across compilation units doesn't need to be visible at all. 
- else if (Linkage == GlobalValue::ExternalWeakLinkage) + if (Linkage == GlobalValue::ExternalWeakLinkage) Linkage = GlobalValue::LinkOnceAnyLinkage; else if (Linkage == GlobalValue::AvailableExternallyLinkage) Linkage = GlobalValue::LinkOnceODRLinkage; diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index d1696f4afbe3..dabd495cddd4 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1425,6 +1425,10 @@ static inline Constant *getFuncAddrForProfData(Function *Fn) { } static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) { + // NVPTX is an ELF target but PTX does not expose sections or linker symbols. + if (TT.isNVPTX()) + return true; + // compiler-rt uses linker support to get data/counters/name start/end for // ELF, COFF, Mach-O, XCOFF, and Wasm. if (TT.isOSBinFormatELF() || TT.isOSBinFormatCOFF() || @@ -1815,10 +1819,6 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]); - if (isGPUProfTarget(M)) { - Linkage = GlobalValue::ExternalLinkage; - Visibility = GlobalValue::ProtectedVisibility; - } // If the data variable is not referenced by code (if we don't emit // @llvm.instrprof.value.profile, NS will be 0), and the counter keeps the // data variable live under linker GC, the data variable can be private. This @@ -1830,12 +1830,17 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { // If profd is in a deduplicate comdat, NS==0 with a hash suffix guarantees // that other copies must have the same CFG and cannot have value profiling. // If no hash suffix, other profd copies may be referenced by code. 
- else if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && - (TT.isOSBinFormatELF() || - (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { + if (NS == 0 && !(DataReferencedByCode && NeedComdat && !Renamed) && + (TT.isOSBinFormatELF() || + (!DataReferencedByCode && TT.isOSBinFormatCOFF()))) { Linkage = GlobalValue::PrivateLinkage; Visibility = GlobalValue::DefaultVisibility; } + // AMDGPU objects are always ET_DYN, so non-local symbols with default + // visibility are preemptible. The CounterPtr label difference emits a REL32 + // relocation that lld rejects against preemptible targets. + if (TT.isAMDGPU() && !GlobalValue::isLocalLinkage(Linkage)) + Visibility = GlobalValue::ProtectedVisibility; auto *Data = new GlobalVariable(M, DataTy, false, Linkage, nullptr, DataVarName); Constant *RelativeCounterPtr; @@ -1849,6 +1854,12 @@ void InstrLowerer::createDataVariable(InstrProfCntrInstBase *Inc) { RelativeCounterPtr = ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy); if (BitmapPtr != nullptr) RelativeBitmapPtr = ConstantExpr::getPtrToInt(BitmapPtr, IntPtrTy); + } else if (TT.isNVPTX()) { + // The NVPTX target cannot handle self-referencing constant expressions in + // global initializers at all. Use absolute pointers and have the runtime + // registration convert them to relative offsets. + DataSectionKind = IPSK_data; + RelativeCounterPtr = ConstantExpr::getPtrToInt(CounterPtr, IntPtrTy); } else { // Reference the counter variable with a label difference (link-time // constant). 
@@ -1953,10 +1964,6 @@ void InstrLowerer::emitNameData() { NamesVar = new GlobalVariable(M, NamesVal->getType(), true, GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); - if (isGPUProfTarget(M)) { - NamesVar->setLinkage(GlobalValue::ExternalLinkage); - NamesVar->setVisibility(GlobalValue::ProtectedVisibility); - } NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); @@ -2048,6 +2055,11 @@ void InstrLowerer::emitRegistration() { } bool InstrLowerer::emitRuntimeHook() { + // GPU profiling data is read directly by the host offload runtime. We do not + // need the standard runtime hook. + if (TT.isGPU()) + return false; + // We expect the linker to be invoked with -u flag for Linux // in which case there is no need to emit the external variable. if (TT.isOSLinux() || TT.isOSAIX()) @@ -2062,10 +2074,7 @@ bool InstrLowerer::emitRuntimeHook() { auto *Var = new GlobalVariable(M, Int32Ty, false, GlobalValue::ExternalLinkage, nullptr, getInstrProfRuntimeHookVarName()); - if (isGPUProfTarget(M)) - Var->setVisibility(GlobalValue::ProtectedVisibility); - else - Var->setVisibility(GlobalValue::HiddenVisibility); + Var->setVisibility(GlobalValue::HiddenVisibility); if (TT.isOSBinFormatELF() && !TT.isPS()) { // Mark the user variable as used so that it isn't stripped out. 
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 0232d45e5b7b..db032d6fcad4 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -469,9 +469,6 @@ createIRLevelProfileFlagVar(Module &M, M, IntTy64, true, GlobalValue::WeakAnyLinkage, Constant::getIntegerValue(IntTy64, APInt(64, ProfileVersion)), VarName); IRLevelVersionVariable->setVisibility(GlobalValue::HiddenVisibility); - if (isGPUProfTarget(M)) - IRLevelVersionVariable->setVisibility( - llvm::GlobalValue::ProtectedVisibility); Triple TT(M.getTargetTriple()); if (TT.supportsCOMDAT()) { diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h index af7dac66ca85..fc8d6fe38475 100644 --- a/offload/plugins-nextgen/common/include/GlobalHandler.h +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -65,6 +65,12 @@ struct __llvm_profile_data { #include "llvm/ProfileData/InstrProfData.inc" }; +struct __llvm_profile_gpu_sections { +#define INSTR_PROF_GPU_SECT(Type, LLVMType, Name, Initializer) \ + std::remove_const::type Name; +#include "llvm/ProfileData/InstrProfData.inc" +}; + extern "C" { extern int __attribute__((weak)) __llvm_write_custom_profile( const char *Target, const __llvm_profile_data *DataBegin, @@ -72,11 +78,14 @@ extern int __attribute__((weak)) __llvm_write_custom_profile( const char *CountersEnd, const char *NamesBegin, const char *NamesEnd, const uint64_t *VersionOverride); } -/// PGO profiling data extracted from a GPU device +/// PGO profiling data extracted from a GPU device via __llvm_profile_sections. 
struct GPUProfGlobals { - SmallVector Counts; - SmallVector<__llvm_profile_data> Data; - SmallVector NamesData; + SmallVector NamesSection; + SmallVector CountersSection; + SmallVector DataSection; + /// Distance from __llvm_prf_data to __llvm_prf_cnts on the device. Used to + /// adjust CounterPtr label differences when remapping to the host buffer. + intptr_t DeviceCountersDelta = 0; Triple TargetTriple; uint64_t Version = INSTR_PROF_RAW_VERSION; diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index b92c606d14da..9216834b1e15 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -16,6 +16,7 @@ #include "Shared/Utils.h" +#include "llvm/ProfileData/InstrProf.h" #include "llvm/ProfileData/InstrProfData.inc" #include "llvm/Support/Error.h" @@ -179,67 +180,67 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, Expected GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image) { - GPUProfGlobals DeviceProfileData; + const char *TableName = INSTR_PROF_QUOTE(INSTR_PROF_SECT_BOUNDS_TABLE); + if (!isSymbolInImage(Device, Image, TableName)) + return GPUProfGlobals{}; + + GPUProfGlobals ProfData; auto ObjFile = getELFObjectFile(Image); if (!ObjFile) return ObjFile.takeError(); std::unique_ptr ELFObj( static_cast(ObjFile->release())); - DeviceProfileData.TargetTriple = ELFObj->makeTriple(); + ProfData.TargetTriple = ELFObj->makeTriple(); - // Iterate through elf symbols - for (auto &Sym : ELFObj->symbols()) { - auto NameOrErr = Sym.getName(); - if (!NameOrErr) - return NameOrErr.takeError(); + __llvm_profile_gpu_sections Table = {}; + GlobalTy TableGlobal(TableName, sizeof(Table), &Table); + if (auto Err = readGlobalFromDevice(Device, Image, TableGlobal)) + return Err; - // Check if given current global is a profiling global based - // on name - if (*NameOrErr == 
getInstrProfNamesVarName()) { - // Read in profiled function names from ELF - auto SectionOrErr = Sym.getSection(); - if (!SectionOrErr) - return SectionOrErr.takeError(); + // Read the contiguous data from one of the profiling sections on the device. + auto ReadSection = [&](const void *Start, const void *Stop, + SmallVector &Out) -> Error { + uintptr_t Begin = reinterpret_cast(Start); + uintptr_t End = reinterpret_cast(Stop); + size_t Size = End - Begin; + Out.resize_for_overwrite(Size); + return Size ? Device.dataRetrieve(Out.data(), Start, Size, + /*AsyncInfo=*/nullptr) + : Error::success(); + }; - auto ContentsOrErr = (*SectionOrErr)->getContents(); - if (!ContentsOrErr) - return ContentsOrErr.takeError(); + if (auto Err = + ReadSection(Table.NamesStart, Table.NamesStop, ProfData.NamesSection)) + return Err; + if (auto Err = ReadSection(Table.CountersStart, Table.CountersStop, + ProfData.CountersSection)) + return Err; + if (auto Err = + ReadSection(Table.DataStart, Table.DataStop, ProfData.DataSection)) + return Err; - SmallVector NameBytes(ContentsOrErr->bytes()); - DeviceProfileData.NamesData = NameBytes; - } else if (NameOrErr->starts_with(getInstrProfCountersVarPrefix())) { - // Read global variable profiling counts - SmallVector Counts(Sym.getSize() / sizeof(int64_t), 0); - GlobalTy CountGlobal(NameOrErr->str(), Sym.getSize(), Counts.data()); - if (auto Err = readGlobalFromDevice(Device, Image, CountGlobal)) - return Err; - DeviceProfileData.Counts.append(std::move(Counts)); - } else if (NameOrErr->starts_with(getInstrProfDataVarPrefix())) { - // Read profiling data for this global variable - __llvm_profile_data Data{}; - GlobalTy DataGlobal(NameOrErr->str(), Sym.getSize(), &Data); - if (auto Err = readGlobalFromDevice(Device, Image, DataGlobal)) - return Err; - DeviceProfileData.Data.push_back(std::move(Data)); - } else if (*NameOrErr == INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)) { - uint64_t RawVersionData; - GlobalTy 
RawVersionGlobal(NameOrErr->str(), Sym.getSize(), - &RawVersionData); - if (auto Err = readGlobalFromDevice(Device, Image, RawVersionGlobal)) - return Err; - DeviceProfileData.Version = RawVersionData; - } - } - return DeviceProfileData; + ProfData.DeviceCountersDelta = + reinterpret_cast(Table.CountersStart) - + reinterpret_cast(Table.DataStart); + + // Get the profiling version from the device. + if (auto Err = Device.dataRetrieve(&ProfData.Version, Table.VersionVar, + sizeof(uint64_t), + /*AsyncInfo=*/nullptr)) + return Err; + + return ProfData; } void GPUProfGlobals::dump() const { outs() << "======= GPU Profile =======\nTarget: " << TargetTriple.str() << "\n"; - outs() << "======== Counters =========\n"; - for (size_t i = 0; i < Counts.size(); i++) { + size_t NumCounters = CountersSection.size() / sizeof(int64_t); + outs() << "======== Counters (" << NumCounters << ") =========\n"; + auto *Counts = reinterpret_cast(CountersSection.data()); + for (size_t i = 0; i < NumCounters; i++) { if (i > 0 && i % 10 == 0) outs() << "\n"; else if (i != 0) @@ -248,33 +249,14 @@ void GPUProfGlobals::dump() const { } outs() << "\n"; - outs() << "========== Data ===========\n"; - for (const auto &ProfData : Data) { - outs() << "{ "; -// The ProfData.Name maybe array, eg: NumValueSites[IPVK_Last+1] . -// If we print out it directly, we are accessing out of bound data. -// Skip dumping the array for now. -#define INSTR_PROF_DATA(Type, LLVMType, Name, Initializer) \ - if (sizeof(#Name) > 2 && #Name[sizeof(#Name) - 2] == ']') { \ - outs() << "[...] 
"; \ - } else { \ - outs() << ProfData.Name << " "; \ - } -#include "llvm/ProfileData/InstrProfData.inc" - outs() << "}\n"; - } + size_t NumDataEntries = DataSection.size() / sizeof(__llvm_profile_data); + outs() << "========== Data (" << NumDataEntries << ") ===========\n"; outs() << "======== Functions ========\n"; - std::string s; - s.reserve(NamesData.size()); - for (uint8_t Name : NamesData) { - s.push_back((char)Name); - } - InstrProfSymtab Symtab; - if (Error Err = Symtab.create(StringRef(s))) { + if (Error Err = + Symtab.create(StringRef(NamesSection.data(), NamesSection.size()))) consumeError(std::move(Err)); - } Symtab.dumpNames(outs()); outs() << "===========================\n"; } @@ -286,35 +268,36 @@ Error GPUProfGlobals::write() const { "The compiler-rt profiling library must be linked for " "GPU PGO to work."); - size_t DataSize = Data.size() * sizeof(__llvm_profile_data), - CountsSize = Counts.size() * sizeof(int64_t); - __llvm_profile_data *DataBegin, *DataEnd; - char *CountersBegin, *CountersEnd, *NamesBegin, *NamesEnd; + // Lay out as [Data][Counters][Names] to match the raw profile format order. + // TODO: Move this interface to compiler-rt. + SmallVector Buffer(DataSection.size() + CountersSection.size() + + NamesSection.size()); + char *DataBegin = Buffer.data(); + char *CountersBegin = DataBegin + DataSection.size(); + char *NamesBegin = CountersBegin + CountersSection.size(); - // Initialize array of contiguous data. 
We need to make sure each section is - // contiguous so that the PGO library can compute deltas properly - SmallVector ContiguousData(NamesData.size() + DataSize + CountsSize); + memcpy(DataBegin, DataSection.data(), DataSection.size()); + memcpy(CountersBegin, CountersSection.data(), CountersSection.size()); + memcpy(NamesBegin, NamesSection.data(), NamesSection.size()); - // Compute region pointers - DataBegin = (__llvm_profile_data *)(ContiguousData.data() + CountsSize); - DataEnd = - (__llvm_profile_data *)(ContiguousData.data() + CountsSize + DataSize); - CountersBegin = (char *)ContiguousData.data(); - CountersEnd = (char *)(ContiguousData.data() + CountsSize); - NamesBegin = (char *)(ContiguousData.data() + CountsSize + DataSize); - NamesEnd = (char *)(ContiguousData.data() + CountsSize + DataSize + - NamesData.size()); + // Adjust CounterPtr values so they are consistent with the host layout rather + // than the device layout. + intptr_t HostDelta = CountersBegin - DataBegin; + intptr_t Adjustment = HostDelta - DeviceCountersDelta; + auto *Records = reinterpret_cast<__llvm_profile_data *>(DataBegin); + size_t NumRecords = DataSection.size() / sizeof(__llvm_profile_data); + for (size_t I = 0; I < NumRecords; I++) + Records[I].CounterPtr = reinterpret_cast( + reinterpret_cast(Records[I].CounterPtr) + Adjustment); - // Copy data to contiguous buffer - memcpy(DataBegin, Data.data(), DataSize); - memcpy(CountersBegin, Counts.data(), CountsSize); - memcpy(NamesBegin, NamesData.data(), NamesData.size()); - - // Invoke compiler-rt entrypoint - int result = __llvm_write_custom_profile( - TargetTriple.str().c_str(), DataBegin, DataEnd, CountersBegin, - CountersEnd, NamesBegin, NamesEnd, &Version); - if (result != 0) + int Result = __llvm_write_custom_profile( + TargetTriple.str().c_str(), + reinterpret_cast(DataBegin), + reinterpret_cast(DataBegin + + DataSection.size()), + CountersBegin, CountersBegin + CountersSection.size(), NamesBegin, + NamesBegin + 
NamesSection.size(), &Version); + if (Result != 0) return Plugin::error(ErrorCode::HOST_IO, "error writing GPU PGO data to file"); @@ -322,5 +305,5 @@ Error GPUProfGlobals::write() const { } bool GPUProfGlobals::empty() const { - return Counts.empty() && Data.empty() && NamesData.empty(); + return CountersSection.empty() && DataSection.empty() && NamesSection.empty(); }