[AMDGPU][Offload] Enable memory manager use for up to ~3GB allocation size in omp_target_alloc (#151882)
Enables AMD data center class GPUs to use memory manager memory pooling up to 3GB allocation by default, up from the "1 << 13" threshold that all plugin-nextgen devices use.
This commit is contained in:
parent
c3103068b7
commit
83e5a99ff6
@ -2945,6 +2945,40 @@ private:
|
|||||||
return Plugin::success();
|
return Plugin::success();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool checkIfCoarseGrainMemoryNearOrAbove64GB() {
|
||||||
|
for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
|
||||||
|
if (!Pool->isGlobal() || !Pool->isCoarseGrained())
|
||||||
|
continue;
|
||||||
|
uint64_t Value;
|
||||||
|
hsa_status_t Status =
|
||||||
|
Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
|
||||||
|
if (Status != HSA_STATUS_SUCCESS)
|
||||||
|
continue;
|
||||||
|
constexpr uint64_t Almost64Gig = 0xFF0000000;
|
||||||
|
if (Value >= Almost64Gig)
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false; // CoarseGrain pool w/ 64GB or more capacity not found
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t getMemoryManagerSizeThreshold() override {
|
||||||
|
// Targeting high memory capacity GPUs such as
|
||||||
|
// data center GPUs.
|
||||||
|
if (checkIfCoarseGrainMemoryNearOrAbove64GB()) {
|
||||||
|
// Set GenericDeviceTy::MemoryManager's Threshold to 3GiB,
|
||||||
|
// if threshold is not already set by ENV var
|
||||||
|
// LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD.
|
||||||
|
// This MemoryManager is used for omp_target_alloc(), OpenMP
|
||||||
|
// (non-usm) map clause, etc.
|
||||||
|
//
|
||||||
|
// Ideally, this kind of pooling is best performed at
|
||||||
|
// a common level (e.g, user side of HSA) between OpenMP and HIP
|
||||||
|
// but that feature does not exist (yet).
|
||||||
|
return 3ul * 1024 * 1024 * 1024 /* 3 GiB */;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/// Envar for controlling the number of HSA queues per device. High number of
|
/// Envar for controlling the number of HSA queues per device. High number of
|
||||||
/// queues may degrade performance.
|
/// queues may degrade performance.
|
||||||
UInt32Envar OMPX_NumQueues;
|
UInt32Envar OMPX_NumQueues;
|
||||||
|
@ -1139,6 +1139,9 @@ private:
|
|||||||
/// Pointer to the memory manager or nullptr if not available.
|
/// Pointer to the memory manager or nullptr if not available.
|
||||||
MemoryManagerTy *MemoryManager;
|
MemoryManagerTy *MemoryManager;
|
||||||
|
|
||||||
|
/// Per device setting of MemoryManager's Threshold
|
||||||
|
virtual size_t getMemoryManagerSizeThreshold() { return 0; }
|
||||||
|
|
||||||
/// Environment variables defined by the OpenMP standard.
|
/// Environment variables defined by the OpenMP standard.
|
||||||
Int32Envar OMP_TeamLimit;
|
Int32Envar OMP_TeamLimit;
|
||||||
Int32Envar OMP_NumTeams;
|
Int32Envar OMP_NumTeams;
|
||||||
|
@ -815,8 +815,11 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
|
|||||||
|
|
||||||
// Enable the memory manager if required.
|
// Enable the memory manager if required.
|
||||||
auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
|
auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
|
||||||
if (EnableMM)
|
if (EnableMM) {
|
||||||
|
if (ThresholdMM == 0)
|
||||||
|
ThresholdMM = getMemoryManagerSizeThreshold();
|
||||||
MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
|
MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
|
||||||
|
}
|
||||||
|
|
||||||
return Plugin::success();
|
return Plugin::success();
|
||||||
}
|
}
|
||||||
|
@ -121,6 +121,7 @@ if config.libomptarget_test_pgo:
|
|||||||
# For all other targets, we currently assume it is.
|
# For all other targets, we currently assume it is.
|
||||||
supports_unified_shared_memory = True
|
supports_unified_shared_memory = True
|
||||||
supports_apu = False
|
supports_apu = False
|
||||||
|
supports_large_allocation_memory_pool = False
|
||||||
if config.libomptarget_current_target.startswith('nvptx'):
|
if config.libomptarget_current_target.startswith('nvptx'):
|
||||||
try:
|
try:
|
||||||
cuda_arch = int(config.cuda_test_arch[:3])
|
cuda_arch = int(config.cuda_test_arch[:3])
|
||||||
@ -132,9 +133,11 @@ if config.libomptarget_current_target.startswith('nvptx'):
|
|||||||
elif config.libomptarget_current_target.startswith('amdgcn'):
|
elif config.libomptarget_current_target.startswith('amdgcn'):
|
||||||
# amdgpu_test_arch contains a list of AMD GPUs in the system
|
# amdgpu_test_arch contains a list of AMD GPUs in the system
|
||||||
# only check the first one assuming that we will run the test on it.
|
# only check the first one assuming that we will run the test on it.
|
||||||
if not (config.amdgpu_test_arch.startswith("gfx90a") or
|
if (config.amdgpu_test_arch.startswith("gfx90a") or
|
||||||
config.amdgpu_test_arch.startswith("gfx942") or
|
config.amdgpu_test_arch.startswith("gfx942") or
|
||||||
config.amdgpu_test_arch.startswith("gfx950")):
|
config.amdgpu_test_arch.startswith("gfx950")):
|
||||||
|
supports_large_allocation_memory_pool = True
|
||||||
|
else:
|
||||||
supports_unified_shared_memory = False
|
supports_unified_shared_memory = False
|
||||||
# check if AMD architecture is an APU:
|
# check if AMD architecture is an APU:
|
||||||
if ((config.amdgpu_test_arch.startswith("gfx942") and
|
if ((config.amdgpu_test_arch.startswith("gfx942") and
|
||||||
@ -144,6 +147,8 @@ if supports_unified_shared_memory:
|
|||||||
config.available_features.add('unified_shared_memory')
|
config.available_features.add('unified_shared_memory')
|
||||||
if supports_apu:
|
if supports_apu:
|
||||||
config.available_features.add('apu')
|
config.available_features.add('apu')
|
||||||
|
if supports_large_allocation_memory_pool:
|
||||||
|
config.available_features.add('large_allocation_memory_pool')
|
||||||
|
|
||||||
# Setup environment to find dynamic library at runtime
|
# Setup environment to find dynamic library at runtime
|
||||||
if config.operating_system == 'Windows':
|
if config.operating_system == 'Windows':
|
||||||
|
@ -10,6 +10,9 @@
|
|||||||
// UNSUPPORTED: s390x-ibm-linux-gnu
|
// UNSUPPORTED: s390x-ibm-linux-gnu
|
||||||
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
|
// UNSUPPORTED: s390x-ibm-linux-gnu-LTO
|
||||||
|
|
||||||
|
// If offload memory pooling is enabled for a large allocation, reuse error is
|
||||||
|
// not detected. UNSUPPORTED: large_allocation_memory_pool
|
||||||
|
|
||||||
#include <omp.h>
|
#include <omp.h>
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
|
37
offload/test/sanitizer/use_after_free_3.c
Normal file
37
offload/test/sanitizer/use_after_free_3.c
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
// clang-format off
|
||||||
|
// RUN: %libomptarget-compileopt-generic
|
||||||
|
// RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=1024 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
|
||||||
|
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK-PASS
|
||||||
|
// clang-format on
|
||||||
|
|
||||||
|
// If offload memory pooling is enabled for a large allocation, reuse error is
|
||||||
|
// not detected. Run the test w/ and w/o ENV var override on memory pooling
|
||||||
|
// threshold. REQUIRES: large_allocation_memory_pool
|
||||||
|
|
||||||
|
#include <omp.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
int main() {
|
||||||
|
int N = (1 << 30);
|
||||||
|
char *A = (char *)malloc(N);
|
||||||
|
char *P;
|
||||||
|
#pragma omp target map(A[ : N]) map(from : P)
|
||||||
|
{
|
||||||
|
P = &A[N / 2];
|
||||||
|
*P = 3;
|
||||||
|
}
|
||||||
|
// clang-format off
|
||||||
|
// CHECK: OFFLOAD ERROR: memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
|
||||||
|
// CHECK: Device pointer [[PTR]] points into prior host-issued allocation:
|
||||||
|
// CHECK: Last deallocation:
|
||||||
|
// CHECK: Last allocation of size 1073741824
|
||||||
|
// clang-format on
|
||||||
|
#pragma omp target
|
||||||
|
{
|
||||||
|
*P = 5;
|
||||||
|
}
|
||||||
|
|
||||||
|
// CHECK-PASS: PASS
|
||||||
|
printf("PASS\n");
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user