[AMDGPU][Offload] Enable memory manager use for up to ~3GB allocation size in omp_target_alloc (#151882)

Enables AMD data center class GPUs to use memory manager memory pooling up to 3GB allocation by default, up from the "1 << 13" threshold that all plugin-nextgen devices use.
2025-08-06 14:41:20 -07:00 · 2025-08-06 14:41:20 -07:00 · 83e5a99ff6
commit 83e5a99ff6
parent c3103068b7
6 changed files with 89 additions and 4 deletions
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@ -2945,6 +2945,40 @@ private:
    return Plugin::success();
  }
  bool checkIfCoarseGrainMemoryNearOrAbove64GB() {
    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
      if (!Pool->isGlobal() || !Pool->isCoarseGrained())
        continue;
      uint64_t Value;
      hsa_status_t Status =
          Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
      if (Status != HSA_STATUS_SUCCESS)
        continue;
      constexpr uint64_t Almost64Gig = 0xFF0000000;
      if (Value >= Almost64Gig)
        return true;
    }
    return false; // CoarseGrain pool w/ 64GB or more capacity not found
  }
  size_t getMemoryManagerSizeThreshold() override {
    // Targeting high memory capacity GPUs such as
    // data center GPUs.
    if (checkIfCoarseGrainMemoryNearOrAbove64GB()) {
      // Set GenericDeviceTy::MemoryManager's Threshold to 3GiB,
      // if threshold is not already set by ENV var
      // LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD.
      // This MemoryManager is used for omp_target_alloc(), OpenMP
      // (non-usm) map clause, etc.
      //
      // Ideally, this kind of pooling is best performed at
      // a common level (e.g, user side of HSA) between OpenMP and HIP
      // but that feature does not exist (yet).
      return 3ul * 1024 * 1024 * 1024 /* 3 GiB */;
    }
    return 0;
  }
  /// Envar for controlling the number of HSA queues per device. High number of
  /// queues may degrade performance.
  UInt32Envar OMPX_NumQueues;
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@ -1139,6 +1139,9 @@ private:
  /// Pointer to the memory manager or nullptr if not available.
  MemoryManagerTy *MemoryManager;
  /// Per device setting of MemoryManager's Threshold
  virtual size_t getMemoryManagerSizeThreshold() { return 0; }
  /// Environment variables defined by the OpenMP standard.
  Int32Envar OMP_TeamLimit;
  Int32Envar OMP_NumTeams;
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@ -815,8 +815,11 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
  // Enable the memory manager if required.
  auto [ThresholdMM, EnableMM] = MemoryManagerTy::getSizeThresholdFromEnv();
-  if (EnableMM)
+  if (EnableMM) {
    if (ThresholdMM == 0)
      ThresholdMM = getMemoryManagerSizeThreshold();
    MemoryManager = new MemoryManagerTy(*this, ThresholdMM);
  }
  return Plugin::success();
 }
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@ -121,6 +121,7 @@ if config.libomptarget_test_pgo:
 # For all other targets, we currently assume it is.
 supports_unified_shared_memory = True
 supports_apu = False
 supports_large_allocation_memory_pool = False
 if config.libomptarget_current_target.startswith('nvptx'):
  try:
    cuda_arch = int(config.cuda_test_arch[:3])
@ -132,9 +133,11 @@ if config.libomptarget_current_target.startswith('nvptx'):
 elif config.libomptarget_current_target.startswith('amdgcn'):
    # amdgpu_test_arch contains a list of AMD GPUs in the system
    # only check the first one assuming that we will run the test on it.
-    if not (config.amdgpu_test_arch.startswith("gfx90a") or
+    if (config.amdgpu_test_arch.startswith("gfx90a") or
-            config.amdgpu_test_arch.startswith("gfx942") or
+        config.amdgpu_test_arch.startswith("gfx942") or
-            config.amdgpu_test_arch.startswith("gfx950")):
+        config.amdgpu_test_arch.startswith("gfx950")):
       supports_large_allocation_memory_pool = True
    else:
       supports_unified_shared_memory = False
    # check if AMD architecture is an APU:
    if ((config.amdgpu_test_arch.startswith("gfx942") and
@ -144,6 +147,8 @@ if supports_unified_shared_memory:
   config.available_features.add('unified_shared_memory')
 if supports_apu:
   config.available_features.add('apu')
 if supports_large_allocation_memory_pool:
   config.available_features.add('large_allocation_memory_pool')
 # Setup environment to find dynamic library at runtime
 if config.operating_system == 'Windows':
--- a/offload/test/sanitizer/use_after_free_2.c
+++ b/offload/test/sanitizer/use_after_free_2.c
@ -10,6 +10,9 @@
 // UNSUPPORTED: s390x-ibm-linux-gnu
 // UNSUPPORTED: s390x-ibm-linux-gnu-LTO
 // If offload memory pooling is enabled for a large allocation, reuse error is
 // not detected. UNSUPPORTED: large_allocation_memory_pool
 #include <omp.h>
 int main() {
--- a/offload/test/sanitizer/use_after_free_3.c
+++ b/offload/test/sanitizer/use_after_free_3.c
@ -0,0 +1,37 @@
 // clang-format off
 // RUN: %libomptarget-compileopt-generic
 // RUN: %not --crash env -u LLVM_DISABLE_SYMBOLIZATION OFFLOAD_TRACK_ALLOCATION_TRACES=1 LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD=1024 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK
 // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefixes=CHECK-PASS
 // clang-format on
 // If offload memory pooling is enabled for a large allocation, reuse error is
 // not detected. Run the test w/ and w/o ENV var override on memory pooling
 // threshold. REQUIRES: large_allocation_memory_pool
 #include <omp.h>
 #include <stdio.h>
 int main() {
  int N = (1 << 30);
  char *A = (char *)malloc(N);
  char *P;
 #pragma omp target map(A[ : N]) map(from : P)
  {
    P = &A[N / 2];
    *P = 3;
  }
  // clang-format off
 // CHECK: OFFLOAD ERROR: memory access fault by GPU {{.*}} (agent 0x{{.*}}) at virtual address [[PTR:0x[0-9a-z]*]]. Reasons: {{.*}}
 // CHECK: Device pointer [[PTR]] points into prior host-issued allocation:
 // CHECK: Last deallocation:
 // CHECK: Last allocation of size 1073741824
 // clang-format on
 #pragma omp target
  {
    *P = 5;
  }
  // CHECK-PASS: PASS
  printf("PASS\n");
  return 0;
 }