[Offload] Add olCalculateOptimalOccupancy (#142950)

This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently
only implemented for CUDA; the AMDGPU and host backends return
`OL_ERRC_UNSUPPORTED`.

---------

Co-authored-by: Callum Fare <callum@codeplay.com>
This commit is contained in:
Ross Brunton 2025-08-19 15:16:47 +01:00 committed by GitHub
parent 2c4f0e7ac6
commit 2c11a83691
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 136 additions and 1 deletion

View File

@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
// This file contains Offload API definitions related to launching kernels
// This file contains Offload API definitions related to kernels
//
//===----------------------------------------------------------------------===//
@ -42,3 +42,21 @@ def : Function {
Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
];
}
// Offload API entry point that asks a device for the workgroup size giving
// the best occupancy for a kernel (CUDA backend maps this onto
// cuOccupancyMaxPotentialBlockSize; other backends may return
// OL_ERRC_UNSUPPORTED).
def : Function {
let name = "olCalculateOptimalOccupancy";
let desc = "Given dynamic memory size, query the device for a workgroup size that will result in optimal occupancy.";
let details = [
"For most devices, this will be the largest workgroup size that will result in all work items fitting on the device at once.",
];
let params = [
Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
// NOTE(review): documented as per work item, but the CUDA implementation
// forwards this directly as cuOccupancyMaxPotentialBlockSize's
// dynamicSMemSize, which is per block — confirm intended semantics.
Param<"size_t", "SharedMemory", "dynamic shared memory required per work item in bytes", PARAM_IN>,
Param<"size_t*", "GroupSize", "optimal block size", PARAM_OUT>
];
let returns = [
Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>,
];
}

View File

@ -781,6 +781,24 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
return olDestroy(Program);
}
/// Implementation of olCalculateOptimalOccupancy: validate that the symbol is
/// a kernel, then delegate to the plugin's maxGroupSize query and write the
/// result through \p GroupSize.
Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device,
                                       ol_symbol_handle_t Kernel,
                                       size_t DynamicMemSize,
                                       size_t *GroupSize) {
  // Only kernel symbols carry a GenericKernelTy in their PluginImpl variant.
  if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL)
    return createOffloadError(ErrorCode::SYMBOL_KIND,
                              "provided symbol is not a kernel");

  auto *Impl = std::get<GenericKernelTy *>(Kernel->PluginImpl);
  auto MaxSize = Impl->maxGroupSize(*Device->Device, DynamicMemSize);
  if (!MaxSize)
    return MaxSize.takeError();

  *GroupSize = *MaxSize;
  return Error::success();
}
Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_symbol_handle_t Kernel, const void *ArgumentsData,
size_t ArgumentsSize,

View File

@ -570,6 +570,16 @@ struct AMDGPUKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Return maximum block size for maximum occupancy.
///
/// TODO: This needs to be implemented for amdgpu; until then every query
/// reports OL_ERRC_UNSUPPORTED. Parameters are unnamed because they are
/// unused by this stub.
Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
                                uint64_t) const override {
  return Plugin::error(
      ErrorCode::UNSUPPORTED,
      "occupancy calculations for AMDGPU are not yet implemented");
}
/// Print more elaborate kernel launch info for AMDGPU
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],

View File

@ -388,6 +388,9 @@ struct GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Query the workgroup size that gives optimal occupancy for this kernel,
/// given \p DynamicMemSize bytes of dynamic memory. Plugin-specific; a
/// plugin may return an OL_ERRC_UNSUPPORTED error instead of a value.
virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
uint64_t DynamicMemSize) const = 0;
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }

View File

@ -72,6 +72,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
DLWRAP(cuDeviceCanAccessPeer, 3)
DLWRAP(cuCtxEnablePeerAccess, 2)

View File

@ -290,6 +290,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
@ -372,5 +373,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
CUresult cuMemGetAllocationGranularity(size_t *granularity,
const CUmemAllocationProp *prop,
CUmemAllocationGranularity_flags option);
// Occupancy query: fills (min grid size, max block size) for a kernel given
// a per-block dynamic shared memory size (or a block-size-to-size callback)
// and an upper block-size limit. See the CUDA Driver API.
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
#endif

View File

@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Return maximum block size for maximum occupancy.
///
/// Wraps cuOccupancyMaxPotentialBlockSize: \p DynamicMemSize is forwarded as
/// the constant dynamic shared-memory size (no per-block-size callback) and
/// the block size is left unrestricted (INT_MAX limit).
Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
                                uint64_t DynamicMemSize) const override {
  // Initialize the out-parameters so a partial driver failure can never
  // leave them holding garbage.
  int MinGridSize = 0;
  int MaxBlockSize = 0;
  CUresult Res = cuOccupancyMaxPotentialBlockSize(
      &MinGridSize, &MaxBlockSize, Func, nullptr, DynamicMemSize, INT_MAX);
  if (auto Err =
          Plugin::check(Res, "error in cuOccupancyMaxPotentialBlockSize: %s"))
    return Err;
  return MaxBlockSize;
}
private:
/// The CUDA kernel function to execute.
CUfunction Func;

View File

@ -114,6 +114,14 @@ struct GenELF64KernelTy : public GenericKernelTy {
return Plugin::success();
}
/// Return maximum block size for maximum occupancy.
///
/// Occupancy queries are not implemented for the host plugin; every call
/// reports OL_ERRC_UNSUPPORTED. Parameters are unnamed as this stub does
/// not use them.
Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
                                uint64_t) const override {
  return Plugin::error(
      ErrorCode::UNSUPPORTED,
      "occupancy calculations are not implemented for the host device");
}
private:
/// The kernel function to execute.
void (*Func)(void);

View File

@ -20,6 +20,7 @@ add_offload_unittest("init"
target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
add_offload_unittest("kernel"
kernel/olCalculateOptimalOccupancy.cpp
kernel/olLaunchKernel.cpp)
add_offload_unittest("memory"

View File

@ -26,6 +26,20 @@
} while (0)
#endif
#ifndef ASSERT_SUCCESS_OR_UNSUPPORTED
// Assert that ACTUAL succeeded, but treat OL_ERRC_UNSUPPORTED as "skip this
// test" instead of a failure, so backends that don't implement an entry point
// don't fail the suite. ACTUAL is evaluated exactly once. GTEST_SKIP() and
// GTEST_FAIL() both return from the enclosing test body; the extra `return;`
// after GTEST_SKIP() is therefore unreachable but harmless.
#define ASSERT_SUCCESS_OR_UNSUPPORTED(ACTUAL) \
do { \
ol_result_t Res = ACTUAL; \
if (Res && Res->Code == OL_ERRC_UNSUPPORTED) { \
GTEST_SKIP() << #ACTUAL " returned unsupported; skipping test"; \
return; \
} else if (Res && Res->Code != OL_ERRC_SUCCESS) { \
GTEST_FAIL() << #ACTUAL " returned " << Res->Code << ": " \
<< Res->Details; \
} \
} while (0)
#endif
// TODO: rework this so the EXPECTED/ACTUAL results are readable
#ifndef ASSERT_ERROR
#define ASSERT_ERROR(EXPECTED, ACTUAL) \

View File

@ -0,0 +1,45 @@
//===------- Offload API tests - olCalculateOptimalOccupancy --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../common/Fixtures.hpp"
#include <OffloadAPI.h>
#include <gtest/gtest.h>
using olCalculateOptimalOccupancyTest = OffloadKernelTest;
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olCalculateOptimalOccupancyTest);
// Happy path with no dynamic shared memory: the call either succeeds with a
// positive group size or reports unsupported (and the test is skipped).
TEST_P(olCalculateOptimalOccupancyTest, Success) {
size_t Size{0};
ASSERT_SUCCESS_OR_UNSUPPORTED(
olCalculateOptimalOccupancy(Device, Kernel, 0, &Size));
ASSERT_GT(Size, 0u);
}
// Same as Success but with a non-zero dynamic shared memory request; the
// returned group size must still be positive.
TEST_P(olCalculateOptimalOccupancyTest, SuccessMem) {
size_t Size{0};
ASSERT_SUCCESS_OR_UNSUPPORTED(
olCalculateOptimalOccupancy(Device, Kernel, 1024, &Size));
ASSERT_GT(Size, 0u);
}
// A null kernel handle must be rejected with OL_ERRC_INVALID_NULL_HANDLE.
TEST_P(olCalculateOptimalOccupancyTest, NullKernel) {
// Zero-initialize so the output is never read as indeterminate memory.
size_t Size = 0;
ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
olCalculateOptimalOccupancy(Device, nullptr, 0, &Size));
}
// A null device handle must be rejected with OL_ERRC_INVALID_NULL_HANDLE.
TEST_P(olCalculateOptimalOccupancyTest, NullDevice) {
// Zero-initialize so the output is never read as indeterminate memory.
size_t Size = 0;
ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
olCalculateOptimalOccupancy(nullptr, Kernel, 0, &Size));
}
// A null output pointer must be rejected with OL_ERRC_INVALID_NULL_POINTER.
TEST_P(olCalculateOptimalOccupancyTest, NullOutput) {
ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
olCalculateOptimalOccupancy(Device, Kernel, 0, nullptr));
}