[Offload] Add olCalculateOptimalOccupancy (#142950)

This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently
only implemented for CUDA; the AMDGPU and host backends return
`OL_ERRC_UNSUPPORTED`.

---------

Co-authored-by: Callum Fare <callum@codeplay.com>
This commit is contained in:
Ross Brunton 2025-08-19 15:16:47 +01:00 committed by GitHub
parent 2c4f0e7ac6
commit 2c11a83691
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 136 additions and 1 deletion

View File

@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
// This file contains Offload API definitions related to launching kernels
// This file contains Offload API definitions related to kernels
//
//===----------------------------------------------------------------------===//
@ -42,3 +42,21 @@ def : Function {
Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
];
}
// Offload API entry point that asks a device for the workgroup size giving
// the best occupancy for a kernel (CUDA backend maps this onto
// cuOccupancyMaxPotentialBlockSize; other backends may return
// OL_ERRC_UNSUPPORTED).
def : Function {
let name = "olCalculateOptimalOccupancy";
let desc = "Given dynamic memory size, query the device for a workgroup size that will result in optimal occupancy.";
let details = [
"For most devices, this will be the largest workgroup size that will result in all work items fitting on the device at once.",
];
let params = [
Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
// NOTE(review): documented as per work item, but the CUDA implementation
// forwards this directly as cuOccupancyMaxPotentialBlockSize's
// dynamicSMemSize, which is per block — confirm intended semantics.
Param<"size_t", "SharedMemory", "dynamic shared memory required per work item in bytes", PARAM_IN>,
Param<"size_t*", "GroupSize", "optimal block size", PARAM_OUT>
];
let returns = [
Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>,
];
}

View File

@ -781,6 +781,24 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
return olDestroy(Program);
}
/// Implementation of olCalculateOptimalOccupancy: validate that the symbol is
/// a kernel, then delegate to the plugin's maxGroupSize query and write the
/// result through \p GroupSize.
Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device,
                                       ol_symbol_handle_t Kernel,
                                       size_t DynamicMemSize,
                                       size_t *GroupSize) {
  // Only kernel symbols carry a GenericKernelTy in their PluginImpl variant.
  if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL)
    return createOffloadError(ErrorCode::SYMBOL_KIND,
                              "provided symbol is not a kernel");

  auto *Impl = std::get<GenericKernelTy *>(Kernel->PluginImpl);
  auto MaxSize = Impl->maxGroupSize(*Device->Device, DynamicMemSize);
  if (!MaxSize)
    return MaxSize.takeError();

  *GroupSize = *MaxSize;
  return Error::success();
}
Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
ol_symbol_handle_t Kernel, const void *ArgumentsData,
size_t ArgumentsSize,

View File

@ -570,6 +570,16 @@ struct AMDGPUKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Return maximum block size for maximum occupancy.
///
/// TODO: This needs to be implemented for amdgpu; until then every query
/// reports OL_ERRC_UNSUPPORTED. Parameters are unnamed because they are
/// unused by this stub.
Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
                                uint64_t) const override {
  return Plugin::error(
      ErrorCode::UNSUPPORTED,
      "occupancy calculations for AMDGPU are not yet implemented");
}
/// Print more elaborate kernel launch info for AMDGPU
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],

View File

@ -388,6 +388,9 @@ struct GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
/// Query the workgroup size that gives optimal occupancy for this kernel,
/// given \p DynamicMemSize bytes of dynamic memory. Plugin-specific; a
/// plugin may return an OL_ERRC_UNSUPPORTED error instead of a value.
virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
uint64_t DynamicMemSize) const = 0;
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }

View File

@ -72,6 +72,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
DLWRAP(cuDeviceCanAccessPeer, 3)
DLWRAP(cuCtxEnablePeerAccess, 2)

View File

@ -290,6 +290,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
typedef size_t (*CUoccupancyB2DSize)(int);
CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
@ -372,5 +373,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
CUresult cuMemGetAllocationGranularity(size_t *granularity,
const CUmemAllocationProp *prop,
CUmemAllocationGranularity_flags option);
// Occupancy query: fills (min grid size, max block size) for a kernel given
// a per-block dynamic shared memory size (or a block-size-to-size callback)
// and an upper block-size limit. See the CUDA Driver API.
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
#endif

View File

@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
/// Return maximum block size for maximum occupancy.
///
/// Wraps cuOccupancyMaxPotentialBlockSize: \p DynamicMemSize is forwarded as
/// the constant dynamic shared-memory size (no per-block-size callback) and
/// the block size is left unrestricted (INT_MAX limit).
Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
                                uint64_t DynamicMemSize) const override {
  // Initialize the out-parameters so a partial driver failure can never
  // leave them holding garbage.
  int MinGridSize = 0;
  int MaxBlockSize = 0;
  CUresult Res = cuOccupancyMaxPotentialBlockSize(
      &MinGridSize, &MaxBlockSize, Func, nullptr, DynamicMemSize, INT_MAX);
  if (auto Err =
          Plugin::check(Res, "error in cuOccupancyMaxPotentialBlockSize: %s"))
    return Err;
  return MaxBlockSize;
}
private:
/// The CUDA kernel function to execute.
CUfunction Func;

View File

@ -114,6 +114,14 @@ struct GenELF64KernelTy : public GenericKernelTy {
return Plugin::success();
}
/// Return maximum block size for maximum occupancy.
///
/// Occupancy queries are not implemented for the host plugin; every call
/// reports OL_ERRC_UNSUPPORTED. Parameters are unnamed as this stub does
/// not use them.
Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
                                uint64_t) const override {
  return Plugin::error(
      ErrorCode::UNSUPPORTED,
      "occupancy calculations are not implemented for the host device");
}
private:
/// The kernel function to execute.
void (*Func)(void);

View File

@ -20,6 +20,7 @@ add_offload_unittest("init"
target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
add_offload_unittest("kernel"
kernel/olCalculateOptimalOccupancy.cpp
kernel/olLaunchKernel.cpp)
add_offload_unittest("memory"

View File

@ -26,6 +26,20 @@
} while (0)
#endif
#ifndef ASSERT_SUCCESS_OR_UNSUPPORTED
// Assert that ACTUAL succeeded, but treat OL_ERRC_UNSUPPORTED as "skip this
// test" instead of a failure, so backends that don't implement an entry point
// don't fail the suite. ACTUAL is evaluated exactly once. GTEST_SKIP() and
// GTEST_FAIL() both return from the enclosing test body; the extra `return;`
// after GTEST_SKIP() is therefore unreachable but harmless.
#define ASSERT_SUCCESS_OR_UNSUPPORTED(ACTUAL) \
do { \
ol_result_t Res = ACTUAL; \
if (Res && Res->Code == OL_ERRC_UNSUPPORTED) { \
GTEST_SKIP() << #ACTUAL " returned unsupported; skipping test"; \
return; \
} else if (Res && Res->Code != OL_ERRC_SUCCESS) { \
GTEST_FAIL() << #ACTUAL " returned " << Res->Code << ": " \
<< Res->Details; \
} \
} while (0)
#endif
// TODO: rework this so the EXPECTED/ACTUAL results are readable
#ifndef ASSERT_ERROR
#define ASSERT_ERROR(EXPECTED, ACTUAL) \

View File

@ -0,0 +1,45 @@
//===------- Offload API tests - olCalculateOptimalOccupancy --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../common/Fixtures.hpp"
#include <OffloadAPI.h>
#include <gtest/gtest.h>
using olCalculateOptimalOccupancyTest = OffloadKernelTest;
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olCalculateOptimalOccupancyTest);
// Happy path with no dynamic shared memory: the call either succeeds with a
// positive group size or reports unsupported (and the test is skipped).
TEST_P(olCalculateOptimalOccupancyTest, Success) {
size_t Size{0};
ASSERT_SUCCESS_OR_UNSUPPORTED(
olCalculateOptimalOccupancy(Device, Kernel, 0, &Size));
ASSERT_GT(Size, 0u);
}
// Same as Success but with a non-zero dynamic shared memory request; the
// returned group size must still be positive.
TEST_P(olCalculateOptimalOccupancyTest, SuccessMem) {
size_t Size{0};
ASSERT_SUCCESS_OR_UNSUPPORTED(
olCalculateOptimalOccupancy(Device, Kernel, 1024, &Size));
ASSERT_GT(Size, 0u);
}
// A null kernel handle must be rejected with OL_ERRC_INVALID_NULL_HANDLE.
TEST_P(olCalculateOptimalOccupancyTest, NullKernel) {
// Zero-initialize so the output is never read as indeterminate memory.
size_t Size = 0;
ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
olCalculateOptimalOccupancy(Device, nullptr, 0, &Size));
}
// A null device handle must be rejected with OL_ERRC_INVALID_NULL_HANDLE.
TEST_P(olCalculateOptimalOccupancyTest, NullDevice) {
// Zero-initialize so the output is never read as indeterminate memory.
size_t Size = 0;
ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
olCalculateOptimalOccupancy(nullptr, Kernel, 0, &Size));
}
// A null output pointer must be rejected with OL_ERRC_INVALID_NULL_POINTER.
TEST_P(olCalculateOptimalOccupancyTest, NullOutput) {
ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
olCalculateOptimalOccupancy(Device, Kernel, 0, nullptr));
}