[Offload] Add olCalculateOptimalOccupancy (#142950)
This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently only implemented on CUDA; AMDGPU and Host return unsupported. --------- Co-authored-by: Callum Fare <callum@codeplay.com>
This commit is contained in:
parent
2c4f0e7ac6
commit
2c11a83691
@ -6,7 +6,7 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains Offload API definitions related to launching kernels
|
||||
// This file contains Offload API definitions related to kernels
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
@ -42,3 +42,21 @@ def : Function {
|
||||
Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
|
||||
];
|
||||
}
|
||||
|
||||
def : Function {
  let name = "olCalculateOptimalOccupancy";
  let desc = "Given dynamic memory size, query the device for a workgroup size that will result in optimal occupancy.";
  let details = [
    "For most devices, this will be the largest workgroup size that will result in all work items fitting on the device at once.",
  ];
  let params = [
    Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
    Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
    // NOTE(review): the CUDA backend forwards this value unchanged to
    // cuOccupancyMaxPotentialBlockSize, whose dynamicSMemSize argument is the
    // dynamic shared memory required per work GROUP (block), not per work
    // item. The description is updated to match the implementation -- confirm
    // against the intended API contract.
    Param<"size_t", "SharedMemory", "dynamic shared memory required per work group in bytes", PARAM_IN>,
    Param<"size_t*", "GroupSize", "optimal block size", PARAM_OUT>
  ];
  let returns = [
    Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
    Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>,
  ];
}
|
||||
|
@ -781,6 +781,24 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
|
||||
return olDestroy(Program);
|
||||
}
|
||||
|
||||
// Query the occupancy-optimal workgroup size for a kernel symbol by
// delegating to the owning plugin's maxGroupSize implementation.
Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device,
                                       ol_symbol_handle_t Kernel,
                                       size_t DynamicMemSize,
                                       size_t *GroupSize) {
  // Only kernel symbols carry a launchable implementation to query.
  if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL)
    return createOffloadError(ErrorCode::SYMBOL_KIND,
                              "provided symbol is not a kernel");

  auto *KernelObj = std::get<GenericKernelTy *>(Kernel->PluginImpl);
  auto MaxSize = KernelObj->maxGroupSize(*Device->Device, DynamicMemSize);
  if (!MaxSize)
    return MaxSize.takeError();

  *GroupSize = *MaxSize;
  return Error::success();
}
|
||||
|
||||
Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
|
||||
ol_symbol_handle_t Kernel, const void *ArgumentsData,
|
||||
size_t ArgumentsSize,
|
||||
|
@ -570,6 +570,16 @@ struct AMDGPUKernelTy : public GenericKernelTy {
|
||||
KernelLaunchParamsTy LaunchParams,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
|
||||
|
||||
  /// Return maximum block size for maximum occupancy
  ///
  /// \param GenericDevice device the kernel would be launched on (unused).
  /// \param DynamicMemSize dynamic shared memory the launch would request, in
  ///        bytes (unused).
  /// \return always an UNSUPPORTED error; there is no AMDGPU implementation
  ///         yet, and callers surface this as OL_ERRC_UNSUPPORTED.
  ///
  /// TODO: This needs to be implemented for amdgpu
  Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
                                  uint64_t DynamicMemSize) const override {
    return Plugin::error(
        ErrorCode::UNSUPPORTED,
        "occupancy calculations for AMDGPU are not yet implemented");
  }
|
||||
|
||||
/// Print more elaborate kernel launch info for AMDGPU
|
||||
Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
|
||||
KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
|
||||
|
@ -388,6 +388,9 @@ struct GenericKernelTy {
|
||||
KernelLaunchParamsTy LaunchParams,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
|
||||
|
||||
  /// Return the workgroup (block) size that yields optimal occupancy for this
  /// kernel on \p GenericDevice, given \p DynamicMemSize bytes of dynamic
  /// shared memory, or an error (e.g. UNSUPPORTED) if the plugin cannot
  /// provide this information.
  virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
                                          uint64_t DynamicMemSize) const = 0;
|
||||
|
||||
/// Get the kernel name.
|
||||
const char *getName() const { return Name.c_str(); }
|
||||
|
||||
|
@ -72,6 +72,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
|
||||
DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
|
||||
DLWRAP(cuDevicePrimaryCtxRetain, 2)
|
||||
DLWRAP(cuModuleLoadDataEx, 5)
|
||||
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
|
||||
|
||||
DLWRAP(cuDeviceCanAccessPeer, 3)
|
||||
DLWRAP(cuCtxEnablePeerAccess, 2)
|
||||
|
@ -290,6 +290,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
|
||||
static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
|
||||
|
||||
typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
|
||||
typedef size_t (*CUoccupancyB2DSize)(int);
|
||||
|
||||
CUresult cuCtxGetDevice(CUdevice *);
|
||||
CUresult cuDeviceGet(CUdevice *, int);
|
||||
@ -372,5 +373,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
|
||||
CUresult cuMemGetAllocationGranularity(size_t *granularity,
|
||||
const CUmemAllocationProp *prop,
|
||||
CUmemAllocationGranularity_flags option);
|
||||
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
|
||||
CUoccupancyB2DSize, size_t, int);
|
||||
|
||||
#endif
|
||||
|
@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
|
||||
KernelLaunchParamsTy LaunchParams,
|
||||
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
|
||||
|
||||
/// Return maximum block size for maximum occupancy
|
||||
Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
|
||||
uint64_t DynamicMemSize) const override {
|
||||
int minGridSize;
|
||||
int maxBlockSize;
|
||||
auto Res = cuOccupancyMaxPotentialBlockSize(
|
||||
&minGridSize, &maxBlockSize, Func, NULL, DynamicMemSize, INT_MAX);
|
||||
if (auto Err = Plugin::check(
|
||||
Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) {
|
||||
return Err;
|
||||
}
|
||||
return maxBlockSize;
|
||||
}
|
||||
|
||||
private:
|
||||
/// The CUDA kernel function to execute.
|
||||
CUfunction Func;
|
||||
|
@ -114,6 +114,14 @@ struct GenELF64KernelTy : public GenericKernelTy {
|
||||
return Plugin::success();
|
||||
}
|
||||
|
||||
  /// Return maximum block size for maximum occupancy
  ///
  /// Occupancy is a GPU concept with no meaningful analogue for the host
  /// (GenELF64) device, so this always reports lack of support; callers
  /// surface it as OL_ERRC_UNSUPPORTED.
  Expected<uint64_t> maxGroupSize(GenericDeviceTy &Device,
                                  uint64_t DynamicMemSize) const override {
    return Plugin::error(
        ErrorCode::UNSUPPORTED,
        "occupancy calculations are not implemented for the host device");
  }
|
||||
|
||||
private:
|
||||
/// The kernel function to execute.
|
||||
void (*Func)(void);
|
||||
|
@ -20,6 +20,7 @@ add_offload_unittest("init"
|
||||
target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
|
||||
|
||||
# Kernel-related unit tests: occupancy query and kernel launch.
add_offload_unittest("kernel"
  kernel/olCalculateOptimalOccupancy.cpp
  kernel/olLaunchKernel.cpp)
|
||||
|
||||
add_offload_unittest("memory"
|
||||
|
@ -26,6 +26,20 @@
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
// Assert that ACTUAL succeeds, but SKIP the current test (rather than fail)
// when it reports OL_ERRC_UNSUPPORTED. Intended for optional features, such
// as occupancy queries, that some backends legitimately do not implement.
// A null ol_result_t denotes success; any other non-UNSUPPORTED code fails
// the test with the returned code and details.
#ifndef ASSERT_SUCCESS_OR_UNSUPPORTED
#define ASSERT_SUCCESS_OR_UNSUPPORTED(ACTUAL)                                  \
  do {                                                                         \
    ol_result_t Res = ACTUAL;                                                  \
    if (Res && Res->Code == OL_ERRC_UNSUPPORTED) {                             \
      GTEST_SKIP() << #ACTUAL " returned unsupported; skipping test";          \
      return;                                                                  \
    } else if (Res && Res->Code != OL_ERRC_SUCCESS) {                          \
      GTEST_FAIL() << #ACTUAL " returned " << Res->Code << ": "                \
                   << Res->Details;                                            \
    }                                                                          \
  } while (0)
#endif
|
||||
|
||||
// TODO: rework this so the EXPECTED/ACTUAL results are readable
|
||||
#ifndef ASSERT_ERROR
|
||||
#define ASSERT_ERROR(EXPECTED, ACTUAL) \
|
||||
|
@ -0,0 +1,45 @@
|
||||
//===------- Offload API tests - olCalculateOptimalOccupancy --------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "../common/Fixtures.hpp"
|
||||
#include <OffloadAPI.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
// Parameterized fixture (device + prebuilt kernel), instantiated once per
// available offload device.
using olCalculateOptimalOccupancyTest = OffloadKernelTest;
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olCalculateOptimalOccupancyTest);
|
||||
|
||||
// With zero dynamic shared memory, a supporting backend must report a
// non-zero optimal group size; unsupporting backends skip via the macro.
TEST_P(olCalculateOptimalOccupancyTest, Success) {
  size_t Size{0};
  ASSERT_SUCCESS_OR_UNSUPPORTED(
      olCalculateOptimalOccupancy(Device, Kernel, 0, &Size));
  ASSERT_GT(Size, 0u);
}
|
||||
|
||||
// Same as Success, but with a non-trivial dynamic shared memory request
// (1 KiB) to exercise the memory-dependent path of the occupancy query.
TEST_P(olCalculateOptimalOccupancyTest, SuccessMem) {
  size_t Size{0};
  ASSERT_SUCCESS_OR_UNSUPPORTED(
      olCalculateOptimalOccupancy(Device, Kernel, 1024, &Size));
  ASSERT_GT(Size, 0u);
}
|
||||
|
||||
// A null kernel handle must be rejected with OL_ERRC_INVALID_NULL_HANDLE.
TEST_P(olCalculateOptimalOccupancyTest, NullKernel) {
  size_t Size;
  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
               olCalculateOptimalOccupancy(Device, nullptr, 0, &Size));
}
|
||||
|
||||
// A null device handle must be rejected with OL_ERRC_INVALID_NULL_HANDLE.
TEST_P(olCalculateOptimalOccupancyTest, NullDevice) {
  size_t Size;
  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
               olCalculateOptimalOccupancy(nullptr, Kernel, 0, &Size));
}
|
||||
|
||||
// A null output pointer must be rejected with OL_ERRC_INVALID_NULL_POINTER.
TEST_P(olCalculateOptimalOccupancyTest, NullOutput) {
  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
               olCalculateOptimalOccupancy(Device, Kernel, 0, nullptr));
}
|
Loading…
x
Reference in New Issue
Block a user