diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 502fb36467db..f9f5ef74b560 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains Offload API definitions related to launching kernels
+// This file contains Offload API definitions related to kernels
 //
 //===----------------------------------------------------------------------===//
 
@@ -42,3 +42,21 @@ def : Function {
     Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
   ];
 }
+
+def : Function {
+  let name = "olCalculateOptimalOccupancy";
+  let desc = "Given dynamic memory size, query the device for a workgroup size that will result in optimal occupancy.";
+  let details = [
+    "For most devices, this will be the largest workgroup size that will result in all work items fitting on the device at once.",
+  ];
+  let params = [
+    Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
+    Param<"ol_symbol_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+    Param<"size_t", "SharedMemory", "dynamic shared memory required per work item in bytes", PARAM_IN>,
+    Param<"size_t*", "GroupSize", "optimal block size", PARAM_OUT>
+  ];
+  let returns = [
+    Return<"OL_ERRC_SYMBOL_KIND", ["The provided symbol is not a kernel"]>,
+    Return<"OL_ERRC_UNSUPPORTED", ["The backend cannot provide this information"]>,
+  ];
+}
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index cfb3342016f4..54c6d452cbd9 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -781,6 +781,24 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
   return olDestroy(Program);
 }
 
+Error olCalculateOptimalOccupancy_impl(ol_device_handle_t Device,
+                                       ol_symbol_handle_t Kernel,
+                                       size_t DynamicMemSize,
+                                       size_t *GroupSize) {
+  if (Kernel->Kind != OL_SYMBOL_KIND_KERNEL)
+    return createOffloadError(ErrorCode::SYMBOL_KIND,
+                              "provided symbol is not a kernel");
+  auto *KernelImpl = std::get<GenericKernelTy *>(Kernel->PluginImpl);
+
+  auto Res = KernelImpl->maxGroupSize(*Device->Device, DynamicMemSize);
+  if (auto Err = Res.takeError())
+    return Err;
+
+  *GroupSize = *Res;
+
+  return Error::success();
+}
+
 Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
                           ol_symbol_handle_t Kernel, const void *ArgumentsData,
                           size_t ArgumentsSize,
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 3143fe408563..b07086d2c7ab 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -570,6 +570,16 @@ struct AMDGPUKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return maximum block size for maximum occupancy
+  ///
+  /// TODO: This needs to be implemented for amdgpu
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                  uint64_t DynamicMemSize) const override {
+    return Plugin::error(
+        ErrorCode::UNSUPPORTED,
+        "occupancy calculations for AMDGPU are not yet implemented");
+  }
+
   /// Print more elaborate kernel launch info for AMDGPU
   Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                               KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index a448721755a6..8637e4c1f12c 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -388,6 +388,9 @@ struct GenericKernelTy {
                            KernelLaunchParamsTy LaunchParams,
                            AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
 
+  virtual Expected<uint64_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                          uint64_t DynamicMemSize) const = 0;
+
   /// Get the kernel name.
   const char *getName() const { return Name.c_str(); }
 
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 361a781e8f9b..c003d0b2f945 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -72,6 +72,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
 DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
 DLWRAP(cuDevicePrimaryCtxRetain, 2)
 DLWRAP(cuModuleLoadDataEx, 5)
+DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
 
 DLWRAP(cuDeviceCanAccessPeer, 3)
 DLWRAP(cuCtxEnablePeerAccess, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index b6c022c8e7e8..5f1c44364c14 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -290,6 +290,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
 static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
 
 typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+typedef size_t (*CUoccupancyB2DSize)(int);
 
 CUresult cuCtxGetDevice(CUdevice *);
 CUresult cuDeviceGet(CUdevice *, int);
@@ -372,5 +373,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
 CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                        const CUmemAllocationProp *prop,
                                        CUmemAllocationGranularity_flags option);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+                                          CUoccupancyB2DSize, size_t, int);
 
 #endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 0a3720532e4e..6aba6f8f3e6a 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return maximum block size for maximum occupancy
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &,
+                                  uint64_t DynamicMemSize) const override {
+    int minGridSize;
+    int maxBlockSize;
+    auto Res = cuOccupancyMaxPotentialBlockSize(
+        &minGridSize, &maxBlockSize, Func, NULL, DynamicMemSize, INT_MAX);
+    if (auto Err = Plugin::check(
+            Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) {
+      return Err;
+    }
+    return maxBlockSize;
+  }
+
 private:
   /// The CUDA kernel function to execute.
   CUfunction Func;
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 25443fd1ac0b..0a2bb5ae0c77 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -114,6 +114,14 @@ struct GenELF64KernelTy : public GenericKernelTy {
     return Plugin::success();
   }
 
+  /// Return maximum block size for maximum occupancy
+  Expected<uint64_t> maxGroupSize(GenericDeviceTy &Device,
+                                  uint64_t DynamicMemSize) const override {
+    return Plugin::error(
+        ErrorCode::UNSUPPORTED,
+        "occupancy calculations are not implemented for the host device");
+  }
+
 private:
   /// The kernel function to execute.
   void (*Func)(void);
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index b25db7022e9d..24801a0e7d73 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -20,6 +20,7 @@ add_offload_unittest("init"
 target_compile_definitions("init.unittests" PRIVATE DISABLE_WRAPPER)
 
 add_offload_unittest("kernel"
+  kernel/olCalculateOptimalOccupancy.cpp
   kernel/olLaunchKernel.cpp)
 
 add_offload_unittest("memory"
diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp
index 43240fa3c4a0..fe7198a9c283 100644
--- a/offload/unittests/OffloadAPI/common/Fixtures.hpp
+++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp
@@ -26,6 +26,20 @@
   } while (0)
 #endif
 
+#ifndef ASSERT_SUCCESS_OR_UNSUPPORTED
+#define ASSERT_SUCCESS_OR_UNSUPPORTED(ACTUAL)                                  \
+  do {                                                                         \
+    ol_result_t Res = ACTUAL;                                                  \
+    if (Res && Res->Code == OL_ERRC_UNSUPPORTED) {                             \
+      GTEST_SKIP() << #ACTUAL " returned unsupported; skipping test";          \
+      return;                                                                  \
+    } else if (Res && Res->Code != OL_ERRC_SUCCESS) {                          \
+      GTEST_FAIL() << #ACTUAL " returned " << Res->Code << ": "                \
+                   << Res->Details;                                            \
+    }                                                                          \
+  } while (0)
+#endif
+
 // TODO: rework this so the EXPECTED/ACTUAL results are readable
 #ifndef ASSERT_ERROR
 #define ASSERT_ERROR(EXPECTED, ACTUAL)                                         \
diff --git a/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp b/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp
new file mode 100644
index 000000000000..17fa383cac3f
--- /dev/null
+++ b/offload/unittests/OffloadAPI/kernel/olCalculateOptimalOccupancy.cpp
@@ -0,0 +1,45 @@
+//===------- Offload API tests - olCalculateOptimalOccupancy --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olCalculateOptimalOccupancyTest = OffloadKernelTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olCalculateOptimalOccupancyTest);
+
+TEST_P(olCalculateOptimalOccupancyTest, Success) {
+  size_t Size{0};
+  ASSERT_SUCCESS_OR_UNSUPPORTED(
+      olCalculateOptimalOccupancy(Device, Kernel, 0, &Size));
+  ASSERT_GT(Size, 0u);
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, SuccessMem) {
+  size_t Size{0};
+  ASSERT_SUCCESS_OR_UNSUPPORTED(
+      olCalculateOptimalOccupancy(Device, Kernel, 1024, &Size));
+  ASSERT_GT(Size, 0u);
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, NullKernel) {
+  size_t Size;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olCalculateOptimalOccupancy(Device, nullptr, 0, &Size));
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, NullDevice) {
+  size_t Size;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olCalculateOptimalOccupancy(nullptr, Kernel, 0, &Size));
+}
+
+TEST_P(olCalculateOptimalOccupancyTest, NullOutput) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olCalculateOptimalOccupancy(Device, Kernel, 0, nullptr));
+}
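
Usage note: a minimal sketch of how a client might call the new entry point. Only olCalculateOptimalOccupancy, OL_ERRC_UNSUPPORTED, and the "null result means success" convention used by ASSERT_SUCCESS_OR_UNSUPPORTED above come from this patch; the <OffloadAPI.h> include, the pickGroupSize helper name, and the assumption that Device and Kernel are valid handles from the usual liboffload setup are illustrative only.

#include <OffloadAPI.h> // Assumed public Offload API header, as in the unit tests.

#include <cstddef>

// Hypothetical helper: query the optimal workgroup size for Kernel on Device,
// falling back to a caller-supplied default when the backend cannot provide
// occupancy information (currently the AMDGPU and host plugins).
static size_t pickGroupSize(ol_device_handle_t Device,
                            ol_symbol_handle_t Kernel,
                            size_t DynamicSharedMemPerWorkItem,
                            size_t FallbackGroupSize) {
  size_t GroupSize = 0;
  ol_result_t Res = olCalculateOptimalOccupancy(
      Device, Kernel, DynamicSharedMemPerWorkItem, &GroupSize);
  if (!Res) // A null result means success.
    return GroupSize;
  if (Res->Code == OL_ERRC_UNSUPPORTED)
    return FallbackGroupSize; // Backend cannot report occupancy yet.
  return FallbackGroupSize;   // Any other error: also fall back (or surface it).
}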