Previously, the user was not able to use more than 48 KB of shared memory on NVIDIA GPUs. In order to do so, setting the function attribute `CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES` is required, which was not present in the code base. With this commit, we add the ability to set this attribute, allowing the user to utilize the full power of their GPU. In order to not have to reset the function attribute for each launch of the same kernel, we keep track of the maximum memory limit (as the variable `MaxDynCGroupMemLimit`) and only set the attribute if our desired amount exceeds the limit. By default, this limit is set to 48 KB. Feedback is greatly appreciated, especially around setting the new variable as mutable. I did this because the `launchImpl` method is const and I am not able to modify my variable otherwise. --------- Co-authored-by: Giorgi Gvalia <ggvalia@login33.chn.perlmutter.nersc.gov> Co-authored-by: Giorgi Gvalia <ggvalia@login07.chn.perlmutter.nersc.gov>
174 lines
4.6 KiB
C++
174 lines
4.6 KiB
C++
//===--- cuda/dynamic_cuda/cuda.cpp ------------------------------- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// Implement subset of cuda api by calling into cuda library via dlopen
|
|
// Does the dlopen/dlsym calls as part of the call to cuInit
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "llvm/Support/DynamicLibrary.h"
|
|
|
|
#include "Shared/Debug.h"
|
|
|
|
#include "DLWrap.h"
|
|
#include "cuda.h"
|
|
|
|
#include <memory>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
|
|
DLWRAP_INITIALIZE()
|
|
|
|
DLWRAP_INTERNAL(cuInit, 1)
|
|
|
|
DLWRAP(cuCtxGetDevice, 1)
|
|
DLWRAP(cuDeviceGet, 2)
|
|
DLWRAP(cuDeviceGetAttribute, 3)
|
|
DLWRAP(cuDeviceGetCount, 1)
|
|
DLWRAP(cuFuncGetAttribute, 3)
|
|
DLWRAP(cuFuncSetAttribute, 3)
|
|
|
|
// Device info
|
|
DLWRAP(cuDeviceGetName, 3)
|
|
DLWRAP(cuDeviceTotalMem, 2)
|
|
DLWRAP(cuDriverGetVersion, 1)
|
|
|
|
DLWRAP(cuGetErrorString, 2)
|
|
DLWRAP(cuLaunchKernel, 11)
|
|
DLWRAP(cuLaunchHostFunc, 3)
|
|
|
|
DLWRAP(cuMemAlloc, 2)
|
|
DLWRAP(cuMemAllocHost, 2)
|
|
DLWRAP(cuMemAllocManaged, 3)
|
|
DLWRAP(cuMemAllocAsync, 3)
|
|
|
|
DLWRAP(cuMemcpyDtoDAsync, 4)
|
|
DLWRAP(cuMemcpyDtoH, 3)
|
|
DLWRAP(cuMemcpyDtoHAsync, 4)
|
|
DLWRAP(cuMemcpyHtoD, 3)
|
|
DLWRAP(cuMemcpyHtoDAsync, 4)
|
|
|
|
DLWRAP(cuMemFree, 1)
|
|
DLWRAP(cuMemFreeHost, 1)
|
|
DLWRAP(cuMemFreeAsync, 2)
|
|
|
|
DLWRAP(cuModuleGetFunction, 3)
|
|
DLWRAP(cuModuleGetGlobal, 4)
|
|
|
|
DLWRAP(cuModuleUnload, 1)
|
|
DLWRAP(cuStreamCreate, 2)
|
|
DLWRAP(cuStreamDestroy, 1)
|
|
DLWRAP(cuStreamSynchronize, 1)
|
|
DLWRAP(cuStreamQuery, 1)
|
|
DLWRAP(cuStreamAddCallback, 4)
|
|
DLWRAP(cuCtxSetCurrent, 1)
|
|
DLWRAP(cuDevicePrimaryCtxRelease, 1)
|
|
DLWRAP(cuDevicePrimaryCtxGetState, 3)
|
|
DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
|
|
DLWRAP(cuDevicePrimaryCtxRetain, 2)
|
|
DLWRAP(cuModuleLoadDataEx, 5)
|
|
|
|
DLWRAP(cuDeviceCanAccessPeer, 3)
|
|
DLWRAP(cuCtxEnablePeerAccess, 2)
|
|
DLWRAP(cuMemcpyPeerAsync, 6)
|
|
|
|
DLWRAP(cuCtxGetLimit, 2)
|
|
DLWRAP(cuCtxSetLimit, 2)
|
|
|
|
DLWRAP(cuEventCreate, 2)
|
|
DLWRAP(cuEventRecord, 2)
|
|
DLWRAP(cuStreamWaitEvent, 3)
|
|
DLWRAP(cuEventSynchronize, 1)
|
|
DLWRAP(cuEventDestroy, 1)
|
|
|
|
DLWRAP_FINALIZE()
|
|
|
|
DLWRAP(cuMemUnmap, 2)
|
|
DLWRAP(cuMemRelease, 1)
|
|
DLWRAP(cuMemAddressFree, 2)
|
|
DLWRAP(cuMemGetInfo, 2)
|
|
DLWRAP(cuMemAddressReserve, 5)
|
|
DLWRAP(cuMemMap, 5)
|
|
DLWRAP(cuMemCreate, 4)
|
|
DLWRAP(cuMemSetAccess, 4)
|
|
DLWRAP(cuMemGetAllocationGranularity, 3)
|
|
|
|
// File name (or path) of the CUDA driver library handed to the dynamic loader
// in checkForCUDA(). A definition supplied earlier (e.g. on the compiler
// command line) takes precedence over this fallback.
#if !defined(DYNAMIC_CUDA_PATH)
#define DYNAMIC_CUDA_PATH "libcuda.so"
#endif

// Fallback target name and message prefix. NOTE(review): these are presumably
// picked up by the DP() logging macro from Shared/Debug.h -- confirm there.
#if !defined(TARGET_NAME)
#define TARGET_NAME CUDA
#endif
#if !defined(DEBUG_PREFIX)
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
#endif
|
|
|
|
static bool checkForCUDA() {
|
|
// return true if dlopen succeeded and all functions found
|
|
|
|
// Prefer _v2 versions of functions if found in the library
|
|
std::unordered_map<std::string, const char *> TryFirst = {
|
|
{"cuMemAlloc", "cuMemAlloc_v2"},
|
|
{"cuMemFree", "cuMemFree_v2"},
|
|
{"cuMemcpyDtoH", "cuMemcpyDtoH_v2"},
|
|
{"cuMemcpyHtoD", "cuMemcpyHtoD_v2"},
|
|
{"cuStreamDestroy", "cuStreamDestroy_v2"},
|
|
{"cuModuleGetGlobal", "cuModuleGetGlobal_v2"},
|
|
{"cuMemcpyDtoHAsync", "cuMemcpyDtoHAsync_v2"},
|
|
{"cuMemcpyDtoDAsync", "cuMemcpyDtoDAsync_v2"},
|
|
{"cuMemcpyHtoDAsync", "cuMemcpyHtoDAsync_v2"},
|
|
{"cuDevicePrimaryCtxRelease", "cuDevicePrimaryCtxRelease_v2"},
|
|
{"cuDevicePrimaryCtxSetFlags", "cuDevicePrimaryCtxSetFlags_v2"},
|
|
};
|
|
|
|
const char *CudaLib = DYNAMIC_CUDA_PATH;
|
|
std::string ErrMsg;
|
|
auto DynlibHandle = std::make_unique<llvm::sys::DynamicLibrary>(
|
|
llvm::sys::DynamicLibrary::getPermanentLibrary(CudaLib, &ErrMsg));
|
|
if (!DynlibHandle->isValid()) {
|
|
DP("Unable to load library '%s': %s!\n", CudaLib, ErrMsg.c_str());
|
|
return false;
|
|
}
|
|
|
|
for (size_t I = 0; I < dlwrap::size(); I++) {
|
|
const char *Sym = dlwrap::symbol(I);
|
|
|
|
auto It = TryFirst.find(Sym);
|
|
if (It != TryFirst.end()) {
|
|
const char *First = It->second;
|
|
void *P = DynlibHandle->getAddressOfSymbol(First);
|
|
if (P) {
|
|
DP("Implementing %s with dlsym(%s) -> %p\n", Sym, First, P);
|
|
*dlwrap::pointer(I) = P;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
void *P = DynlibHandle->getAddressOfSymbol(Sym);
|
|
if (P == nullptr) {
|
|
DP("Unable to find '%s' in '%s'!\n", Sym, CudaLib);
|
|
return false;
|
|
}
|
|
DP("Implementing %s with dlsym(%s) -> %p\n", Sym, Sym, P);
|
|
|
|
*dlwrap::pointer(I) = P;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/// Replacement for the driver's cuInit: loads the CUDA library and resolves
/// all wrapped symbols via checkForCUDA(), then forwards \p X to the real
/// cuInit through dlwrap_cuInit.
///
/// \returns CUDA_ERROR_INVALID_HANDLE when checkForCUDA() fails, otherwise
/// whatever dlwrap_cuInit returns.
CUresult cuInit(unsigned X) {
  // Note: Called exactly once from cuda rtl.cpp in a global constructor, so
  // repeated or concurrent calls do not need to be handled here.
  const bool DriverResolved = checkForCUDA();
  return DriverResolved ? dlwrap_cuInit(X) : CUDA_ERROR_INVALID_HANDLE;
}
|