
//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// RTL NextGen for CUDA machine
//
//===----------------------------------------------------------------------===//

#include <cassert>
#include <cstddef>
#include <cuda.h>
#include <string>
#include <unordered_map>

#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "Shared/Environment.h"

#include "GlobalHandler.h"
#include "OpenMP/OMPT/Callback.h"
#include "PluginInterface.h"
#include "Utils/ELF.h"

#include "llvm/BinaryFormat/ELF.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Program.h"

namespace llvm {
namespace omp {
namespace target {
namespace plugin {

/// Forward declarations for all specialized data structures.
struct CUDAKernelTy;
struct CUDADeviceTy;
struct CUDAPluginTy;

#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11000))
/// Forward declarations for all Virtual Memory Management
/// related data structures and functions. This is necessary
/// for older CUDA versions.
typedef void *CUmemGenericAllocationHandle;
typedef void *CUmemAllocationProp;
typedef void *CUmemAccessDesc;
typedef void *CUmemAllocationGranularity_flags;
// Stubs so the plugin still compiles against old CUDA headers. The VA
// management paths are disabled at runtime (supportVAManagement() returns
// false below), so if these are ever reached they report the feature as
// unsupported instead of falling off the end of a non-void function.
CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
                             CUdeviceptr addr, unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
                  CUmemGenericAllocationHandle handle,
                  unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
                     const CUmemAllocationProp *prop,
                     unsigned long long flags) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
                        const CUmemAccessDesc *desc, size_t count) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
CUresult
cuMemGetAllocationGranularity(size_t *granularity,
                              const CUmemAllocationProp *prop,
                              CUmemAllocationGranularity_flags option) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif

#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11020))
// Forward declarations of asynchronous memory management functions. This is
// necessary for older versions of CUDA.
CUresult cuMemAllocAsync(CUdeviceptr *ptr, size_t, CUstream) {
  *ptr = 0;
  return CUDA_ERROR_NOT_SUPPORTED;
}

CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {
  return CUDA_ERROR_NOT_SUPPORTED;
}
#endif

/// Class implementing the CUDA device images properties.
struct CUDADeviceImageTy : public DeviceImageTy {
  /// Create the CUDA image with the id and the target image pointer.
  CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
                    const __tgt_device_image *TgtImage)
      : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {}

  /// Load the image as a CUDA module.
  Error loadModule() {
    assert(!Module && "Module already loaded");

    CUresult Res = cuModuleLoadDataEx(&Module, getStart(), 0, nullptr, nullptr);
    if (auto Err = Plugin::check(Res, "Error in cuModuleLoadDataEx: %s"))
      return Err;

    return Plugin::success();
  }

  /// Unload the CUDA module corresponding to the image.
  Error unloadModule() {
    assert(Module && "Module not loaded");

    CUresult Res = cuModuleUnload(Module);
    if (auto Err = Plugin::check(Res, "Error in cuModuleUnload: %s"))
      return Err;

    Module = nullptr;

    return Plugin::success();
  }

  /// Getter of the CUDA module.
  CUmodule getModule() const { return Module; }

private:
  /// The CUDA module that loaded the image.
  CUmodule Module;
};

/// Class implementing the CUDA kernel functionalities which derives from the
/// generic kernel class.
struct CUDAKernelTy : public GenericKernelTy {
  /// Create a CUDA kernel with a name.
  CUDAKernelTy(const char *Name) : GenericKernelTy(Name), Func(nullptr) {}

  /// Initialize the CUDA kernel.
  Error initImpl(GenericDeviceTy &GenericDevice,
                 DeviceImageTy &Image) override {
    CUresult Res;
    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);

    // Retrieve the function pointer of the kernel.
    Res = cuModuleGetFunction(&Func, CUDAImage.getModule(), getName());
    if (auto Err = Plugin::check(Res, "Error in cuModuleGetFunction('%s'): %s",
                                 getName()))
      return Err;

    // Check that the function pointer is valid.
    if (!Func)
      return Plugin::error("Invalid function for kernel %s", getName());

    int MaxThreads;
    Res = cuFuncGetAttribute(&MaxThreads,
                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Func);
    if (auto Err = Plugin::check(Res, "Error in cuFuncGetAttribute: %s"))
      return Err;

    // The maximum number of threads cannot exceed the kernel's own limit.
    MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);

    return Plugin::success();
  }

  /// Launch the CUDA kernel function.
  Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
                   uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
                   KernelLaunchParamsTy LaunchParams,
                   AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

private:
  /// The CUDA kernel function to execute.
  CUfunction Func;
};
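
// Typical kernel lifecycle in this plugin (a sketch mirroring the code below):
// the device's constructKernel() allocates a CUDAKernelTy, init() resolves the
// CUfunction from the image's module via cuModuleGetFunction, and launchImpl()
// finally enqueues it on a stream with cuLaunchKernel.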

/// Class wrapping a CUDA stream reference. These are the objects handled by the
/// Stream Manager for the CUDA plugin.
struct CUDAStreamRef final : public GenericDeviceResourceRef {
  /// The underlying handle type for streams.
  using HandleTy = CUstream;

  /// Create an empty reference to an invalid stream.
  CUDAStreamRef() : Stream(nullptr) {}

  /// Create a reference to an existing stream.
  CUDAStreamRef(HandleTy Stream) : Stream(Stream) {}

  /// Create a new stream and save the reference. The reference must be empty
  /// before calling this function.
  Error create(GenericDeviceTy &Device) override {
    if (Stream)
      return Plugin::error("Creating an existing stream");

    CUresult Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
    if (auto Err = Plugin::check(Res, "Error in cuStreamCreate: %s"))
      return Err;

    return Plugin::success();
  }

  /// Destroy the referenced stream and invalidate the reference. The reference
  /// must point to a valid stream before calling this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Stream)
      return Plugin::error("Destroying an invalid stream");

    CUresult Res = cuStreamDestroy(Stream);
    if (auto Err = Plugin::check(Res, "Error in cuStreamDestroy: %s"))
      return Err;

    Stream = nullptr;
    return Plugin::success();
  }

  /// Get the underlying CUDA stream.
  operator HandleTy() const { return Stream; }

private:
  /// The reference to the CUDA stream.
  HandleTy Stream;
};

/// Class wrapping a CUDA event reference. These are the objects handled by the
/// Event Manager for the CUDA plugin.
struct CUDAEventRef final : public GenericDeviceResourceRef {
  /// The underlying handle type for events.
  using HandleTy = CUevent;

  /// Create an empty reference to an invalid event.
  CUDAEventRef() : Event(nullptr) {}

  /// Create a reference to an existing event.
  CUDAEventRef(HandleTy Event) : Event(Event) {}

  /// Create a new event and save the reference. The reference must be empty
  /// before calling this function.
  Error create(GenericDeviceTy &Device) override {
    if (Event)
      return Plugin::error("Creating an existing event");

    CUresult Res = cuEventCreate(&Event, CU_EVENT_DEFAULT);
    if (auto Err = Plugin::check(Res, "Error in cuEventCreate: %s"))
      return Err;

    return Plugin::success();
  }

  /// Destroy the referenced event and invalidate the reference. The reference
  /// must point to a valid event before calling this function.
  Error destroy(GenericDeviceTy &Device) override {
    if (!Event)
      return Plugin::error("Destroying an invalid event");

    CUresult Res = cuEventDestroy(Event);
    if (auto Err = Plugin::check(Res, "Error in cuEventDestroy: %s"))
      return Err;

    Event = nullptr;
    return Plugin::success();
  }

  /// Get the underlying CUevent.
  operator HandleTy() const { return Event; }

private:
  /// The reference to the CUDA event.
  HandleTy Event;
};
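
// Both reference types plug into GenericDeviceResourceManagerTy, which pools
// them: getResource() hands out an idle stream/event (calling create() only
// when the pool is empty) and returnResource() recycles it instead of
// destroying it, keeping cuStreamCreate/cuEventCreate off the hot path.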

/// Class implementing the CUDA device functionalities which derives from the
/// generic device class.
struct CUDADeviceTy : public GenericDeviceTy {
  // Create a CUDA device with a device id and the default CUDA grid values.
  CUDADeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, int32_t NumDevices)
      : GenericDeviceTy(Plugin, DeviceId, NumDevices, NVPTXGridValues),
        CUDAStreamManager(*this), CUDAEventManager(*this) {}

  ~CUDADeviceTy() {}

  /// Initialize the device, its resources and get its properties.
  Error initImpl(GenericPluginTy &Plugin) override {
    CUresult Res = cuDeviceGet(&Device, DeviceId);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
      return Err;

    // Query the current flags of the primary context and set its flags if
    // it is inactive.
    unsigned int FormerPrimaryCtxFlags = 0;
    int FormerPrimaryCtxIsActive = 0;
    Res = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
                                     &FormerPrimaryCtxIsActive);
    if (auto Err =
            Plugin::check(Res, "Error in cuDevicePrimaryCtxGetState: %s"))
      return Err;

    if (FormerPrimaryCtxIsActive) {
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is active, no change to its flags\n");
      if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
          CU_CTX_SCHED_BLOCKING_SYNC)
        INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
             "Warning: The current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
    } else {
      INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
           "The primary context is inactive, set its flags to "
           "CU_CTX_SCHED_BLOCKING_SYNC\n");
      Res = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
      if (auto Err =
              Plugin::check(Res, "Error in cuDevicePrimaryCtxSetFlags: %s"))
        return Err;
    }

    // Retain the per device primary context and save it to use whenever this
    // device is selected.
    Res = cuDevicePrimaryCtxRetain(&Context, Device);
    if (auto Err = Plugin::check(Res, "Error in cuDevicePrimaryCtxRetain: %s"))
      return Err;

    if (auto Err = setContext())
      return Err;

    // Initialize stream pool.
    if (auto Err = CUDAStreamManager.init(OMPX_InitialNumStreams))
      return Err;

    // Initialize event pool.
    if (auto Err = CUDAEventManager.init(OMPX_InitialNumEvents))
      return Err;

    // Query attributes to determine number of threads/block and blocks/grid.
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                 GridValues.GV_Max_Teams))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                 GridValues.GV_Max_WG_Size))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                 GridValues.GV_Warp_Size))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                 ComputeCapability.Major))
      return Err;

    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                 ComputeCapability.Minor))
      return Err;

    uint32_t NumMultiprocessors = 0;
    uint32_t MaxThreadsPerSM = 0;
    uint32_t WarpSize = 0;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                 NumMultiprocessors))
      return Err;
    if (auto Err =
            getDeviceAttr(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                          MaxThreadsPerSM))
      return Err;
    if (auto Err = getDeviceAttr(CU_DEVICE_ATTRIBUTE_WARP_SIZE, WarpSize))
      return Err;
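    // The product below is the number of warps resident across all SMs when
    // every SM is fully occupied. E.g., 80 SMs with 2048 threads/SM and a warp
    // size of 32 yield 80 * (2048 / 32) = 5120 concurrently resident warps.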
    HardwareParallelism = NumMultiprocessors * (MaxThreadsPerSM / WarpSize);

    return Plugin::success();
  }

  /// Deinitialize the device and release its resources.
  Error deinitImpl() override {
    if (Context) {
      if (auto Err = setContext())
        return Err;
    }

    // Deinitialize the stream manager.
    if (auto Err = CUDAStreamManager.deinit())
      return Err;

    if (auto Err = CUDAEventManager.deinit())
      return Err;

    // Close modules if necessary.
    if (!LoadedImages.empty()) {
      assert(Context && "Invalid CUDA context");

      // Each image has its own module.
      for (DeviceImageTy *Image : LoadedImages) {
        CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(*Image);

        // Unload the module of the image.
        if (auto Err = CUDAImage.unloadModule())
          return Err;
      }
    }

    if (Context) {
      CUresult Res = cuDevicePrimaryCtxRelease(Device);
      if (auto Err =
              Plugin::check(Res, "Error in cuDevicePrimaryCtxRelease: %s"))
        return Err;
    }

    // Invalidate context and device references.
    Context = nullptr;
    Device = CU_DEVICE_INVALID;

    return Plugin::success();
  }

  virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
                                       DeviceImageTy &Image) override {
    // Check for the presence of global destructors at initialization time.
    // This is required when the image may be deallocated before destructors
    // are run.
    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
    if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
      Image.setPendingGlobalDtors();

    return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
  }

  virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
                                      DeviceImageTy &Image) override {
    if (Image.hasPendingGlobalDtors())
      return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
    return Plugin::success();
  }

  Expected<std::unique_ptr<MemoryBuffer>>
  doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override {
    // TODO: We should be able to use the 'nvidia-ptxjitcompiler' interface to
    // avoid the call to 'ptxas'.
    SmallString<128> PTXInputFilePath;
    std::error_code EC = sys::fs::createTemporaryFile("nvptx-pre-link-jit", "s",
                                                      PTXInputFilePath);
    if (EC)
      return Plugin::error("Failed to create temporary file for ptxas");

    // Write the memory buffer's contents to the temporary input file.
    Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
        FileOutputBuffer::create(PTXInputFilePath, MB->getBuffer().size());
    if (!OutputOrErr)
      return OutputOrErr.takeError();
    std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
    llvm::copy(MB->getBuffer(), Output->getBufferStart());
    if (Error E = Output->commit())
      return std::move(E);

    SmallString<128> PTXOutputFilePath;
    EC = sys::fs::createTemporaryFile("nvptx-post-link-jit", "cubin",
                                      PTXOutputFilePath);
    if (EC)
      return Plugin::error("Failed to create temporary file for ptxas");

    // Try to find `ptxas` in the path to compile the PTX to a binary.
    const auto ErrorOrPath = sys::findProgramByName("ptxas");
    if (!ErrorOrPath)
      return Plugin::error("Failed to find 'ptxas' on the PATH.");

    std::string Arch = getComputeUnitKind();
    StringRef Args[] = {*ErrorOrPath,
                        "-m64",
                        "-O2",
                        "--gpu-name",
                        Arch,
                        "--output-file",
                        PTXOutputFilePath,
                        PTXInputFilePath};

    std::string ErrMsg;
    if (sys::ExecuteAndWait(*ErrorOrPath, Args, std::nullopt, {}, 0, 0,
                            &ErrMsg))
      return Plugin::error("Running 'ptxas' failed: %s\n", ErrMsg.c_str());

    auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(PTXOutputFilePath.data());
    if (!BufferOrErr)
      return Plugin::error("Failed to open temporary file for ptxas");

    // Clean up the temporary files afterwards.
    if (sys::fs::remove(PTXOutputFilePath))
      return Plugin::error("Failed to remove temporary file for ptxas");
    if (sys::fs::remove(PTXInputFilePath))
      return Plugin::error("Failed to remove temporary file for ptxas");

    return std::move(*BufferOrErr);
  }

  /// Allocate and construct a CUDA kernel.
  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
    // Allocate and construct the CUDA kernel.
    CUDAKernelTy *CUDAKernel = Plugin.allocate<CUDAKernelTy>();
    if (!CUDAKernel)
      return Plugin::error("Failed to allocate memory for CUDA kernel");

    new (CUDAKernel) CUDAKernelTy(Name);

    return *CUDAKernel;
  }

  /// Set the current context to this device's context.
  Error setContext() override {
    CUresult Res = cuCtxSetCurrent(Context);
    return Plugin::check(Res, "Error in cuCtxSetCurrent: %s");
  }

  /// NVIDIA returns the product of the SM count and the number of warps that
  /// fit if the maximum number of threads were scheduled on each SM.
  uint64_t getHardwareParallelism() const override {
    return HardwareParallelism;
  }

  /// We want to set up the RPC server for host services to the GPU if it is
  /// available.
  bool shouldSetupRPCServer() const override { return true; }

  /// The RPC interface should have enough space for all available parallelism.
  uint64_t requestedRPCPortCount() const override {
    return getHardwareParallelism();
  }

  /// Get the stream of the asynchronous info structure or get a new one.
  Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
    // Get the stream (if any) from the async info.
    Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
    if (!Stream) {
      // There was no stream; get an idle one.
      if (auto Err = CUDAStreamManager.getResource(Stream))
        return Err;

      // Modify the async info's stream.
      AsyncInfoWrapper.setQueueAs<CUstream>(Stream);
    }
    return Plugin::success();
  }
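
  // Typical call pattern (sketch): every asynchronous operation below first
  // resolves its stream this way, so all work queued on one __tgt_async_info
  // is ordered on the same CUstream:
  //   CUstream Stream;
  //   if (auto Err = getStream(AsyncInfoWrapper, Stream))
  //     return Err;
  //   // ... enqueue cuMemcpy*Async / cuLaunchKernel on Stream ...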

  /// Getters of CUDA references.
  CUcontext getCUDAContext() const { return Context; }
  CUdevice getCUDADevice() const { return Device; }

  /// Load the binary image into the device and allocate an image object.
  Expected<DeviceImageTy *> loadBinaryImpl(const __tgt_device_image *TgtImage,
                                           int32_t ImageId) override {
    if (auto Err = setContext())
      return std::move(Err);

    // Allocate and initialize the image object.
    CUDADeviceImageTy *CUDAImage = Plugin.allocate<CUDADeviceImageTy>();
    new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage);

    // Load the CUDA module.
    if (auto Err = CUDAImage->loadModule())
      return std::move(Err);

    return CUDAImage;
  }

  /// Allocate memory on the device or related to the device.
  void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
    if (Size == 0)
      return nullptr;

    if (auto Err = setContext()) {
      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
      return nullptr;
    }

    void *MemAlloc = nullptr;
    CUdeviceptr DevicePtr;
    CUresult Res;

    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
      Res = cuMemAlloc(&DevicePtr, Size);
      MemAlloc = (void *)DevicePtr;
      break;
    case TARGET_ALLOC_HOST:
      Res = cuMemAllocHost(&MemAlloc, Size);
      break;
    case TARGET_ALLOC_SHARED:
      Res = cuMemAllocManaged(&DevicePtr, Size, CU_MEM_ATTACH_GLOBAL);
      MemAlloc = (void *)DevicePtr;
      break;
    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
      CUstream Stream;
      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
        break;
      if ((Res = cuMemAllocAsync(&DevicePtr, Size, Stream)))
        break;
      cuStreamSynchronize(Stream);
      Res = cuStreamDestroy(Stream);
      MemAlloc = (void *)DevicePtr;
    }
    }

    if (auto Err =
            Plugin::check(Res, "Error in cuMemAlloc[Host|Managed]: %s")) {
      REPORT("Failure to alloc memory: %s\n", toString(std::move(Err)).data());
      return nullptr;
    }
    return MemAlloc;
  }
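
  // Each allocation kind must be released with the matching branch in free()
  // below: cuMemAlloc/cuMemAllocManaged pair with cuMemFree, cuMemAllocHost
  // with cuMemFreeHost, and cuMemAllocAsync with cuMemFreeAsync.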

  /// Deallocate memory on the device or related to the device.
  int free(void *TgtPtr, TargetAllocTy Kind) override {
    if (TgtPtr == nullptr)
      return OFFLOAD_SUCCESS;

    if (auto Err = setContext()) {
      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
      return OFFLOAD_FAIL;
    }

    CUresult Res;
    switch (Kind) {
    case TARGET_ALLOC_DEFAULT:
    case TARGET_ALLOC_DEVICE:
    case TARGET_ALLOC_SHARED:
      Res = cuMemFree((CUdeviceptr)TgtPtr);
      break;
    case TARGET_ALLOC_HOST:
      Res = cuMemFreeHost(TgtPtr);
      break;
    case TARGET_ALLOC_DEVICE_NON_BLOCKING: {
      CUstream Stream;
      if ((Res = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING)))
        break;
      cuMemFreeAsync(reinterpret_cast<CUdeviceptr>(TgtPtr), Stream);
      cuStreamSynchronize(Stream);
      if ((Res = cuStreamDestroy(Stream)))
        break;
    }
    }

    if (auto Err = Plugin::check(Res, "Error in cuMemFree[Host]: %s")) {
      REPORT("Failure to free memory: %s\n", toString(std::move(Err)).data());
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  /// Synchronize current thread with the pending operations on the async info.
  Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
    CUresult Res;
    Res = cuStreamSynchronize(Stream);

    // Once the stream is synchronized, return it to stream pool and reset
    // AsyncInfo. This is to make sure the synchronization only works for its
    // own tasks.
    AsyncInfo.Queue = nullptr;
    if (auto Err = CUDAStreamManager.returnResource(Stream))
      return Err;

    return Plugin::check(Res, "Error in cuStreamSynchronize: %s");
  }

  /// Whether CUDA supports virtual address (VA) management.
  bool supportVAManagement() const override {
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11000))
    return true;
#else
    return false;
#endif
  }

  /// Allocates \p RSize bytes (rounded up to page size) and hints the CUDA
  /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
  /// At return, \p RSize contains the actual size.
  Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
    auto IHandle = DeviceMMaps.find(DVAddr);
    size_t Size = *RSize;

    if (Size == 0)
      return Plugin::error("Memory Map Size must be larger than 0");

    // Check if we have already mapped this address.
    if (IHandle != DeviceMMaps.end())
      return Plugin::error("Address already memory mapped");

    CUmemAllocationProp Prop = {};
    size_t Granularity = 0;

    size_t Free, Total;
    CUresult Res = cuMemGetInfo(&Free, &Total);
    if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
      return Err;

    if (Size >= Free) {
      *Addr = nullptr;
      return Plugin::error(
          "Cannot map memory size larger than the available device memory");
    }

    // Currently NVIDIA only supports pinned allocation types.
    Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;

    Prop.location.id = DeviceId;
    Res = cuMemGetAllocationGranularity(&Granularity, &Prop,
                                        CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    if (auto Err =
            Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s"))
      return Err;

    if (Granularity == 0)
      return Plugin::error("Wrong device page size");

    // Round the size up to the allocation granularity (page size).
    Size = utils::roundUp(Size, Granularity);

    // Create a handle for our allocation.
    CUmemGenericAllocationHandle AHandle;
    Res = cuMemCreate(&AHandle, Size, &Prop, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s"))
      return Err;

    CUdeviceptr DevPtr = 0;
    Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s"))
      return Err;

    Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
    if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s"))
      return Err;

    CUmemAccessDesc ADesc = {};
    ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    ADesc.location.id = DeviceId;
    ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

    // Make the mapped range readable and writable from this device.
    Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
    if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s"))
      return Err;

    *Addr = reinterpret_cast<void *>(DevPtr);
    *RSize = Size;
    DeviceMMaps.insert({DevPtr, AHandle});
    return Plugin::success();
  }
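
  // The mapping above follows the usual CUDA VMM sequence: cuMemCreate makes a
  // physical allocation, cuMemAddressReserve reserves a VA range (hinted at
  // DVAddr), cuMemMap binds the two, and cuMemSetAccess grants access.
  // memoryVAUnMap() below undoes it in reverse order: cuMemUnmap,
  // cuMemRelease, cuMemAddressFree.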

  /// De-allocates device memory and unmaps the virtual address.
  Error memoryVAUnMap(void *VAddr, size_t Size) override {
    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
    auto IHandle = DeviceMMaps.find(DVAddr);
    // The mapping does not exist.
    if (IHandle == DeviceMMaps.end())
      return Plugin::error("Addr is not MemoryMapped");

    CUmemGenericAllocationHandle &AllocHandle = IHandle->second;

    CUresult Res = cuMemUnmap(DVAddr, Size);
    if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s"))
      return Err;

    Res = cuMemRelease(AllocHandle);
    if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s"))
      return Err;

    Res = cuMemAddressFree(DVAddr, Size);
    if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s"))
      return Err;

    DeviceMMaps.erase(IHandle);
    return Plugin::success();
  }

  /// Query for the completion of the pending operations on the async info.
  Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
    CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
    CUresult Res = cuStreamQuery(Stream);

    // Not-ready streams must be considered successful operations.
    if (Res == CUDA_ERROR_NOT_READY)
      return Plugin::success();

    // Once the stream is synchronized and the operations completed (or an
    // error occurs), return it to the stream pool and reset AsyncInfo. This is
    // to make sure the synchronization only works for its own tasks.
    AsyncInfo.Queue = nullptr;
    if (auto Err = CUDAStreamManager.returnResource(Stream))
      return Err;

    return Plugin::check(Res, "Error in cuStreamQuery: %s");
  }

  Expected<void *> dataLockImpl(void *HstPtr, int64_t Size) override {
    // TODO: Register the buffer as CUDA host memory.
    return HstPtr;
  }

  Error dataUnlockImpl(void *HstPtr) override { return Plugin::success(); }

  Expected<bool> isPinnedPtrImpl(void *HstPtr, void *&BaseHstPtr,
                                 void *&BaseDevAccessiblePtr,
                                 size_t &BaseSize) const override {
    // TODO: Implement pinning feature for CUDA.
    return false;
  }

  /// Submit data to the device (host to device transfer).
  Error dataSubmitImpl(void *TgtPtr, const void *HstPtr, int64_t Size,
                       AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    CUresult Res = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyHtoDAsync: %s");
  }

  /// Retrieve data from the device (device to host transfer).
  Error dataRetrieveImpl(void *HstPtr, const void *TgtPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s");
  }

  /// Exchange data between two devices directly. We may use peer access if
  /// the CUDA devices and driver allow it.
  Error dataExchangeImpl(const void *SrcPtr, GenericDeviceTy &DstGenericDevice,
                         void *DstPtr, int64_t Size,
                         AsyncInfoWrapperTy &AsyncInfoWrapper) override;

  /// Initialize the async info for interoperability purposes.
  Error initAsyncInfoImpl(AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    if (auto Err = setContext())
      return Err;

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    return Plugin::success();
  }

  /// Initialize the device info for interoperability purposes.
  Error initDeviceInfoImpl(__tgt_device_info *DeviceInfo) override {
    assert(Context && "Context is null");
    assert(Device != CU_DEVICE_INVALID && "Invalid CUDA device");

    if (auto Err = setContext())
      return Err;

    if (!DeviceInfo->Context)
      DeviceInfo->Context = Context;

    if (!DeviceInfo->Device)
      DeviceInfo->Device = reinterpret_cast<void *>(Device);

    return Plugin::success();
  }

  /// Create an event.
  Error createEventImpl(void **EventPtrStorage) override {
    CUevent *Event = reinterpret_cast<CUevent *>(EventPtrStorage);
    return CUDAEventManager.getResource(*Event);
  }

  /// Destroy a previously created event.
  Error destroyEventImpl(void *EventPtr) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
    return CUDAEventManager.returnResource(Event);
  }

  /// Record the event.
  Error recordEventImpl(void *EventPtr,
                        AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    CUresult Res = cuEventRecord(Event, Stream);
    return Plugin::check(Res, "Error in cuEventRecord: %s");
  }

  /// Make the stream wait on the event.
  Error waitEventImpl(void *EventPtr,
                      AsyncInfoWrapperTy &AsyncInfoWrapper) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);

    CUstream Stream;
    if (auto Err = getStream(AsyncInfoWrapper, Stream))
      return Err;

    // Do not use CU_EVENT_WAIT_DEFAULT here as it is only available from a
    // specific CUDA version onward, where it is defined as 0x0. In previous
    // versions, per the CUDA API documentation, the argument has to be 0x0.
    CUresult Res = cuStreamWaitEvent(Stream, Event, 0);
    return Plugin::check(Res, "Error in cuStreamWaitEvent: %s");
  }

  /// Synchronize the current thread with the event.
  Error syncEventImpl(void *EventPtr) override {
    CUevent Event = reinterpret_cast<CUevent>(EventPtr);
    CUresult Res = cuEventSynchronize(Event);
    return Plugin::check(Res, "Error in cuEventSynchronize: %s");
  }
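
  // Event lifecycle (sketch): createEventImpl() pulls a pooled CUevent,
  // recordEventImpl() marks a point in a stream, waitEventImpl() makes another
  // stream wait for that point (cross-stream ordering), syncEventImpl() blocks
  // the host thread, and destroyEventImpl() returns the event to the pool.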

  /// Print information about the device.
  Error obtainInfoImpl(InfoQueueTy &Info) override {
    char TmpChar[1000];
    const char *TmpCharPtr;
    size_t TmpSt;
    int TmpInt;

    CUresult Res = cuDriverGetVersion(&TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("CUDA Driver Version", TmpInt);

    Info.add("CUDA OpenMP Device Number", DeviceId);

    Res = cuDeviceGetName(TmpChar, 1000, Device);
    if (Res == CUDA_SUCCESS)
      Info.add("Device Name", TmpChar);

    Res = cuDeviceTotalMem(&TmpSt, Device);
    if (Res == CUDA_SUCCESS)
      Info.add("Global Memory Size", TmpSt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Number of Multiprocessors", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Copy and Execution", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Total Constant Memory", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Max Shared Memory per Block", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Registers per Block", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_WARP_SIZE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Warp Size", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Maximum Threads per Block", TmpInt);

    Info.add("Maximum Block Dimensions", "");
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("x", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("y", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("z", TmpInt);

    Info.add("Maximum Grid Dimensions", "");
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("x", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("y", TmpInt);
    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add<InfoLevel2>("z", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_PITCH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Maximum Memory Pitch", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Texture Alignment", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CLOCK_RATE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Clock Rate", TmpInt, "kHz");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Execution Timeout", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_INTEGRATED, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Integrated Device", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Can Map Host Memory", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, TmpInt);
    if (Res == CUDA_SUCCESS) {
      if (TmpInt == CU_COMPUTEMODE_DEFAULT)
        TmpCharPtr = "Default";
      else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
        TmpCharPtr = "Prohibited";
      else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
        TmpCharPtr = "Exclusive process";
      else
        TmpCharPtr = "Unknown";
      Info.add("Compute Mode", TmpCharPtr);
    }

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Kernels", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ECC_ENABLED, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("ECC Enabled", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Memory Clock Rate", TmpInt, "kHz");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Memory Bus Width", TmpInt, "bits");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("L2 Cache Size", TmpInt, "bytes");

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Max Threads Per SMP", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Async Engines", TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Unified Addressing", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Managed Memory", (bool)TmpInt);

    Res =
        getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Concurrent Managed Memory", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
                           TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Preemption Supported", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Cooperative Launch", (bool)TmpInt);

    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, TmpInt);
    if (Res == CUDA_SUCCESS)
      Info.add("Multi-Device Board", (bool)TmpInt);

    Info.add("Compute Capabilities", ComputeCapability.str());

    return Plugin::success();
  }

  virtual bool shouldSetupDeviceMemoryPool() const override {
    /// We use the CUDA malloc for now.
    return false;
  }

  /// Getters and setters for stack and heap sizes.
  Error getDeviceStackSize(uint64_t &Value) override {
    return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
  }
  Error setDeviceStackSize(uint64_t Value) override {
    return setCtxLimit(CU_LIMIT_STACK_SIZE, Value);
  }
  Error getDeviceHeapSize(uint64_t &Value) override {
    return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
  }
  Error setDeviceHeapSize(uint64_t Value) override {
    return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
  }
  Error getDeviceMemorySize(uint64_t &Value) override {
    CUresult Res = cuDeviceTotalMem(&Value, Device);
    return Plugin::check(Res, "Error in getDeviceMemorySize %s");
  }

  /// CUDA-specific functions for getting and setting context limits.
  Error setCtxLimit(CUlimit Kind, uint64_t Value) {
    CUresult Res = cuCtxSetLimit(Kind, Value);
    return Plugin::check(Res, "Error in cuCtxSetLimit: %s");
  }
  Error getCtxLimit(CUlimit Kind, uint64_t &Value) {
    CUresult Res = cuCtxGetLimit(&Value, Kind);
    return Plugin::check(Res, "Error in cuCtxGetLimit: %s");
  }

  /// CUDA-specific function to get device attributes.
  Error getDeviceAttr(uint32_t Kind, uint32_t &Value) {
    // TODO: Warn if the new value is larger than the old.
    CUresult Res =
        cuDeviceGetAttribute((int *)&Value, (CUdevice_attribute)Kind, Device);
    return Plugin::check(Res, "Error in cuDeviceGetAttribute: %s");
  }

  CUresult getDeviceAttrRaw(uint32_t Kind, int &Value) {
    return cuDeviceGetAttribute(&Value, (CUdevice_attribute)Kind, Device);
  }

  /// See GenericDeviceTy::getComputeUnitKind().
  std::string getComputeUnitKind() const override {
    return ComputeCapability.str();
  }

  /// Returns the clock frequency for the given NVPTX device.
  uint64_t getClockFrequency() const override { return 1000000000; }

private:
  using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
  using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;

  Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
                                 bool IsCtor) {
    const char *KernelName = IsCtor ? "nvptx$device$init" : "nvptx$device$fini";
    // Perform a quick check for the named kernel in the image. The kernel
    // should be created by the 'nvptx-lower-ctor-dtor' pass.
    GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
    if (IsCtor && !Handler.isSymbolInImage(*this, Image, KernelName))
      return Plugin::success();

    // The NVIDIA backend cannot handle creating the ctor / dtor array
    // automatically so we must create it ourselves. The backend will emit
    // several globals that contain function pointers we can call. These are
    // prefixed with a known name due to NVIDIA's lack of section support.
    auto ELFObjOrErr = Handler.getELFObjectFile(Image);
    if (!ELFObjOrErr)
      return ELFObjOrErr.takeError();

    // Search for all symbols that contain a constructor or destructor.
    SmallVector<std::pair<StringRef, uint16_t>> Funcs;
    for (ELFSymbolRef Sym : (*ELFObjOrErr)->symbols()) {
      auto NameOrErr = Sym.getName();
      if (!NameOrErr)
        return NameOrErr.takeError();

      if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
                                         : "__fini_array_object_"))
        continue;

      uint16_t Priority;
      if (NameOrErr->rsplit('_').second.getAsInteger(10, Priority))
        return Plugin::error("Invalid priority for constructor or destructor");

      Funcs.emplace_back(*NameOrErr, Priority);
    }

    // Sort the created array to be in priority order.
    llvm::sort(Funcs, [=](auto X, auto Y) { return X.second < Y.second; });

    // Allocate a buffer to store all of the known constructor / destructor
    // functions so we can iterate over them on the device.
    void *Buffer =
        allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE);
    if (!Buffer)
      return Plugin::error("Failed to allocate memory for global buffer");

    auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
    auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();

    SmallVector<void *> FunctionPtrs(Funcs.size());
    std::size_t Idx = 0;
    for (auto [Name, Priority] : Funcs) {
      GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]);
      if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
        return Err;
    }

    // Copy the local buffer to the device.
    if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(),
                              FunctionPtrs.size() * sizeof(void *), nullptr))
      return Err;

    // Copy the created buffer to the appropriate symbols so the kernel can
    // iterate through them.
    GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
                         sizeof(void *), &GlobalPtrStart);
    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
      return Err;

    GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
                        sizeof(void *), &GlobalPtrStop);
    if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
      return Err;

    CUDAKernelTy CUDAKernel(KernelName);

    if (auto Err = CUDAKernel.init(*this, Image))
      return Err;

    AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);

    KernelArgsTy KernelArgs = {};
    uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
    if (auto Err = CUDAKernel.launchImpl(
            *this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
            KernelLaunchParamsTy{}, AsyncInfoWrapper))
      return Err;

    Error Err = Plugin::success();
    AsyncInfoWrapper.finalize(Err);

    if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
      return Plugin::error("Failed to free memory for global buffer");

    return Err;
  }

  /// Stream manager for CUDA streams.
  CUDAStreamManagerTy CUDAStreamManager;

  /// Event manager for CUDA events.
  CUDAEventManagerTy CUDAEventManager;

  /// The device's context. This context should be set before performing
  /// operations on the device.
  CUcontext Context = nullptr;

  /// The CUDA device handler.
  CUdevice Device = CU_DEVICE_INVALID;

  /// The memory mapped addresses and their handles.
  std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;

  /// The compute capability of the corresponding CUDA device.
  struct ComputeCapabilityTy {
    uint32_t Major;
    uint32_t Minor;
    std::string str() const {
      return "sm_" + std::to_string(Major * 10 + Minor);
    }
  } ComputeCapability;
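
  // E.g., Major = 7 and Minor = 0 produce "sm_70", matching the value passed
  // to ptxas via --gpu-name in doJITPostProcessing() and the ELF flag decoded
  // in isELFCompatible() below.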

  /// The maximum number of warps that can be resident on all the SMs
  /// simultaneously.
  uint32_t HardwareParallelism = 0;
};

Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                               uint32_t NumThreads[3], uint32_t NumBlocks[3],
                               KernelArgsTy &KernelArgs,
                               KernelLaunchParamsTy LaunchParams,
                               AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);

  CUstream Stream;
  if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
    return Err;

  uint32_t MaxDynCGroupMem =
      std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
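
  // CU_LAUNCH_PARAM_BUFFER_POINTER/CU_LAUNCH_PARAM_BUFFER_SIZE hand the driver
  // one packed argument buffer (LaunchParams.Data of LaunchParams.Size bytes)
  // instead of the usual per-argument pointer array, which is why the
  // kernelParams argument of cuLaunchKernel below is nullptr.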

  void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
                    CU_LAUNCH_PARAM_BUFFER_SIZE,
                    reinterpret_cast<void *>(&LaunchParams.Size),
                    CU_LAUNCH_PARAM_END};

  // If we are running an RPC server we want to wake up the server thread
  // whenever there is a kernel running and let it sleep otherwise.
  if (GenericDevice.getRPCServer())
    GenericDevice.Plugin.getRPCServer().Thread->notify();

  CUresult Res = cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
                                NumThreads[0], NumThreads[1], NumThreads[2],
                                MaxDynCGroupMem, Stream, nullptr, Config);

  // Register a callback to indicate when the kernel is complete.
  if (GenericDevice.getRPCServer())
    cuLaunchHostFunc(
        Stream,
        [](void *Data) {
          GenericPluginTy &Plugin = *reinterpret_cast<GenericPluginTy *>(Data);
          Plugin.getRPCServer().Thread->finish();
        },
        &GenericDevice.Plugin);

  return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
}

/// Class implementing the CUDA-specific functionalities of the global handler.
class CUDAGlobalHandlerTy final : public GenericGlobalHandlerTy {
public:
  /// Get the metadata of a global from the device. The name and size of the
  /// global are read from DeviceGlobal and the address of the global is
  /// written to DeviceGlobal.
  Error getGlobalMetadataFromDevice(GenericDeviceTy &Device,
                                    DeviceImageTy &Image,
                                    GlobalTy &DeviceGlobal) override {
    CUDADeviceImageTy &CUDAImage = static_cast<CUDADeviceImageTy &>(Image);

    const char *GlobalName = DeviceGlobal.getName().data();

    size_t CUSize;
    CUdeviceptr CUPtr;
    CUresult Res =
        cuModuleGetGlobal(&CUPtr, &CUSize, CUDAImage.getModule(), GlobalName);
    if (auto Err = Plugin::check(Res, "Error in cuModuleGetGlobal for '%s': %s",
                                 GlobalName))
      return Err;

    if (CUSize != DeviceGlobal.getSize())
      return Plugin::error(
          "Failed to load global '%s' due to size mismatch (%zu != %zu)",
          GlobalName, CUSize, (size_t)DeviceGlobal.getSize());

    DeviceGlobal.setPtr(reinterpret_cast<void *>(CUPtr));
    return Plugin::success();
  }
};

/// Class implementing the CUDA-specific functionalities of the plugin.
struct CUDAPluginTy final : public GenericPluginTy {
  /// Create a CUDA plugin.
  CUDAPluginTy() : GenericPluginTy(getTripleArch()) {}

  /// This class should not be copied.
  CUDAPluginTy(const CUDAPluginTy &) = delete;
  CUDAPluginTy(CUDAPluginTy &&) = delete;

  /// Initialize the plugin and return the number of devices.
  Expected<int32_t> initImpl() override {
    CUresult Res = cuInit(0);
    if (Res == CUDA_ERROR_INVALID_HANDLE) {
      // Cannot call cuGetErrorString if dlsym failed.
      DP("Failed to load CUDA shared library\n");
      return 0;
    }

    if (Res == CUDA_ERROR_NO_DEVICE) {
      // Do not initialize if there are no devices.
      DP("There are no devices supporting CUDA.\n");
      return 0;
    }

    if (auto Err = Plugin::check(Res, "Error in cuInit: %s"))
      return std::move(Err);

    // Get the number of devices.
    int NumDevices;
    Res = cuDeviceGetCount(&NumDevices);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetCount: %s"))
      return std::move(Err);

    // Do not initialize if there are no devices.
    if (NumDevices == 0)
      DP("There are no devices supporting CUDA.\n");

    return NumDevices;
  }

  /// Deinitialize the plugin.
  Error deinitImpl() override { return Plugin::success(); }

  /// Creates a CUDA device to use for offloading.
  GenericDeviceTy *createDevice(GenericPluginTy &Plugin, int32_t DeviceId,
                                int32_t NumDevices) override {
    return new CUDADeviceTy(Plugin, DeviceId, NumDevices);
  }

  /// Creates a CUDA global handler.
  GenericGlobalHandlerTy *createGlobalHandler() override {
    return new CUDAGlobalHandlerTy();
  }

  /// Get the ELF code for recognizing the compatible image binary.
  uint16_t getMagicElfBits() const override { return ELF::EM_CUDA; }

  Triple::ArchType getTripleArch() const override {
    // TODO: I think we can drop the support for 32-bit NVPTX devices.
    return Triple::nvptx64;
  }

  const char *getName() const override { return GETNAME(TARGET_NAME); }

  /// Check whether the image is compatible with a CUDA device.
  Expected<bool> isELFCompatible(uint32_t DeviceId,
                                 StringRef Image) const override {
    auto ElfOrErr =
        ELF64LEObjectFile::create(MemoryBufferRef(Image, /*Identifier=*/""),
                                  /*InitContent=*/false);
    if (!ElfOrErr)
      return ElfOrErr.takeError();

    // Get the numeric value for the image's `sm_` value.
    auto SM = ElfOrErr->getPlatformFlags() & ELF::EF_CUDA_SM;

    CUdevice Device;
    CUresult Res = cuDeviceGet(&Device, DeviceId);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s"))
      return std::move(Err);

    int32_t Major, Minor;
    Res = cuDeviceGetAttribute(
        &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
      return std::move(Err);

    Res = cuDeviceGetAttribute(
        &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device);
    if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s"))
      return std::move(Err);

    int32_t ImageMajor = SM / 10;
    int32_t ImageMinor = SM % 10;

    // A cubin generated for a certain compute capability can run on any GPU
    // with the same major revision and an equal or higher minor revision.
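    // E.g., an sm_70 image is accepted on sm_70 and sm_75 devices but rejected
    // on sm_60 or sm_80 devices.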
    return Major == ImageMajor && Minor >= ImageMinor;
  }
};

Error CUDADeviceTy::dataExchangeImpl(const void *SrcPtr,
                                     GenericDeviceTy &DstGenericDevice,
                                     void *DstPtr, int64_t Size,
                                     AsyncInfoWrapperTy &AsyncInfoWrapper) {
  if (auto Err = setContext())
    return Err;

  CUDADeviceTy &DstDevice = static_cast<CUDADeviceTy &>(DstGenericDevice);

  CUresult Res;
  int32_t DstDeviceId = DstDevice.DeviceId;
  CUdeviceptr CUSrcPtr = (CUdeviceptr)SrcPtr;
  CUdeviceptr CUDstPtr = (CUdeviceptr)DstPtr;

  int CanAccessPeer = 0;
  if (DeviceId != DstDeviceId) {
    // Make sure the lock is released before performing the copies.
    std::lock_guard<std::mutex> Lock(PeerAccessesLock);

    switch (PeerAccesses[DstDeviceId]) {
    case PeerAccessState::AVAILABLE:
      CanAccessPeer = 1;
      break;
    case PeerAccessState::UNAVAILABLE:
      CanAccessPeer = 0;
      break;
    case PeerAccessState::PENDING:
      // Check whether the source device can access the destination device.
      Res = cuDeviceCanAccessPeer(&CanAccessPeer, Device, DstDevice.Device);
      if (auto Err = Plugin::check(Res, "Error in cuDeviceCanAccessPeer: %s"))
        return Err;

      if (CanAccessPeer) {
        Res = cuCtxEnablePeerAccess(DstDevice.Context, 0);
        if (Res == CUDA_ERROR_TOO_MANY_PEERS) {
          // Resources may be exhausted due to many P2P links.
          CanAccessPeer = 0;
          DP("Too many P2P links; falling back to D2D memcpy\n");
        } else if (auto Err =
                       Plugin::check(Res, "Error in cuCtxEnablePeerAccess: %s"))
          return Err;
      }
      PeerAccesses[DstDeviceId] = (CanAccessPeer)
                                      ? PeerAccessState::AVAILABLE
                                      : PeerAccessState::UNAVAILABLE;
    }
  }
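
  // Peer access is probed once per destination device: the PENDING state above
  // transitions to AVAILABLE or UNAVAILABLE on first use, and the cached
  // result is reused for every later exchange with that device.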

  CUstream Stream;
  if (auto Err = getStream(AsyncInfoWrapper, Stream))
    return Err;

  if (CanAccessPeer) {
    // TODO: Should we fallback to D2D if peer access fails?
    // cuMemcpyPeerAsync pairs each pointer with the context that owns it: the
    // destination pointer and context come first, then the source pair.
    Res = cuMemcpyPeerAsync(CUDstPtr, DstDevice.Context, CUSrcPtr, Context,
                            Size, Stream);
    return Plugin::check(Res, "Error in cuMemcpyPeerAsync: %s");
  }

  // Fallback to D2D copy.
  Res = cuMemcpyDtoDAsync(CUDstPtr, CUSrcPtr, Size, Stream);
  return Plugin::check(Res, "Error in cuMemcpyDtoDAsync: %s");
}

template <typename... ArgsTy>
static Error Plugin::check(int32_t Code, const char *ErrFmt, ArgsTy... Args) {
  CUresult ResultCode = static_cast<CUresult>(Code);
  if (ResultCode == CUDA_SUCCESS)
    return Error::success();

  const char *Desc = "Unknown error";
  CUresult Ret = cuGetErrorString(ResultCode, &Desc);
  if (Ret != CUDA_SUCCESS)
    REPORT("Unrecognized " GETNAME(TARGET_NAME) " error code %d\n", Code);

  return createStringError<ArgsTy..., const char *>(inconvertibleErrorCode(),
                                                     ErrFmt, Args..., Desc);
}
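
// Usage sketch: every CUDA driver call in this file funnels its CUresult
// through this helper; the trailing %s in the caller's format string receives
// the driver's human-readable error description, e.g.
//   Plugin::check(cuCtxSetCurrent(Ctx), "Error in cuCtxSetCurrent: %s");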

} // namespace plugin
} // namespace target
} // namespace omp
} // namespace llvm

extern "C" {
llvm::omp::target::plugin::GenericPluginTy *createPlugin_cuda() {
  return new llvm::omp::target::plugin::CUDAPluginTy();
}
}