This patch extends the `is_valid_binary` routine to also check if the binary's architecture string matches the one parsed from the runtime. This should allow us to only use the binary whose compute capability matches, allowing us to support basic multi-architecture binaries for CUDA. Depends on D127432 Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D127505
1859 lines
69 KiB
C++
1859 lines
69 KiB
C++
//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// RTL for CUDA machine
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cstddef>
|
|
#include <cuda.h>
|
|
#include <list>
|
|
#include <memory>
|
|
#include <mutex>
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
#include "Debug.h"
|
|
#include "DeviceEnvironment.h"
|
|
#include "omptarget.h"
|
|
#include "omptargetplugin.h"
|
|
|
|
#define TARGET_NAME CUDA
|
|
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
|
|
|
|
#include "MemoryManager.h"
|
|
|
|
#include "llvm/Frontend/OpenMP/OMPConstants.h"
|
|
|
|
// Utility for retrieving and printing CUDA error string.
|
|
#ifdef OMPTARGET_DEBUG
|
|
#define CUDA_ERR_STRING(err) \
|
|
do { \
|
|
if (getDebugLevel() > 0) { \
|
|
const char *errStr = nullptr; \
|
|
CUresult errStr_status = cuGetErrorString(err, &errStr); \
|
|
if (errStr_status == CUDA_ERROR_INVALID_VALUE) \
|
|
REPORT("Unrecognized CUDA error code: %d\n", err); \
|
|
else if (errStr_status == CUDA_SUCCESS) \
|
|
REPORT("CUDA error is: %s\n", errStr); \
|
|
else { \
|
|
REPORT("Unresolved CUDA error code: %d\n", err); \
|
|
REPORT("Unsuccessful cuGetErrorString return status: %d\n", \
|
|
errStr_status); \
|
|
} \
|
|
} else { \
|
|
const char *errStr = nullptr; \
|
|
CUresult errStr_status = cuGetErrorString(err, &errStr); \
|
|
if (errStr_status == CUDA_SUCCESS) \
|
|
REPORT("%s \n", errStr); \
|
|
} \
|
|
} while (false)
|
|
#else // OMPTARGET_DEBUG
|
|
#define CUDA_ERR_STRING(err) \
|
|
do { \
|
|
const char *errStr = nullptr; \
|
|
CUresult errStr_status = cuGetErrorString(err, &errStr); \
|
|
if (errStr_status == CUDA_SUCCESS) \
|
|
REPORT("%s \n", errStr); \
|
|
} while (false)
|
|
#endif // OMPTARGET_DEBUG
|
|
|
|
#define BOOL2TEXT(b) ((b) ? "Yes" : "No")
|
|
|
|
#include "elf_common.h"
|
|
|
|
/// Keep entries table per device.
|
|
struct FuncOrGblEntryTy {
|
|
__tgt_target_table Table;
|
|
std::vector<__tgt_offload_entry> Entries;
|
|
};
|
|
|
|
/// Use a single entity to encode a kernel and a set of flags.
|
|
struct KernelTy {
|
|
CUfunction Func;
|
|
|
|
// execution mode of kernel
|
|
llvm::omp::OMPTgtExecModeFlags ExecutionMode;
|
|
|
|
/// Maximal number of threads per block for this kernel.
|
|
int MaxThreadsPerBlock = 0;
|
|
|
|
KernelTy(CUfunction Func, llvm::omp::OMPTgtExecModeFlags ExecutionMode)
|
|
: Func(Func), ExecutionMode(ExecutionMode) {}
|
|
};
|
|
|
|
namespace {
|
|
bool checkResult(CUresult Err, const char *ErrMsg) {
|
|
if (Err == CUDA_SUCCESS)
|
|
return true;
|
|
|
|
REPORT("%s", ErrMsg);
|
|
CUDA_ERR_STRING(Err);
|
|
return false;
|
|
}
|
|
|
|
int memcpyDtoD(const void *SrcPtr, void *DstPtr, int64_t Size,
|
|
CUstream Stream) {
|
|
CUresult Err =
|
|
cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream);
|
|
|
|
if (Err != CUDA_SUCCESS) {
|
|
DP("Error when copying data from device to device. Pointers: src "
|
|
"= " DPxMOD ", dst = " DPxMOD ", size = %" PRId64 "\n",
|
|
DPxPTR(SrcPtr), DPxPTR(DstPtr), Size);
|
|
CUDA_ERR_STRING(Err);
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int recordEvent(void *EventPtr, __tgt_async_info *AsyncInfo) {
|
|
CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
|
|
CUevent Event = reinterpret_cast<CUevent>(EventPtr);
|
|
|
|
CUresult Err = cuEventRecord(Event, Stream);
|
|
if (Err != CUDA_SUCCESS) {
|
|
DP("Error when recording event. stream = " DPxMOD ", event = " DPxMOD "\n",
|
|
DPxPTR(Stream), DPxPTR(Event));
|
|
CUDA_ERR_STRING(Err);
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int syncEvent(void *EventPtr) {
|
|
CUevent Event = reinterpret_cast<CUevent>(EventPtr);
|
|
|
|
CUresult Err = cuEventSynchronize(Event);
|
|
if (Err != CUDA_SUCCESS) {
|
|
DP("Error when syncing event = " DPxMOD "\n", DPxPTR(Event));
|
|
CUDA_ERR_STRING(Err);
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
namespace {
|
|
|
|
// Structure contains per-device data
|
|
struct DeviceDataTy {
|
|
/// List that contains all the kernels.
|
|
std::list<KernelTy> KernelsList;
|
|
|
|
std::list<FuncOrGblEntryTy> FuncGblEntries;
|
|
|
|
CUcontext Context = nullptr;
|
|
// Device properties
|
|
unsigned int ThreadsPerBlock = 0;
|
|
unsigned int BlocksPerGrid = 0;
|
|
unsigned int WarpSize = 0;
|
|
// OpenMP properties
|
|
int NumTeams = 0;
|
|
int NumThreads = 0;
|
|
};
|
|
|
|
/// Resource allocator where \p T is the resource type.
|
|
/// Functions \p create and \p destroy return OFFLOAD_SUCCESS and OFFLOAD_FAIL
|
|
/// accordingly. The implementation should not raise any exception.
|
|
template <typename T> struct AllocatorTy {
|
|
using ElementTy = T;
|
|
virtual ~AllocatorTy() {}
|
|
|
|
/// Create a resource and assign to R.
|
|
virtual int create(T &R) noexcept = 0;
|
|
/// Destroy the resource.
|
|
virtual int destroy(T) noexcept = 0;
|
|
};
|
|
|
|
/// Allocator for CUstream.
|
|
struct StreamAllocatorTy final : public AllocatorTy<CUstream> {
|
|
/// See AllocatorTy<T>::create.
|
|
int create(CUstream &Stream) noexcept override {
|
|
if (!checkResult(cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING),
|
|
"Error returned from cuStreamCreate\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
/// See AllocatorTy<T>::destroy.
|
|
int destroy(CUstream Stream) noexcept override {
|
|
if (!checkResult(cuStreamDestroy(Stream),
|
|
"Error returned from cuStreamDestroy\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
};
|
|
|
|
/// Allocator for CUevent.
|
|
struct EventAllocatorTy final : public AllocatorTy<CUevent> {
|
|
/// See AllocatorTy<T>::create.
|
|
int create(CUevent &Event) noexcept override {
|
|
if (!checkResult(cuEventCreate(&Event, CU_EVENT_DEFAULT),
|
|
"Error returned from cuEventCreate\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
/// See AllocatorTy<T>::destroy.
|
|
int destroy(CUevent Event) noexcept override {
|
|
if (!checkResult(cuEventDestroy(Event),
|
|
"Error returned from cuEventDestroy\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
};
|
|
|
|
/// A generic pool of resources where \p T is the resource type.
|
|
/// \p T should be copyable as the object is stored in \p std::vector .
|
|
template <typename AllocTy> class ResourcePoolTy {
|
|
using ElementTy = typename AllocTy::ElementTy;
|
|
/// Index of the next available resource.
|
|
size_t Next = 0;
|
|
/// Mutex to guard the pool.
|
|
std::mutex Mutex;
|
|
/// Pool of resources. The difference between \p Resources and \p Pool is,
|
|
/// when a resource is acquired and released, it is all on \p Resources. When
|
|
/// a batch of new resources are needed, they are both added to \p Resources
|
|
/// and \p Pool. The reason for this setting is, \p Resources could contain
|
|
/// redundant elements because resources are not released, which can cause
|
|
/// double free. This setting makes sure that \p Pool always has every
|
|
/// resource allocated from the device.
|
|
std::vector<ElementTy> Resources;
|
|
std::vector<ElementTy> Pool;
|
|
/// A reference to the corresponding allocator.
|
|
AllocTy Allocator;
|
|
|
|
/// If `Resources` is used up, we will fill in more resources. It assumes that
|
|
/// the new size `Size` should be always larger than the current size.
|
|
bool resize(size_t Size) {
|
|
assert(Resources.size() == Pool.size() && "size mismatch");
|
|
auto CurSize = Resources.size();
|
|
assert(Size > CurSize && "Unexpected smaller size");
|
|
Pool.reserve(Size);
|
|
Resources.reserve(Size);
|
|
for (auto I = CurSize; I < Size; ++I) {
|
|
ElementTy NewItem;
|
|
int Ret = Allocator.create(NewItem);
|
|
if (Ret != OFFLOAD_SUCCESS)
|
|
return false;
|
|
Pool.push_back(NewItem);
|
|
Resources.push_back(NewItem);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
public:
|
|
ResourcePoolTy(AllocTy &&A, size_t Size = 0) noexcept
|
|
: Allocator(std::move(A)) {
|
|
if (Size)
|
|
(void)resize(Size);
|
|
}
|
|
|
|
~ResourcePoolTy() noexcept { clear(); }
|
|
|
|
/// Get a resource from pool. `Next` always points to the next available
|
|
/// resource. That means, `[0, next-1]` have been assigned, and `[id,]` are
|
|
/// still available. If there is no resource left, we will ask for more. Each
|
|
/// time a resource is assigned, the id will increase one.
|
|
/// xxxxxs+++++++++
|
|
/// ^
|
|
/// Next
|
|
/// After assignment, the pool becomes the following and s is assigned.
|
|
/// xxxxxs+++++++++
|
|
/// ^
|
|
/// Next
|
|
int acquire(ElementTy &R) noexcept {
|
|
std::lock_guard<std::mutex> LG(Mutex);
|
|
if (Next == Resources.size()) {
|
|
auto NewSize = Resources.size() ? Resources.size() * 2 : 1;
|
|
if (!resize(NewSize))
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
assert(Next < Resources.size());
|
|
|
|
R = Resources[Next++];
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
/// Return the resource back to the pool. When we return a resource, we need
|
|
/// to first decrease `Next`, and then copy the resource back. It is worth
|
|
/// noting that, the order of resources return might be different from that
|
|
/// they're assigned, that saying, at some point, there might be two identical
|
|
/// resources.
|
|
/// xxax+a+++++
|
|
/// ^
|
|
/// Next
|
|
/// However, it doesn't matter, because they're always on the two sides of
|
|
/// `Next`. The left one will in the end be overwritten by another resource.
|
|
/// Therefore, after several execution, the order of pool might be different
|
|
/// from its initial state.
|
|
void release(ElementTy R) noexcept {
|
|
std::lock_guard<std::mutex> LG(Mutex);
|
|
Resources[--Next] = R;
|
|
}
|
|
|
|
/// Released all stored resources and clear the pool.
|
|
/// Note: This function is not thread safe. Be sure to guard it if necessary.
|
|
void clear() noexcept {
|
|
for (auto &R : Pool)
|
|
(void)Allocator.destroy(R);
|
|
Pool.clear();
|
|
Resources.clear();
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
class DeviceRTLTy {
|
|
int NumberOfDevices;
|
|
// OpenMP environment properties
|
|
int EnvNumTeams;
|
|
int EnvTeamLimit;
|
|
int EnvTeamThreadLimit;
|
|
// OpenMP requires flags
|
|
int64_t RequiresFlags;
|
|
// Amount of dynamic shared memory to use at launch.
|
|
uint64_t DynamicMemorySize;
|
|
|
|
/// Number of initial streams for each device.
|
|
int NumInitialStreams = 32;
|
|
|
|
/// Number of initial events for each device.
|
|
int NumInitialEvents = 8;
|
|
|
|
static constexpr const int32_t HardThreadLimit = 1024;
|
|
static constexpr const int32_t DefaultNumTeams = 128;
|
|
static constexpr const int32_t DefaultNumThreads = 128;
|
|
|
|
using StreamPoolTy = ResourcePoolTy<StreamAllocatorTy>;
|
|
std::vector<std::unique_ptr<StreamPoolTy>> StreamPool;
|
|
|
|
using EventPoolTy = ResourcePoolTy<EventAllocatorTy>;
|
|
std::vector<std::unique_ptr<EventPoolTy>> EventPool;
|
|
|
|
std::vector<DeviceDataTy> DeviceData;
|
|
std::vector<std::vector<CUmodule>> Modules;
|
|
|
|
/// Vector of flags indicating the initalization status of all associated
|
|
/// devices.
|
|
std::vector<bool> InitializedFlags;
|
|
|
|
enum class PeerAccessState : uint8_t { Unkown, Yes, No };
|
|
std::vector<std::vector<PeerAccessState>> PeerAccessMatrix;
|
|
std::mutex PeerAccessMatrixLock;
|
|
|
|
/// A class responsible for interacting with device native runtime library to
|
|
/// allocate and free memory.
|
|
class CUDADeviceAllocatorTy : public DeviceAllocatorTy {
|
|
std::unordered_map<void *, TargetAllocTy> HostPinnedAllocs;
|
|
|
|
public:
|
|
void *allocate(size_t Size, void *, TargetAllocTy Kind) override {
|
|
if (Size == 0)
|
|
return nullptr;
|
|
|
|
void *MemAlloc = nullptr;
|
|
CUresult Err;
|
|
switch (Kind) {
|
|
case TARGET_ALLOC_DEFAULT:
|
|
case TARGET_ALLOC_DEVICE:
|
|
CUdeviceptr DevicePtr;
|
|
Err = cuMemAlloc(&DevicePtr, Size);
|
|
MemAlloc = (void *)DevicePtr;
|
|
if (!checkResult(Err, "Error returned from cuMemAlloc\n"))
|
|
return nullptr;
|
|
break;
|
|
case TARGET_ALLOC_HOST:
|
|
void *HostPtr;
|
|
Err = cuMemAllocHost(&HostPtr, Size);
|
|
MemAlloc = HostPtr;
|
|
if (!checkResult(Err, "Error returned from cuMemAllocHost\n"))
|
|
return nullptr;
|
|
HostPinnedAllocs[MemAlloc] = Kind;
|
|
break;
|
|
case TARGET_ALLOC_SHARED:
|
|
CUdeviceptr SharedPtr;
|
|
Err = cuMemAllocManaged(&SharedPtr, Size, CU_MEM_ATTACH_GLOBAL);
|
|
MemAlloc = (void *)SharedPtr;
|
|
if (!checkResult(Err, "Error returned from cuMemAllocManaged\n"))
|
|
return nullptr;
|
|
break;
|
|
}
|
|
|
|
return MemAlloc;
|
|
}
|
|
|
|
int free(void *TgtPtr) override {
|
|
CUresult Err;
|
|
// Host pinned memory must be freed differently.
|
|
TargetAllocTy Kind =
|
|
(HostPinnedAllocs.find(TgtPtr) == HostPinnedAllocs.end())
|
|
? TARGET_ALLOC_DEFAULT
|
|
: TARGET_ALLOC_HOST;
|
|
switch (Kind) {
|
|
case TARGET_ALLOC_DEFAULT:
|
|
case TARGET_ALLOC_DEVICE:
|
|
case TARGET_ALLOC_SHARED:
|
|
Err = cuMemFree((CUdeviceptr)TgtPtr);
|
|
if (!checkResult(Err, "Error returned from cuMemFree\n"))
|
|
return OFFLOAD_FAIL;
|
|
break;
|
|
case TARGET_ALLOC_HOST:
|
|
Err = cuMemFreeHost(TgtPtr);
|
|
if (!checkResult(Err, "Error returned from cuMemFreeHost\n"))
|
|
return OFFLOAD_FAIL;
|
|
break;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
};
|
|
|
|
/// A vector of device allocators
|
|
std::vector<CUDADeviceAllocatorTy> DeviceAllocators;
|
|
|
|
/// A vector of memory managers. Since the memory manager is non-copyable and
|
|
// non-removable, we wrap them into std::unique_ptr.
|
|
std::vector<std::unique_ptr<MemoryManagerTy>> MemoryManagers;
|
|
|
|
/// Whether use memory manager
|
|
bool UseMemoryManager = true;
|
|
|
|
// Record entry point associated with device
|
|
void addOffloadEntry(const int DeviceId, const __tgt_offload_entry Entry) {
|
|
FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
|
|
E.Entries.push_back(Entry);
|
|
}
|
|
|
|
// Return a pointer to the entry associated with the pointer
|
|
const __tgt_offload_entry *getOffloadEntry(const int DeviceId,
|
|
const void *Addr) const {
|
|
for (const __tgt_offload_entry &Itr :
|
|
DeviceData[DeviceId].FuncGblEntries.back().Entries)
|
|
if (Itr.addr == Addr)
|
|
return &Itr;
|
|
|
|
return nullptr;
|
|
}
|
|
|
|
// Return the pointer to the target entries table
|
|
__tgt_target_table *getOffloadEntriesTable(const int DeviceId) {
|
|
FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
|
|
|
|
if (E.Entries.empty())
|
|
return nullptr;
|
|
|
|
// Update table info according to the entries and return the pointer
|
|
E.Table.EntriesBegin = E.Entries.data();
|
|
E.Table.EntriesEnd = E.Entries.data() + E.Entries.size();
|
|
|
|
return &E.Table;
|
|
}
|
|
|
|
// Clear entries table for a device
|
|
void clearOffloadEntriesTable(const int DeviceId) {
|
|
DeviceData[DeviceId].FuncGblEntries.emplace_back();
|
|
FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back();
|
|
E.Entries.clear();
|
|
E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr;
|
|
}
|
|
|
|
public:
|
|
CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfo) const {
|
|
assert(AsyncInfo && "AsyncInfo is nullptr");
|
|
|
|
if (!AsyncInfo->Queue) {
|
|
CUstream S;
|
|
if (StreamPool[DeviceId]->acquire(S) != OFFLOAD_SUCCESS)
|
|
return nullptr;
|
|
|
|
AsyncInfo->Queue = S;
|
|
}
|
|
|
|
return reinterpret_cast<CUstream>(AsyncInfo->Queue);
|
|
}
|
|
|
|
// This class should not be copied
|
|
DeviceRTLTy(const DeviceRTLTy &) = delete;
|
|
DeviceRTLTy(DeviceRTLTy &&) = delete;
|
|
|
|
DeviceRTLTy()
|
|
: NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1),
|
|
EnvTeamThreadLimit(-1), RequiresFlags(OMP_REQ_UNDEFINED),
|
|
DynamicMemorySize(0) {
|
|
|
|
DP("Start initializing CUDA\n");
|
|
|
|
CUresult Err = cuInit(0);
|
|
if (Err == CUDA_ERROR_INVALID_HANDLE) {
|
|
// Can't call cuGetErrorString if dlsym failed
|
|
DP("Failed to load CUDA shared library\n");
|
|
return;
|
|
}
|
|
if (!checkResult(Err, "Error returned from cuInit\n")) {
|
|
return;
|
|
}
|
|
|
|
Err = cuDeviceGetCount(&NumberOfDevices);
|
|
if (!checkResult(Err, "Error returned from cuDeviceGetCount\n"))
|
|
return;
|
|
|
|
if (NumberOfDevices == 0) {
|
|
DP("There are no devices supporting CUDA.\n");
|
|
return;
|
|
}
|
|
|
|
DeviceData.resize(NumberOfDevices);
|
|
Modules.resize(NumberOfDevices);
|
|
StreamPool.resize(NumberOfDevices);
|
|
EventPool.resize(NumberOfDevices);
|
|
PeerAccessMatrix.resize(NumberOfDevices);
|
|
for (auto &V : PeerAccessMatrix)
|
|
V.resize(NumberOfDevices, PeerAccessState::Unkown);
|
|
|
|
// Get environment variables regarding teams
|
|
if (const char *EnvStr = getenv("OMP_TEAM_LIMIT")) {
|
|
// OMP_TEAM_LIMIT has been set
|
|
EnvTeamLimit = std::stoi(EnvStr);
|
|
DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit);
|
|
}
|
|
if (const char *EnvStr = getenv("OMP_TEAMS_THREAD_LIMIT")) {
|
|
// OMP_TEAMS_THREAD_LIMIT has been set
|
|
EnvTeamThreadLimit = std::stoi(EnvStr);
|
|
DP("Parsed OMP_TEAMS_THREAD_LIMIT=%d\n", EnvTeamThreadLimit);
|
|
}
|
|
if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) {
|
|
// OMP_NUM_TEAMS has been set
|
|
EnvNumTeams = std::stoi(EnvStr);
|
|
DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams);
|
|
}
|
|
if (const char *EnvStr = getenv("LIBOMPTARGET_SHARED_MEMORY_SIZE")) {
|
|
// LIBOMPTARGET_SHARED_MEMORY_SIZE has been set
|
|
DynamicMemorySize = std::stoi(EnvStr);
|
|
DP("Parsed LIBOMPTARGET_SHARED_MEMORY_SIZE = %" PRIu64 "\n",
|
|
DynamicMemorySize);
|
|
}
|
|
if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) {
|
|
// LIBOMPTARGET_NUM_INITIAL_STREAMS has been set
|
|
NumInitialStreams = std::stoi(EnvStr);
|
|
DP("Parsed LIBOMPTARGET_NUM_INITIAL_STREAMS=%d\n", NumInitialStreams);
|
|
}
|
|
|
|
for (int I = 0; I < NumberOfDevices; ++I)
|
|
DeviceAllocators.emplace_back();
|
|
|
|
// Get the size threshold from environment variable
|
|
std::pair<size_t, bool> Res = MemoryManagerTy::getSizeThresholdFromEnv();
|
|
UseMemoryManager = Res.second;
|
|
size_t MemoryManagerThreshold = Res.first;
|
|
|
|
if (UseMemoryManager)
|
|
for (int I = 0; I < NumberOfDevices; ++I)
|
|
MemoryManagers.emplace_back(std::make_unique<MemoryManagerTy>(
|
|
DeviceAllocators[I], MemoryManagerThreshold));
|
|
|
|
// We lazily initialize all devices later.
|
|
InitializedFlags.assign(NumberOfDevices, false);
|
|
}
|
|
|
|
~DeviceRTLTy() {
|
|
for (int DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId)
|
|
deinitDevice(DeviceId);
|
|
}
|
|
|
|
// Check whether a given DeviceId is valid
|
|
bool isValidDeviceId(const int DeviceId) const {
|
|
return DeviceId >= 0 && DeviceId < NumberOfDevices;
|
|
}
|
|
|
|
int getNumOfDevices() const { return NumberOfDevices; }
|
|
|
|
void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; }
|
|
|
|
int initDevice(const int DeviceId) {
|
|
CUdevice Device;
|
|
|
|
DP("Getting device %d\n", DeviceId);
|
|
CUresult Err = cuDeviceGet(&Device, DeviceId);
|
|
if (!checkResult(Err, "Error returned from cuDeviceGet\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
assert(InitializedFlags[DeviceId] == false && "Reinitializing device!");
|
|
InitializedFlags[DeviceId] = true;
|
|
|
|
// Query the current flags of the primary context and set its flags if
|
|
// it is inactive
|
|
unsigned int FormerPrimaryCtxFlags = 0;
|
|
int FormerPrimaryCtxIsActive = 0;
|
|
Err = cuDevicePrimaryCtxGetState(Device, &FormerPrimaryCtxFlags,
|
|
&FormerPrimaryCtxIsActive);
|
|
if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxGetState\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
if (FormerPrimaryCtxIsActive) {
|
|
DP("The primary context is active, no change to its flags\n");
|
|
if ((FormerPrimaryCtxFlags & CU_CTX_SCHED_MASK) !=
|
|
CU_CTX_SCHED_BLOCKING_SYNC)
|
|
DP("Warning the current flags are not CU_CTX_SCHED_BLOCKING_SYNC\n");
|
|
} else {
|
|
DP("The primary context is inactive, set its flags to "
|
|
"CU_CTX_SCHED_BLOCKING_SYNC\n");
|
|
Err = cuDevicePrimaryCtxSetFlags(Device, CU_CTX_SCHED_BLOCKING_SYNC);
|
|
if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxSetFlags\n"))
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
// Retain the per device primary context and save it to use whenever this
|
|
// device is selected.
|
|
Err = cuDevicePrimaryCtxRetain(&DeviceData[DeviceId].Context, Device);
|
|
if (!checkResult(Err, "Error returned from cuDevicePrimaryCtxRetain\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
|
|
if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
// Initialize the stream pool.
|
|
if (!StreamPool[DeviceId])
|
|
StreamPool[DeviceId] = std::make_unique<StreamPoolTy>(StreamAllocatorTy(),
|
|
NumInitialStreams);
|
|
|
|
// Initialize the event pool.
|
|
if (!EventPool[DeviceId])
|
|
EventPool[DeviceId] =
|
|
std::make_unique<EventPoolTy>(EventAllocatorTy(), NumInitialEvents);
|
|
|
|
// Query attributes to determine number of threads/block and blocks/grid.
|
|
int MaxGridDimX;
|
|
Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
|
|
Device);
|
|
if (Err != CUDA_SUCCESS) {
|
|
DP("Error getting max grid dimension, use default value %d\n",
|
|
DeviceRTLTy::DefaultNumTeams);
|
|
DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams;
|
|
} else {
|
|
DP("Using %d CUDA blocks per grid\n", MaxGridDimX);
|
|
DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX;
|
|
}
|
|
|
|
// We are only exploiting threads along the x axis.
|
|
int MaxBlockDimX;
|
|
Err = cuDeviceGetAttribute(&MaxBlockDimX,
|
|
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
|
|
if (Err != CUDA_SUCCESS) {
|
|
DP("Error getting max block dimension, use default value %d\n",
|
|
DeviceRTLTy::DefaultNumThreads);
|
|
DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
|
|
} else {
|
|
DP("Using %d CUDA threads per block\n", MaxBlockDimX);
|
|
DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
|
|
|
|
if (EnvTeamThreadLimit > 0 &&
|
|
DeviceData[DeviceId].ThreadsPerBlock > EnvTeamThreadLimit) {
|
|
DP("Max CUDA threads per block %d exceeds the thread limit %d set by "
|
|
"OMP_TEAMS_THREAD_LIMIT, capping at the limit\n",
|
|
DeviceData[DeviceId].ThreadsPerBlock, EnvTeamThreadLimit);
|
|
DeviceData[DeviceId].ThreadsPerBlock = EnvTeamThreadLimit;
|
|
}
|
|
if (DeviceData[DeviceId].ThreadsPerBlock > DeviceRTLTy::HardThreadLimit) {
|
|
DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
|
|
"capping at the hard limit\n",
|
|
DeviceData[DeviceId].ThreadsPerBlock, DeviceRTLTy::HardThreadLimit);
|
|
DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
|
|
}
|
|
}
|
|
|
|
// Get and set warp size
|
|
int WarpSize;
|
|
Err =
|
|
cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
|
|
if (Err != CUDA_SUCCESS) {
|
|
DP("Error getting warp size, assume default value 32\n");
|
|
DeviceData[DeviceId].WarpSize = 32;
|
|
} else {
|
|
DP("Using warp size %d\n", WarpSize);
|
|
DeviceData[DeviceId].WarpSize = WarpSize;
|
|
}
|
|
|
|
// Adjust teams to the env variables
|
|
if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
|
|
DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
|
|
EnvTeamLimit);
|
|
DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
|
|
}
|
|
|
|
size_t StackLimit;
|
|
size_t HeapLimit;
|
|
if (const char *EnvStr = getenv("LIBOMPTARGET_STACK_SIZE")) {
|
|
StackLimit = std::stol(EnvStr);
|
|
if (cuCtxSetLimit(CU_LIMIT_STACK_SIZE, StackLimit) != CUDA_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
} else {
|
|
if (cuCtxGetLimit(&StackLimit, CU_LIMIT_STACK_SIZE) != CUDA_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
if (const char *EnvStr = getenv("LIBOMPTARGET_HEAP_SIZE")) {
|
|
HeapLimit = std::stol(EnvStr);
|
|
if (cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, HeapLimit) != CUDA_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
} else {
|
|
if (cuCtxGetLimit(&HeapLimit, CU_LIMIT_MALLOC_HEAP_SIZE) != CUDA_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
|
|
"Device supports up to %d CUDA blocks and %d threads with a "
|
|
"warp size of %d\n",
|
|
DeviceData[DeviceId].BlocksPerGrid,
|
|
DeviceData[DeviceId].ThreadsPerBlock, DeviceData[DeviceId].WarpSize);
|
|
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
|
|
"Device heap size is %d Bytes, device stack size is %d Bytes per "
|
|
"thread\n",
|
|
(int)HeapLimit, (int)StackLimit);
|
|
|
|
// Set default number of teams
|
|
if (EnvNumTeams > 0) {
|
|
DP("Default number of teams set according to environment %d\n",
|
|
EnvNumTeams);
|
|
DeviceData[DeviceId].NumTeams = EnvNumTeams;
|
|
} else {
|
|
DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
|
|
DP("Default number of teams set according to library's default %d\n",
|
|
DeviceRTLTy::DefaultNumTeams);
|
|
}
|
|
|
|
if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
|
|
DP("Default number of teams exceeds device limit, capping at %d\n",
|
|
DeviceData[DeviceId].BlocksPerGrid);
|
|
DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
|
|
}
|
|
|
|
// Set default number of threads
|
|
DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
|
|
DP("Default number of threads set according to library's default %d\n",
|
|
DeviceRTLTy::DefaultNumThreads);
|
|
if (DeviceData[DeviceId].NumThreads >
|
|
DeviceData[DeviceId].ThreadsPerBlock) {
|
|
DP("Default number of threads exceeds device limit, capping at %d\n",
|
|
DeviceData[DeviceId].ThreadsPerBlock);
|
|
DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int deinitDevice(const int DeviceId) {
|
|
auto IsInitialized = InitializedFlags[DeviceId];
|
|
if (!IsInitialized)
|
|
return OFFLOAD_SUCCESS;
|
|
InitializedFlags[DeviceId] = false;
|
|
|
|
if (UseMemoryManager)
|
|
MemoryManagers[DeviceId].release();
|
|
|
|
StreamPool[DeviceId].reset();
|
|
EventPool[DeviceId].reset();
|
|
|
|
DeviceDataTy &D = DeviceData[DeviceId];
|
|
if (!checkResult(cuCtxSetCurrent(D.Context),
|
|
"Error returned from cuCtxSetCurrent\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
// Unload all modules.
|
|
for (auto &M : Modules[DeviceId])
|
|
if (!checkResult(cuModuleUnload(M),
|
|
"Error returned from cuModuleUnload\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
// Destroy context.
|
|
CUdevice Device;
|
|
if (!checkResult(cuCtxGetDevice(&Device),
|
|
"Error returned from cuCtxGetDevice\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
if (!checkResult(cuDevicePrimaryCtxRelease(Device),
|
|
"Error returned from cuDevicePrimaryCtxRelease\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
__tgt_target_table *loadBinary(const int DeviceId,
|
|
const __tgt_device_image *Image) {
|
|
// Clear the offload table as we are going to create a new one.
|
|
clearOffloadEntriesTable(DeviceId);
|
|
|
|
// Create the module and extract the function pointers.
|
|
CUmodule Module;
|
|
DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
|
|
CUresult Err =
|
|
cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
|
|
if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
|
|
return nullptr;
|
|
|
|
DP("CUDA module successfully loaded!\n");
|
|
|
|
Modules[DeviceId].push_back(Module);
|
|
|
|
// Find the symbols in the module by name.
|
|
const __tgt_offload_entry *HostBegin = Image->EntriesBegin;
|
|
const __tgt_offload_entry *HostEnd = Image->EntriesEnd;
|
|
|
|
std::list<KernelTy> &KernelsList = DeviceData[DeviceId].KernelsList;
|
|
for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
|
|
if (!E->addr) {
|
|
// We return nullptr when something like this happens, the host should
|
|
// have always something in the address to uniquely identify the target
|
|
// region.
|
|
DP("Invalid binary: host entry '<null>' (size = %zd)...\n", E->size);
|
|
return nullptr;
|
|
}
|
|
|
|
if (E->size) {
|
|
__tgt_offload_entry Entry = *E;
|
|
CUdeviceptr CUPtr;
|
|
size_t CUSize;
|
|
Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name);
|
|
// We keep this style here because we need the name
|
|
if (Err != CUDA_SUCCESS) {
|
|
REPORT("Loading global '%s' Failed\n", E->name);
|
|
CUDA_ERR_STRING(Err);
|
|
return nullptr;
|
|
}
|
|
|
|
if (CUSize != E->size) {
|
|
DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name,
|
|
CUSize, E->size);
|
|
return nullptr;
|
|
}
|
|
|
|
DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",
|
|
DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr));
|
|
|
|
Entry.addr = (void *)(CUPtr);
|
|
|
|
// Note: In the current implementation declare target variables
|
|
// can either be link or to. This means that once unified
|
|
// memory is activated via the requires directive, the variable
|
|
// can be used directly from the host in both cases.
|
|
// TODO: when variables types other than to or link are added,
|
|
// the below condition should be changed to explicitly
|
|
// check for to and link variables types:
|
|
// (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags &
|
|
// OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO))
|
|
if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
|
|
// If unified memory is present any target link or to variables
|
|
// can access host addresses directly. There is no longer a
|
|
// need for device copies.
|
|
cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *));
|
|
DP("Copy linked variable host address (" DPxMOD
|
|
") to device address (" DPxMOD ")\n",
|
|
DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr));
|
|
}
|
|
|
|
addOffloadEntry(DeviceId, Entry);
|
|
|
|
continue;
|
|
}
|
|
|
|
CUfunction Func;
|
|
Err = cuModuleGetFunction(&Func, Module, E->name);
|
|
// We keep this style here because we need the name
|
|
if (Err != CUDA_SUCCESS) {
|
|
REPORT("Loading '%s' Failed\n", E->name);
|
|
CUDA_ERR_STRING(Err);
|
|
return nullptr;
|
|
}
|
|
|
|
DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n",
|
|
DPxPTR(E - HostBegin), E->name, DPxPTR(Func));
|
|
|
|
// default value GENERIC (in case symbol is missing from cubin file)
|
|
llvm::omp::OMPTgtExecModeFlags ExecModeVal;
|
|
std::string ExecModeNameStr(E->name);
|
|
ExecModeNameStr += "_exec_mode";
|
|
const char *ExecModeName = ExecModeNameStr.c_str();
|
|
|
|
CUdeviceptr ExecModePtr;
|
|
size_t CUSize;
|
|
Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName);
|
|
if (Err == CUDA_SUCCESS) {
|
|
if (CUSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
|
|
DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n",
|
|
ExecModeName, CUSize, sizeof(llvm::omp::OMPTgtExecModeFlags));
|
|
return nullptr;
|
|
}
|
|
|
|
Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize);
|
|
if (Err != CUDA_SUCCESS) {
|
|
REPORT("Error when copying data from device to host. Pointers: "
|
|
"host = " DPxMOD ", device = " DPxMOD ", size = %zd\n",
|
|
DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize);
|
|
CUDA_ERR_STRING(Err);
|
|
return nullptr;
|
|
}
|
|
} else {
|
|
DP("Loading global exec_mode '%s' - symbol missing, using default "
|
|
"value GENERIC (1)\n",
|
|
ExecModeName);
|
|
}
|
|
|
|
KernelsList.emplace_back(Func, ExecModeVal);
|
|
|
|
__tgt_offload_entry Entry = *E;
|
|
Entry.addr = &KernelsList.back();
|
|
addOffloadEntry(DeviceId, Entry);
|
|
}
|
|
|
|
// send device environment data to the device
|
|
{
|
|
// TODO: The device ID used here is not the real device ID used by OpenMP.
|
|
DeviceEnvironmentTy DeviceEnv{0, static_cast<uint32_t>(NumberOfDevices),
|
|
static_cast<uint32_t>(DeviceId),
|
|
static_cast<uint32_t>(DynamicMemorySize)};
|
|
|
|
if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
|
|
DeviceEnv.DebugKind = std::stoi(EnvStr);
|
|
|
|
const char *DeviceEnvName = "omptarget_device_environment";
|
|
CUdeviceptr DeviceEnvPtr;
|
|
size_t CUSize;
|
|
|
|
Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName);
|
|
if (Err == CUDA_SUCCESS) {
|
|
if (CUSize != sizeof(DeviceEnv)) {
|
|
REPORT(
|
|
"Global device_environment '%s' - size mismatch (%zu != %zu)\n",
|
|
DeviceEnvName, CUSize, sizeof(int32_t));
|
|
CUDA_ERR_STRING(Err);
|
|
return nullptr;
|
|
}
|
|
|
|
Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize);
|
|
if (Err != CUDA_SUCCESS) {
|
|
REPORT("Error when copying data from host to device. Pointers: "
|
|
"host = " DPxMOD ", device = " DPxMOD ", size = %zu\n",
|
|
DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize);
|
|
CUDA_ERR_STRING(Err);
|
|
return nullptr;
|
|
}
|
|
|
|
DP("Sending global device environment data %zu bytes\n", CUSize);
|
|
} else {
|
|
DP("Finding global device environment '%s' - symbol missing.\n",
|
|
DeviceEnvName);
|
|
DP("Continue, considering this is a device RTL which does not accept "
|
|
"environment setting.\n");
|
|
}
|
|
}
|
|
|
|
return getOffloadEntriesTable(DeviceId);
|
|
}
|
|
|
|
void *dataAlloc(const int DeviceId, const int64_t Size,
|
|
const TargetAllocTy Kind) {
|
|
switch (Kind) {
|
|
case TARGET_ALLOC_DEFAULT:
|
|
case TARGET_ALLOC_DEVICE:
|
|
if (UseMemoryManager)
|
|
return MemoryManagers[DeviceId]->allocate(Size, nullptr);
|
|
else
|
|
return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
|
|
case TARGET_ALLOC_HOST:
|
|
case TARGET_ALLOC_SHARED:
|
|
return DeviceAllocators[DeviceId].allocate(Size, nullptr, Kind);
|
|
}
|
|
|
|
REPORT("Invalid target data allocation kind or requested allocator not "
|
|
"implemented yet\n");
|
|
|
|
return nullptr;
|
|
}
|
|
|
|
int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr,
|
|
const int64_t Size, __tgt_async_info *AsyncInfo) const {
|
|
assert(AsyncInfo && "AsyncInfo is nullptr");
|
|
|
|
CUstream Stream = getStream(DeviceId, AsyncInfo);
|
|
CUresult Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream);
|
|
if (Err != CUDA_SUCCESS) {
|
|
DP("Error when copying data from host to device. Pointers: host "
|
|
"= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
|
|
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
|
|
CUDA_ERR_STRING(Err);
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr,
|
|
const int64_t Size, __tgt_async_info *AsyncInfo) const {
|
|
assert(AsyncInfo && "AsyncInfo is nullptr");
|
|
|
|
CUstream Stream = getStream(DeviceId, AsyncInfo);
|
|
CUresult Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
|
|
if (Err != CUDA_SUCCESS) {
|
|
DP("Error when copying data from device to host. Pointers: host "
|
|
"= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 "\n",
|
|
DPxPTR(HstPtr), DPxPTR(TgtPtr), Size);
|
|
CUDA_ERR_STRING(Err);
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int dataExchange(int SrcDevId, const void *SrcPtr, int DstDevId, void *DstPtr,
|
|
int64_t Size, __tgt_async_info *AsyncInfo) {
|
|
assert(AsyncInfo && "AsyncInfo is nullptr");
|
|
|
|
CUresult Err;
|
|
CUstream Stream = getStream(SrcDevId, AsyncInfo);
|
|
|
|
// If they are two devices, we try peer to peer copy first
|
|
if (SrcDevId != DstDevId) {
|
|
std::lock_guard<std::mutex> LG(PeerAccessMatrixLock);
|
|
|
|
switch (PeerAccessMatrix[SrcDevId][DstDevId]) {
|
|
case PeerAccessState::No: {
|
|
REPORT("Peer access from %" PRId32 " to %" PRId32
|
|
" is not supported. Fall back to D2D memcpy.\n",
|
|
SrcDevId, DstDevId);
|
|
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
|
|
}
|
|
case PeerAccessState::Unkown: {
|
|
int CanAccessPeer = 0;
|
|
Err = cuDeviceCanAccessPeer(&CanAccessPeer, SrcDevId, DstDevId);
|
|
if (Err != CUDA_SUCCESS) {
|
|
REPORT("Error returned from cuDeviceCanAccessPeer. src = %" PRId32
|
|
", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
|
|
SrcDevId, DstDevId);
|
|
CUDA_ERR_STRING(Err);
|
|
PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
|
|
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
|
|
}
|
|
|
|
if (!CanAccessPeer) {
|
|
REPORT("P2P access from %d to %d is not supported. Fall back to D2D "
|
|
"memcpy.\n",
|
|
SrcDevId, DstDevId);
|
|
PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
|
|
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
|
|
}
|
|
|
|
Err = cuCtxEnablePeerAccess(DeviceData[DstDevId].Context, 0);
|
|
if (Err != CUDA_SUCCESS) {
|
|
REPORT("Error returned from cuCtxEnablePeerAccess. src = %" PRId32
|
|
", dst = %" PRId32 ". Fall back to D2D memcpy.\n",
|
|
SrcDevId, DstDevId);
|
|
CUDA_ERR_STRING(Err);
|
|
PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::No;
|
|
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
|
|
}
|
|
|
|
PeerAccessMatrix[SrcDevId][DstDevId] = PeerAccessState::Yes;
|
|
|
|
LLVM_FALLTHROUGH;
|
|
}
|
|
case PeerAccessState::Yes: {
|
|
Err = cuMemcpyPeerAsync(
|
|
(CUdeviceptr)DstPtr, DeviceData[DstDevId].Context,
|
|
(CUdeviceptr)SrcPtr, DeviceData[SrcDevId].Context, Size, Stream);
|
|
if (Err == CUDA_SUCCESS)
|
|
return OFFLOAD_SUCCESS;
|
|
|
|
DP("Error returned from cuMemcpyPeerAsync. src_ptr = " DPxMOD
|
|
", src_id =%" PRId32 ", dst_ptr = " DPxMOD ", dst_id =%" PRId32
|
|
". Fall back to D2D memcpy.\n",
|
|
DPxPTR(SrcPtr), SrcDevId, DPxPTR(DstPtr), DstDevId);
|
|
CUDA_ERR_STRING(Err);
|
|
|
|
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
|
|
}
|
|
}
|
|
}
|
|
|
|
return memcpyDtoD(SrcPtr, DstPtr, Size, Stream);
|
|
}
|
|
|
|
int dataDelete(const int DeviceId, void *TgtPtr) {
|
|
if (UseMemoryManager)
|
|
return MemoryManagers[DeviceId]->free(TgtPtr);
|
|
|
|
return DeviceAllocators[DeviceId].free(TgtPtr);
|
|
}
|
|
|
|
int runTargetTeamRegion(const int DeviceId, void *TgtEntryPtr, void **TgtArgs,
|
|
ptrdiff_t *TgtOffsets, const int ArgNum,
|
|
const int TeamNum, const int ThreadLimit,
|
|
const unsigned int LoopTripCount,
|
|
__tgt_async_info *AsyncInfo) const {
|
|
// All args are references.
|
|
std::vector<void *> Args(ArgNum);
|
|
std::vector<void *> Ptrs(ArgNum);
|
|
|
|
for (int I = 0; I < ArgNum; ++I) {
|
|
Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
|
|
Args[I] = &Ptrs[I];
|
|
}
|
|
|
|
KernelTy *KernelInfo = reinterpret_cast<KernelTy *>(TgtEntryPtr);
|
|
|
|
const bool IsSPMDGenericMode =
|
|
KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD;
|
|
const bool IsSPMDMode =
|
|
KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
|
|
const bool IsGenericMode =
|
|
KernelInfo->ExecutionMode == llvm::omp::OMP_TGT_EXEC_MODE_GENERIC;
|
|
|
|
int CudaThreadsPerBlock;
|
|
if (ThreadLimit > 0) {
|
|
DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
|
|
CudaThreadsPerBlock = ThreadLimit;
|
|
// Add master warp if necessary
|
|
if (IsGenericMode) {
|
|
DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
|
|
CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
|
|
}
|
|
} else {
|
|
DP("Setting CUDA threads per block to default %d\n",
|
|
DeviceData[DeviceId].NumThreads);
|
|
CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
|
|
}
|
|
|
|
if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
|
|
DP("Threads per block capped at device limit %d\n",
|
|
DeviceData[DeviceId].ThreadsPerBlock);
|
|
CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
|
|
}
|
|
|
|
CUresult Err;
|
|
if (!KernelInfo->MaxThreadsPerBlock) {
|
|
Err = cuFuncGetAttribute(&KernelInfo->MaxThreadsPerBlock,
|
|
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
|
|
KernelInfo->Func);
|
|
if (!checkResult(Err, "Error returned from cuFuncGetAttribute\n"))
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
if (KernelInfo->MaxThreadsPerBlock < CudaThreadsPerBlock) {
|
|
DP("Threads per block capped at kernel limit %d\n",
|
|
KernelInfo->MaxThreadsPerBlock);
|
|
CudaThreadsPerBlock = KernelInfo->MaxThreadsPerBlock;
|
|
}
|
|
|
|
unsigned int CudaBlocksPerGrid;
|
|
if (TeamNum <= 0) {
|
|
if (LoopTripCount > 0 && EnvNumTeams < 0) {
|
|
if (IsSPMDGenericMode) {
|
|
// If we reach this point, then we are executing a kernel that was
|
|
// transformed from Generic-mode to SPMD-mode. This kernel has
|
|
// SPMD-mode execution, but needs its blocks to be scheduled
|
|
// differently because the current loop trip count only applies to the
|
|
// `teams distribute` region and will create var too few blocks using
|
|
// the regular SPMD-mode method.
|
|
CudaBlocksPerGrid = LoopTripCount;
|
|
} else if (IsSPMDMode) {
|
|
// We have a combined construct, i.e. `target teams distribute
|
|
// parallel for [simd]`. We launch so many teams so that each thread
|
|
// will execute one iteration of the loop. round up to the nearest
|
|
// integer
|
|
CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
|
|
} else if (IsGenericMode) {
|
|
// If we reach this point, then we have a non-combined construct, i.e.
|
|
// `teams distribute` with a nested `parallel for` and each team is
|
|
// assigned one iteration of the `distribute` loop. E.g.:
|
|
//
|
|
// #pragma omp target teams distribute
|
|
// for(...loop_tripcount...) {
|
|
// #pragma omp parallel for
|
|
// for(...) {}
|
|
// }
|
|
//
|
|
// Threads within a team will execute the iterations of the `parallel`
|
|
// loop.
|
|
CudaBlocksPerGrid = LoopTripCount;
|
|
} else {
|
|
REPORT("Unknown execution mode: %d\n",
|
|
static_cast<int8_t>(KernelInfo->ExecutionMode));
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
DP("Using %d teams due to loop trip count %" PRIu32
|
|
" and number of threads per block %d\n",
|
|
CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
|
|
} else {
|
|
DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
|
|
CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
|
|
}
|
|
} else {
|
|
DP("Using requested number of teams %d\n", TeamNum);
|
|
CudaBlocksPerGrid = TeamNum;
|
|
}
|
|
|
|
if (CudaBlocksPerGrid > DeviceData[DeviceId].BlocksPerGrid) {
|
|
DP("Capping number of teams to team limit %d\n",
|
|
DeviceData[DeviceId].BlocksPerGrid);
|
|
CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
|
|
}
|
|
|
|
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
|
|
"Launching kernel %s with %d blocks and %d threads in %s mode\n",
|
|
(getOffloadEntry(DeviceId, TgtEntryPtr))
|
|
? getOffloadEntry(DeviceId, TgtEntryPtr)->name
|
|
: "(null)",
|
|
CudaBlocksPerGrid, CudaThreadsPerBlock,
|
|
(!IsSPMDMode ? (IsGenericMode ? "Generic" : "SPMD-Generic") : "SPMD"));
|
|
|
|
CUstream Stream = getStream(DeviceId, AsyncInfo);
|
|
Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1,
|
|
/* gridDimZ */ 1, CudaThreadsPerBlock,
|
|
/* blockDimY */ 1, /* blockDimZ */ 1,
|
|
DynamicMemorySize, Stream, &Args[0], nullptr);
|
|
if (!checkResult(Err, "Error returned from cuLaunchKernel\n"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
DP("Launch of entry point at " DPxMOD " successful!\n",
|
|
DPxPTR(TgtEntryPtr));
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int synchronize(const int DeviceId, __tgt_async_info *AsyncInfo) const {
|
|
CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo->Queue);
|
|
CUresult Err = cuStreamSynchronize(Stream);
|
|
|
|
// Once the stream is synchronized, return it to stream pool and reset
|
|
// AsyncInfo. This is to make sure the synchronization only works for its
|
|
// own tasks.
|
|
StreamPool[DeviceId]->release(reinterpret_cast<CUstream>(AsyncInfo->Queue));
|
|
AsyncInfo->Queue = nullptr;
|
|
|
|
if (Err != CUDA_SUCCESS) {
|
|
DP("Error when synchronizing stream. stream = " DPxMOD
|
|
", async info ptr = " DPxMOD "\n",
|
|
DPxPTR(Stream), DPxPTR(AsyncInfo));
|
|
CUDA_ERR_STRING(Err);
|
|
}
|
|
return (Err == CUDA_SUCCESS) ? OFFLOAD_SUCCESS : OFFLOAD_FAIL;
|
|
}
|
|
|
|
void printDeviceInfo(int32_t DeviceId) {
|
|
char TmpChar[1000];
|
|
std::string TmpStr;
|
|
size_t TmpSt;
|
|
int TmpInt, TmpInt2, TmpInt3;
|
|
|
|
CUdevice Device;
|
|
checkResult(cuDeviceGet(&Device, DeviceId),
|
|
"Error returned from cuCtxGetDevice\n");
|
|
|
|
cuDriverGetVersion(&TmpInt);
|
|
printf(" CUDA Driver Version: \t\t%d \n", TmpInt);
|
|
printf(" CUDA Device Number: \t\t%d \n", DeviceId);
|
|
checkResult(cuDeviceGetName(TmpChar, 1000, Device),
|
|
"Error returned from cuDeviceGetName\n");
|
|
printf(" Device Name: \t\t\t%s \n", TmpChar);
|
|
checkResult(cuDeviceTotalMem(&TmpSt, Device),
|
|
"Error returned from cuDeviceTotalMem\n");
|
|
printf(" Global Memory Size: \t\t%zu bytes \n", TmpSt);
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Number of Multiprocessors: \t\t%d \n", TmpInt);
|
|
checkResult(
|
|
cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Concurrent Copy and Execution: \t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Total Constant Memory: \t\t%d bytes\n", TmpInt);
|
|
checkResult(
|
|
cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Max Shared Memory per Block: \t%d bytes \n", TmpInt);
|
|
checkResult(
|
|
cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Registers per Block: \t\t%d \n", TmpInt);
|
|
checkResult(
|
|
cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Warp Size: \t\t\t\t%d Threads \n", TmpInt);
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Maximum Threads per Block: \t\t%d \n", TmpInt);
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Maximum Block Dimensions: \t\t%d, %d, %d \n", TmpInt, TmpInt2,
|
|
TmpInt3);
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt2, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt3, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Maximum Grid Dimensions: \t\t%d x %d x %d \n", TmpInt, TmpInt2,
|
|
TmpInt3);
|
|
checkResult(
|
|
cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_PITCH, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Maximum Memory Pitch: \t\t%d bytes \n", TmpInt);
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Texture Alignment: \t\t\t%d bytes \n", TmpInt);
|
|
checkResult(
|
|
cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Clock Rate: \t\t\t%d kHz\n", TmpInt);
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Execution Timeout: \t\t\t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(
|
|
cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_INTEGRATED, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Integrated Device: \t\t\t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Can Map Host Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(
|
|
cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
if (TmpInt == CU_COMPUTEMODE_DEFAULT)
|
|
TmpStr = "DEFAULT";
|
|
else if (TmpInt == CU_COMPUTEMODE_PROHIBITED)
|
|
TmpStr = "PROHIBITED";
|
|
else if (TmpInt == CU_COMPUTEMODE_EXCLUSIVE_PROCESS)
|
|
TmpStr = "EXCLUSIVE PROCESS";
|
|
else
|
|
TmpStr = "unknown";
|
|
printf(" Compute Mode: \t\t\t%s \n", TmpStr.c_str());
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Concurrent Kernels: \t\t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(
|
|
cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" ECC Enabled: \t\t\t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Memory Clock Rate: \t\t\t%d kHz\n", TmpInt);
|
|
checkResult(
|
|
cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Memory Bus Width: \t\t\t%d bits\n", TmpInt);
|
|
checkResult(cuDeviceGetAttribute(&TmpInt, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE,
|
|
Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" L2 Cache Size: \t\t\t%d bytes \n", TmpInt);
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
|
|
Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Max Threads Per SMP: \t\t%d \n", TmpInt);
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Async Engines: \t\t\t%s (%d) \n", BOOL2TEXT(TmpInt), TmpInt);
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Unified Addressing: \t\t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Managed Memory: \t\t\t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(
|
|
cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Concurrent Managed Memory: \t\t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(
|
|
cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Preemption Supported: \t\t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Cooperative Launch: \t\t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Multi-Device Boars: \t\t%s \n", BOOL2TEXT(TmpInt));
|
|
checkResult(
|
|
cuDeviceGetAttribute(
|
|
&TmpInt, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
checkResult(
|
|
cuDeviceGetAttribute(
|
|
&TmpInt2, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device),
|
|
"Error returned from cuDeviceGetAttribute\n");
|
|
printf(" Compute Capabilities: \t\t%d%d \n", TmpInt, TmpInt2);
|
|
}
|
|
|
|
int createEvent(int DeviceId, void **P) {
|
|
CUevent Event = nullptr;
|
|
if (EventPool[DeviceId]->acquire(Event) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
*P = Event;
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int destroyEvent(int DeviceId, void *EventPtr) {
|
|
EventPool[DeviceId]->release(reinterpret_cast<CUevent>(EventPtr));
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int waitEvent(const int DeviceId, __tgt_async_info *AsyncInfo,
|
|
void *EventPtr) const {
|
|
CUstream Stream = getStream(DeviceId, AsyncInfo);
|
|
CUevent Event = reinterpret_cast<CUevent>(EventPtr);
|
|
|
|
// We don't use CU_EVENT_WAIT_DEFAULT here as it is only available from
|
|
// specific CUDA version, and defined as 0x0. In previous version, per CUDA
|
|
// API document, that argument has to be 0x0.
|
|
CUresult Err = cuStreamWaitEvent(Stream, Event, 0);
|
|
if (Err != CUDA_SUCCESS) {
|
|
DP("Error when waiting event. stream = " DPxMOD ", event = " DPxMOD "\n",
|
|
DPxPTR(Stream), DPxPTR(Event));
|
|
CUDA_ERR_STRING(Err);
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int releaseAsyncInfo(int DeviceId, __tgt_async_info *AsyncInfo) const {
|
|
if (AsyncInfo->Queue) {
|
|
StreamPool[DeviceId]->release(
|
|
reinterpret_cast<CUstream>(AsyncInfo->Queue));
|
|
AsyncInfo->Queue = nullptr;
|
|
}
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int initAsyncInfo(int DeviceId, __tgt_async_info **AsyncInfo) const {
|
|
*AsyncInfo = new __tgt_async_info;
|
|
getStream(DeviceId, *AsyncInfo);
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int initDeviceInfo(int DeviceId, __tgt_device_info *DeviceInfo,
|
|
const char **ErrStr) const {
|
|
assert(DeviceInfo && "DeviceInfo is nullptr");
|
|
|
|
if (!DeviceInfo->Context)
|
|
DeviceInfo->Context = DeviceData[DeviceId].Context;
|
|
if (!DeviceInfo->Device) {
|
|
CUdevice Dev;
|
|
CUresult Err = cuDeviceGet(&Dev, DeviceId);
|
|
if (Err == CUDA_SUCCESS) {
|
|
DeviceInfo->Device = reinterpret_cast<void *>(Dev);
|
|
} else {
|
|
cuGetErrorString(Err, ErrStr);
|
|
return OFFLOAD_FAIL;
|
|
}
|
|
}
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
|
|
int setContext(int DeviceId) {
|
|
assert(InitializedFlags[DeviceId] && "Device is not initialized");
|
|
|
|
CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
|
|
if (!checkResult(Err, "error returned from cuCtxSetCurrent"))
|
|
return OFFLOAD_FAIL;
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
}
|
|
};
|
|
|
|
DeviceRTLTy DeviceRTL;
|
|
} // namespace
|
|
|
|
// Exposed library API function
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
|
|
return elf_check_machine(Image, /* EM_CUDA */ 190);
|
|
}
|
|
|
|
int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
|
|
__tgt_image_info *info) {
|
|
if (!__tgt_rtl_is_valid_binary(image))
|
|
return false;
|
|
|
|
// A subarchitecture was not specified. Assume it is compatible.
|
|
if (!info->Arch)
|
|
return true;
|
|
|
|
int32_t NumberOfDevices = 0;
|
|
if (cuDeviceGetCount(&NumberOfDevices) != CUDA_SUCCESS)
|
|
return false;
|
|
|
|
for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
|
|
CUdevice Device;
|
|
if (cuDeviceGet(&Device, DeviceId) != CUDA_SUCCESS)
|
|
return false;
|
|
|
|
int32_t Major, Minor;
|
|
if (cuDeviceGetAttribute(&Major,
|
|
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
|
|
Device) != CUDA_SUCCESS)
|
|
return false;
|
|
if (cuDeviceGetAttribute(&Minor,
|
|
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
|
|
Device) != CUDA_SUCCESS)
|
|
return false;
|
|
|
|
std::string ArchStr = "sm_" + std::to_string(Major) + std::to_string(Minor);
|
|
if (ArchStr != info->Arch)
|
|
return false;
|
|
}
|
|
|
|
DP("Image has compatible compute capability: %s\n", info->Arch);
|
|
return true;
|
|
}
|
|
|
|
int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); }
|
|
|
|
int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
|
|
DP("Init requires flags to %" PRId64 "\n", RequiresFlags);
|
|
DeviceRTL.setRequiresFlag(RequiresFlags);
|
|
return RequiresFlags;
|
|
}
|
|
|
|
int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDevId, int DstDevId) {
|
|
if (DeviceRTL.isValidDeviceId(SrcDevId) &&
|
|
DeviceRTL.isValidDeviceId(DstDevId))
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
int32_t __tgt_rtl_init_device(int32_t DeviceId) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
// Context is set when init the device.
|
|
|
|
return DeviceRTL.initDevice(DeviceId);
|
|
}
|
|
|
|
int32_t __tgt_rtl_deinit_device(int32_t DeviceId) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
// Context is set when deinit the device.
|
|
|
|
return DeviceRTL.deinitDevice(DeviceId);
|
|
}
|
|
|
|
__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
|
|
__tgt_device_image *Image) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
|
|
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return nullptr;
|
|
|
|
return DeviceRTL.loadBinary(DeviceId, Image);
|
|
}
|
|
|
|
void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *,
|
|
int32_t Kind) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
|
|
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return nullptr;
|
|
|
|
return DeviceRTL.dataAlloc(DeviceId, Size, (TargetAllocTy)Kind);
|
|
}
|
|
|
|
int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
|
|
int64_t Size) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
// Context is set in __tgt_rtl_data_submit_async.
|
|
|
|
__tgt_async_info AsyncInfo;
|
|
const int32_t Rc =
|
|
__tgt_rtl_data_submit_async(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
|
|
if (Rc != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
|
|
}
|
|
|
|
int32_t __tgt_rtl_data_submit_async(int32_t DeviceId, void *TgtPtr,
|
|
void *HstPtr, int64_t Size,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
assert(AsyncInfoPtr && "async_info_ptr is nullptr");
|
|
|
|
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return DeviceRTL.dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfoPtr);
|
|
}
|
|
|
|
int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
|
|
int64_t Size) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
// Context is set in __tgt_rtl_data_retrieve_async.
|
|
|
|
__tgt_async_info AsyncInfo;
|
|
const int32_t Rc =
|
|
__tgt_rtl_data_retrieve_async(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
|
|
if (Rc != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
|
|
}
|
|
|
|
int32_t __tgt_rtl_data_retrieve_async(int32_t DeviceId, void *HstPtr,
|
|
void *TgtPtr, int64_t Size,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
assert(AsyncInfoPtr && "async_info_ptr is nullptr");
|
|
|
|
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return DeviceRTL.dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfoPtr);
|
|
}
|
|
|
|
int32_t __tgt_rtl_data_exchange_async(int32_t SrcDevId, void *SrcPtr,
|
|
int DstDevId, void *DstPtr, int64_t Size,
|
|
__tgt_async_info *AsyncInfo) {
|
|
assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
|
|
assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
|
|
assert(AsyncInfo && "AsyncInfo is nullptr");
|
|
|
|
if (DeviceRTL.setContext(SrcDevId) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return DeviceRTL.dataExchange(SrcDevId, SrcPtr, DstDevId, DstPtr, Size,
|
|
AsyncInfo);
|
|
}
|
|
|
|
int32_t __tgt_rtl_data_exchange(int32_t SrcDevId, void *SrcPtr,
|
|
int32_t DstDevId, void *DstPtr, int64_t Size) {
|
|
assert(DeviceRTL.isValidDeviceId(SrcDevId) && "src_dev_id is invalid");
|
|
assert(DeviceRTL.isValidDeviceId(DstDevId) && "dst_dev_id is invalid");
|
|
// Context is set in __tgt_rtl_data_exchange_async.
|
|
|
|
__tgt_async_info AsyncInfo;
|
|
const int32_t Rc = __tgt_rtl_data_exchange_async(SrcDevId, SrcPtr, DstDevId,
|
|
DstPtr, Size, &AsyncInfo);
|
|
if (Rc != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return __tgt_rtl_synchronize(SrcDevId, &AsyncInfo);
|
|
}
|
|
|
|
int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
|
|
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return DeviceRTL.dataDelete(DeviceId, TgtPtr);
|
|
}
|
|
|
|
int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
|
|
void **TgtArgs, ptrdiff_t *TgtOffsets,
|
|
int32_t ArgNum, int32_t TeamNum,
|
|
int32_t ThreadLimit,
|
|
uint64_t LoopTripcount) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
// Context is set in __tgt_rtl_run_target_team_region_async.
|
|
|
|
__tgt_async_info AsyncInfo;
|
|
const int32_t Rc = __tgt_rtl_run_target_team_region_async(
|
|
DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, TeamNum, ThreadLimit,
|
|
LoopTripcount, &AsyncInfo);
|
|
if (Rc != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
|
|
}
|
|
|
|
int32_t __tgt_rtl_run_target_team_region_async(
|
|
int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
|
|
int32_t ArgNum, int32_t TeamNum, int32_t ThreadLimit,
|
|
uint64_t LoopTripcount, __tgt_async_info *AsyncInfoPtr) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
|
|
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return DeviceRTL.runTargetTeamRegion(DeviceId, TgtEntryPtr, TgtArgs,
|
|
TgtOffsets, ArgNum, TeamNum, ThreadLimit,
|
|
LoopTripcount, AsyncInfoPtr);
|
|
}
|
|
|
|
int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
|
|
void **TgtArgs, ptrdiff_t *TgtOffsets,
|
|
int32_t ArgNum) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
// Context is set in __tgt_rtl_run_target_region_async.
|
|
|
|
__tgt_async_info AsyncInfo;
|
|
const int32_t Rc = __tgt_rtl_run_target_region_async(
|
|
DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum, &AsyncInfo);
|
|
if (Rc != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
|
|
}
|
|
|
|
int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
|
|
void **TgtArgs, ptrdiff_t *TgtOffsets,
|
|
int32_t ArgNum,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
// Context is set in __tgt_rtl_run_target_team_region_async.
|
|
return __tgt_rtl_run_target_team_region_async(
|
|
DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, ArgNum,
|
|
/* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
|
|
AsyncInfoPtr);
|
|
}
|
|
|
|
int32_t __tgt_rtl_synchronize(int32_t DeviceId,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
assert(AsyncInfoPtr && "async_info_ptr is nullptr");
|
|
assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
|
|
// NOTE: We don't need to set context for stream sync.
|
|
return DeviceRTL.synchronize(DeviceId, AsyncInfoPtr);
|
|
}
|
|
|
|
void __tgt_rtl_set_info_flag(uint32_t NewInfoLevel) {
|
|
std::atomic<uint32_t> &InfoLevel = getInfoLevelInternal();
|
|
InfoLevel.store(NewInfoLevel);
|
|
}
|
|
|
|
void __tgt_rtl_print_device_info(int32_t DeviceId) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
// NOTE: We don't need to set context for print device info.
|
|
DeviceRTL.printDeviceInfo(DeviceId);
|
|
}
|
|
|
|
int32_t __tgt_rtl_create_event(int32_t DeviceId, void **Event) {
|
|
assert(Event && "event is nullptr");
|
|
|
|
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return DeviceRTL.createEvent(DeviceId, Event);
|
|
}
|
|
|
|
int32_t __tgt_rtl_record_event(int32_t DeviceId, void *EventPtr,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
assert(AsyncInfoPtr && "async_info_ptr is nullptr");
|
|
assert(AsyncInfoPtr->Queue && "async_info_ptr->Queue is nullptr");
|
|
assert(EventPtr && "event_ptr is nullptr");
|
|
// NOTE: We might not need to set context for event record.
|
|
return recordEvent(EventPtr, AsyncInfoPtr);
|
|
}
|
|
|
|
int32_t __tgt_rtl_wait_event(int32_t DeviceId, void *EventPtr,
|
|
__tgt_async_info *AsyncInfoPtr) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
assert(AsyncInfoPtr && "async_info_ptr is nullptr");
|
|
assert(EventPtr && "event is nullptr");
|
|
// If we don't have a queue we need to set the context.
|
|
if (!AsyncInfoPtr->Queue && DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
return DeviceRTL.waitEvent(DeviceId, AsyncInfoPtr, EventPtr);
|
|
}
|
|
|
|
int32_t __tgt_rtl_sync_event(int32_t DeviceId, void *EventPtr) {
|
|
assert(EventPtr && "event is nullptr");
|
|
// NOTE: We might not need to set context for event sync.
|
|
return syncEvent(EventPtr);
|
|
}
|
|
|
|
int32_t __tgt_rtl_destroy_event(int32_t DeviceId, void *EventPtr) {
|
|
assert(EventPtr && "event is nullptr");
|
|
|
|
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return DeviceRTL.destroyEvent(DeviceId, EventPtr);
|
|
}
|
|
|
|
int32_t __tgt_rtl_release_async_info(int32_t DeviceId,
|
|
__tgt_async_info *AsyncInfo) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
assert(AsyncInfo && "async_info is nullptr");
|
|
|
|
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return DeviceRTL.releaseAsyncInfo(DeviceId, AsyncInfo);
|
|
}
|
|
|
|
int32_t __tgt_rtl_init_async_info(int32_t DeviceId,
|
|
__tgt_async_info **AsyncInfo) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
assert(AsyncInfo && "async_info is nullptr");
|
|
|
|
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return DeviceRTL.initAsyncInfo(DeviceId, AsyncInfo);
|
|
}
|
|
|
|
int32_t __tgt_rtl_init_device_info(int32_t DeviceId,
|
|
__tgt_device_info *DeviceInfoPtr,
|
|
const char **ErrStr) {
|
|
assert(DeviceRTL.isValidDeviceId(DeviceId) && "device_id is invalid");
|
|
assert(DeviceInfoPtr && "device_info_ptr is nullptr");
|
|
|
|
if (DeviceRTL.setContext(DeviceId) != OFFLOAD_SUCCESS)
|
|
return OFFLOAD_FAIL;
|
|
|
|
return DeviceRTL.initDeviceInfo(DeviceId, DeviceInfoPtr, ErrStr);
|
|
}
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|