[Offload][NFC] Fix typos discovered by codespell (#125119)
https://github.com/codespell-project/codespell
% `codespell --ignore-words-list=archtype,hsa,identty,inout,iself,nd,te,ths,vertexes --write-changes`
parent 55be370f37 commit 1f56bb3137
@@ -27,7 +27,7 @@ uint32_t getNumDevices();
 /// Return the device number in the system for omp_get_device_num.
 uint32_t getDeviceNum();
 
-/// Return the user choosen debug level.
+/// Return the user chosen debug level.
 uint32_t getDebugKind();
 
 /// Return if teams oversubscription is assumed
@@ -63,7 +63,7 @@ LaneMaskTy activemask();
 /// Return a mask describing all threads with a smaller Id in the warp.
 LaneMaskTy lanemaskLT();
 
-/// Return a mask describing all threads with a larget Id in the warp.
+/// Return a mask describing all threads with a larger Id in the warp.
 LaneMaskTy lanemaskGT();
 
 /// Return the thread Id in the warp, in [0, getWarpSize()).
@@ -33,7 +33,7 @@ namespace memory {
 /// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
 void *allocShared(uint64_t Size, const char *Reason);
 
-/// Free \p Ptr, alloated via allocShared, for \p Reason.
+/// Free \p Ptr, allocated via allocShared, for \p Reason.
 ///
 /// Note: See the restrictions on __kmpc_free_shared for proper usage.
 void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
@@ -44,7 +44,7 @@ void *allocGlobal(uint64_t Size, const char *Reason);
 /// Return a pointer to the dynamic shared memory buffer.
 void *getDynamicBuffer();
 
-/// Free \p Ptr, alloated via allocGlobal, for \p Reason.
+/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
 void freeGlobal(void *Ptr, const char *Reason);
 
 } // namespace memory
@@ -365,7 +365,7 @@ inline state::Value<uint32_t, state::VK_Level> Level;
 /// The `active-level` describes which of the parallel level counted with the
 /// `level-var` is active. There can only be one.
 ///
-/// active-level-var is 1, if ActiveLevelVar is not 0, otherweise it is 0.
+/// active-level-var is 1, if ActiveLevelVar is not 0, otherwise it is 0.
 inline state::Value<uint32_t, state::VK_ActiveLevel> ActiveLevel;
 
 /// TODO
@@ -22,7 +22,7 @@ namespace atomic {
 
 enum OrderingTy {
   relaxed = __ATOMIC_RELAXED,
-  aquire = __ATOMIC_ACQUIRE,
+  acquire = __ATOMIC_ACQUIRE,
   release = __ATOMIC_RELEASE,
   acq_rel = __ATOMIC_ACQ_REL,
   seq_cst = __ATOMIC_SEQ_CST,
@@ -27,7 +27,7 @@ using namespace ompx;
     0;
 [[gnu::weak]] extern const uint32_t __omp_rtl_assume_teams_oversubscription = 0;
 
-// This variable should be visibile to the plugin so we override the default
+// This variable should be visible to the plugin so we override the default
 // hidden visibility.
 [[gnu::used, gnu::retain, gnu::weak,
   gnu::visibility("protected")]] DeviceEnvironmentTy
@@ -33,7 +33,7 @@ double getWTime();
 
 double getWTick() {
   // The number of ticks per second for the AMDGPU clock varies by card and can
-  // only be retrived by querying the driver. We rely on the device environment
+  // only be retrieved by querying the driver. We rely on the device environment
   // to inform us what the proper frequency is.
   return 1.0 / config::getClockFrequency();
 }
@@ -206,7 +206,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
   // to the number of slots in the buffer.
   bool IsMaster = (ThreadId == 0);
   while (IsMaster) {
-    Bound = atomic::load(&IterCnt, atomic::aquire);
+    Bound = atomic::load(&IterCnt, atomic::acquire);
     if (TeamId < Bound + num_of_records)
       break;
   }
@@ -259,7 +259,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
   unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
   if (ChunkTeamCount == NumTeams - Bound - 1) {
     // Ensure we see the global memory writes by other teams
-    fence::kernel(atomic::aquire);
+    fence::kernel(atomic::acquire);
 
     //
     // Last team processing.
@@ -84,7 +84,7 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
   default:
     __builtin_unreachable();
     Case(atomic::relaxed);
-    Case(atomic::aquire);
+    Case(atomic::acquire);
     Case(atomic::release);
     Case(atomic::acq_rel);
     Case(atomic::seq_cst);
@@ -107,7 +107,7 @@ void namedBarrier() {
   uint32_t WarpSize = mapping::getWarpSize();
   uint32_t NumWaves = NumThreads / WarpSize;
 
-  fence::team(atomic::aquire);
+  fence::team(atomic::acquire);
 
   // named barrier implementation for amdgcn.
   // Uses two 16 bit unsigned counters. One for the number of waves to have
@@ -172,7 +172,7 @@ void syncThreads(atomic::OrderingTy Ordering) {
   __builtin_amdgcn_s_barrier();
 
   if (Ordering != atomic::relaxed)
-    fenceTeam(Ordering == atomic::acq_rel ? atomic::aquire : atomic::seq_cst);
+    fenceTeam(Ordering == atomic::acq_rel ? atomic::acquire : atomic::seq_cst);
 }
 void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
 
@@ -198,7 +198,7 @@ void setCriticalLock(omp_lock_t *Lock) {
         !cas((uint32_t *)Lock, UNSET, SET, atomic::relaxed, atomic::relaxed)) {
       __builtin_amdgcn_s_sleep(32);
     }
-    fenceKernel(atomic::aquire);
+    fenceKernel(atomic::acquire);
   }
 }
 
@@ -79,7 +79,7 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
     lb = lb + entityId * chunk;
     T inputUb = ub;
     ub = lb + chunk - 1; // Clang uses i <= ub
-    // Say ub' is the begining of the last chunk. Then who ever has a
+    // Say ub' is the beginning of the last chunk. Then who ever has a
     // lower bound plus a multiple of the increment equal to ub' is
     // the last one.
     T beginingLastChunk = inputUb - (inputUb % chunk);
@@ -806,7 +806,7 @@ public:
                               NumIters, OneIterationPerThread);
   }
 
-  /// Worksharing `distrbute`-loop.
+  /// Worksharing `distribute`-loop.
   static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg,
                          Ty NumIters, Ty BlockChunk) {
     ASSERT(icv::Level == 0, "Bad distribute");
@@ -853,7 +853,7 @@ public:
     ASSERT(state::ParallelTeamSize == 1, "Bad distribute");
   }
 
-  /// Worksharing `distrbute parallel for`-loop.
+  /// Worksharing `distribute parallel for`-loop.
   static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *),
                             void *Arg, Ty NumIters, Ty NumThreads,
                             Ty BlockChunk, Ty ThreadChunk) {
@@ -56,7 +56,7 @@ FOREACH_OMPT_EMI_EVENT(declareOmptCallback)
 
 /// This function will call an OpenMP API function. Which in turn will lookup a
 /// given enum value of type \p ompt_callbacks_t and copy the address of the
-/// corresponding callback funtion into the provided pointer.
+/// corresponding callback function into the provided pointer.
 /// The pointer to the runtime function is passed during 'initializeLibrary'.
 /// \p which the enum value of the requested callback function
 /// \p callback the destination pointer where the address shall be copied
@@ -77,7 +77,7 @@ struct PluginManager {
   /// Iterate over all device images registered with this plugin.
   auto deviceImages() { return llvm::make_pointee_range(DeviceImages); }
 
-  /// Translation table retreived from the binary
+  /// Translation table retrieved from the binary
   HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
   std::mutex TrlTblMtx; ///< For Translation Table
   /// Host offload entries in order of image registration
@@ -171,7 +171,7 @@ private:
   /// Devices associated with plugins, accesses to the container are exclusive.
   ProtectedObj<DeviceContainerTy> Devices;
 
-  /// References to upgraded legacy offloading entires.
+  /// References to upgraded legacy offloading entries.
   std::list<llvm::SmallVector<llvm::offloading::EntryTy, 0>> LegacyEntries;
   std::list<llvm::SmallVector<__tgt_device_image, 0>> LegacyImages;
   llvm::DenseMap<__tgt_bin_desc *, __tgt_bin_desc> UpgradedDescriptors;
@@ -134,7 +134,7 @@ struct DeviceTy {
   int32_t recordEvent(void *Event, AsyncInfoTy &AsyncInfo);
 
   /// Wait for an event. This function can be blocking or non-blocking,
-  /// depending on the implmentation. It is expected to set a dependence on the
+  /// depending on the implementation. It is expected to set a dependence on the
   /// event such that corresponding operations shall only start once the event
   /// is fulfilled.
   int32_t waitEvent(void *Event, AsyncInfoTy &AsyncInfo);
@@ -42,7 +42,7 @@ enum __tgt_target_return_t : int {
   OMP_TGT_SUCCESS = 0,
   /// offload may not execute on the requested target device
   /// this scenario can be caused by the device not available or unsupported
-  /// as described in the Execution Model in the specifcation
+  /// as described in the Execution Model in the specification
   /// this status may not be used for target device execution failure
   /// which should be handled internally in libomptarget
   OMP_TGT_FAIL = ~0
@@ -36,7 +36,7 @@ class IsPointerType<string Type> {
   bit ret = !ne(!find(Type, "*", !sub(!size(Type), 1)), -1);
 }
 
-// Describes the valid range of a pointer parameter that reperesents an array
+// Describes the valid range of a pointer parameter that represents an array
 class Range<string Begin, string End> {
   string begin = Begin;
   string end = End;
@@ -62,7 +62,7 @@ which preserves ABI compatibility with C.
 Represents a C-style enum. Contains a list of `etor` values, which have a name
 and description.
 
-A `TaggedEtor` record type also exists which addtionally takes a type. This type
+A `TaggedEtor` record type also exists which additionally takes a type. This type
 is used when the enum is used as a parameter to a function with a type-tagged
 function parameter (e.g. `olGetDeviceInfo`).
 
@@ -36,7 +36,7 @@ OffloadConfig &offloadConfig() {
   return Config;
 }
 
-// Pull in the declarations for the implementation funtions. The actual entry
+// Pull in the declarations for the implementation functions. The actual entry
 // points in this file wrap these.
 #include "OffloadImplFuncDecls.inc"
 
@@ -436,7 +436,7 @@ private:
   /// have more previously allocated buffers.
   void *allocate(size_t Size, void *HstPtr, TargetAllocTy Kind) override;
 
-  /// Deallocation callack that will be called by the memory manager.
+  /// Deallocation callback that will be called by the memory manager.
   int free(void *TgtPtr, TargetAllocTy Kind) override {
     if (auto Err = MemoryPool->deallocate(TgtPtr)) {
       consumeError(std::move(Err));
@@ -493,7 +493,7 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
   }
 
 private:
-  /// The exectuable loaded on the agent.
+  /// The executable loaded on the agent.
   hsa_executable_t Executable;
   StringMap<offloading::amdgpu::AMDGPUKernelMetaData> KernelInfoMap;
   uint16_t ELFABIVersion;
@@ -876,7 +876,7 @@ private:
     hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
   }
 
-  /// Callack that will be called when an error is detected on the HSA queue.
+  /// Callback that will be called when an error is detected on the HSA queue.
   static void callbackError(hsa_status_t Status, hsa_queue_t *Source,
                             void *Data);
 
@@ -932,7 +932,7 @@ private:
   /// operation's output signal is set to the consumed slot's signal. If there
   /// is a previous asynchronous operation on the previous slot, the HSA async
   /// operation's input signal is set to the signal of the previous slot. This
-  /// way, we obtain a chain of dependant async operations. The action is a
+  /// way, we obtain a chain of dependent async operations. The action is a
   /// function that will be executed eventually after the operation is
   /// completed, e.g., for releasing a buffer.
   struct StreamSlotTy {
@@ -1055,10 +1055,10 @@ private:
   /// Timeout hint for HSA actively waiting for signal value to change
   const uint64_t StreamBusyWaitMicroseconds;
 
-  /// Indicate to spread data transfers across all avilable SDMAs
+  /// Indicate to spread data transfers across all available SDMAs
   bool UseMultipleSdmaEngines;
 
-  /// Return the current number of asychronous operations on the stream.
+  /// Return the current number of asynchronous operations on the stream.
   uint32_t size() const { return NextSlot; }
 
   /// Return the last valid slot on the stream.
@@ -1155,12 +1155,12 @@ private:
     // changes on the slot.
     std::atomic_thread_fence(std::memory_order_acquire);
 
-    // Peform the operation.
+    // Perform the operation.
     if (auto Err = Slot->performAction())
-      FATAL_MESSAGE(1, "Error peforming post action: %s",
+      FATAL_MESSAGE(1, "Error performing post action: %s",
                     toString(std::move(Err)).data());
 
-    // Signal the output signal to notify the asycnhronous operation finalized.
+    // Signal the output signal to notify the asynchronous operation finalized.
     Slot->Signal->signal();
 
     // Unregister callback.
@@ -1183,9 +1183,9 @@ private:
   /// action. There are two kinds of memory buffers:
   /// 1. For kernel arguments. This buffer can be freed after receiving the
   ///    kernel completion signal.
-  /// 2. For H2D tranfers that need pinned memory space for staging. This
+  /// 2. For H2D transfers that need pinned memory space for staging. This
   ///    buffer can be freed after receiving the transfer completion signal.
-  /// 3. For D2H tranfers that need pinned memory space for staging. This
+  /// 3. For D2H transfers that need pinned memory space for staging. This
   ///    buffer cannot be freed after receiving the transfer completion signal
   ///    because of the following asynchronous H2H callback.
   ///    For this reason, This action can only be taken at
@@ -1222,7 +1222,7 @@ public:
   /// Create an empty stream associated with a specific device.
   AMDGPUStreamTy(AMDGPUDeviceTy &Device);
 
-  /// Intialize the stream's signals.
+  /// Initialize the stream's signals.
   Error init() { return Plugin::success(); }
 
   /// Deinitialize the stream's signals.
@@ -1312,7 +1312,7 @@ public:
   /// Push an asynchronous memory copy device-to-host involving an unpinned
   /// memory buffer. The operation consists of a two-step copy from the
   /// device buffer to an intermediate pinned host buffer, and then, to a
-  /// unpinned host buffer. Both operations are asynchronous and dependant.
+  /// unpinned host buffer. Both operations are asynchronous and dependent.
   /// The intermediate pinned buffer will be released to the specified memory
   /// manager once the operation completes.
   Error pushMemoryCopyD2HAsync(void *Dst, const void *Src, void *Inter,
@@ -1374,7 +1374,7 @@ public:
   /// Push an asynchronous memory copy host-to-device involving an unpinned
   /// memory buffer. The operation consists of a two-step copy from the
   /// unpinned host buffer to an intermediate pinned host buffer, and then, to
-  /// the pinned host buffer. Both operations are asynchronous and dependant.
+  /// the pinned host buffer. Both operations are asynchronous and dependent.
   /// The intermediate pinned buffer will be released to the specified memory
   /// manager once the operation completes.
   Error pushMemoryCopyH2DAsync(void *Dst, const void *Src, void *Inter,
@@ -1672,7 +1672,7 @@ struct AMDGPUStreamManagerTy final
   }
 
 private:
-  /// Search for and assign an prefereably idle queue to the given Stream. If
+  /// Search for and assign an preferably idle queue to the given Stream. If
   /// there is no queue without current users, choose the queue with the lowest
   /// user count. If utilization is ignored: use round robin selection.
   inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
@@ -1856,13 +1856,13 @@ struct AMDHostDeviceTy : public AMDGenericDeviceTy {
   /// Get a memory pool for fine-grained allocations.
   AMDGPUMemoryPoolTy &getFineGrainedMemoryPool() {
     assert(!FineGrainedMemoryPools.empty() && "No fine-grained mempool");
-    // Retrive any memory pool.
+    // Retrieve any memory pool.
     return *FineGrainedMemoryPools[0];
   }
 
   AMDGPUMemoryPoolTy &getCoarseGrainedMemoryPool() {
     assert(!CoarseGrainedMemoryPools.empty() && "No coarse-grained mempool");
-    // Retrive any memory pool.
+    // Retrieve any memory pool.
     return *CoarseGrainedMemoryPools[0];
   }
 
@@ -1937,7 +1937,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
                          ClockFrequency) != HSA_STATUS_SUCCESS)
       ClockFrequency = 0;
 
-    // Load the grid values dependending on the wavefront.
+    // Load the grid values depending on the wavefront.
     if (WavefrontSize == 32)
       GridValues = getAMDGPUGridValues<32>();
     else if (WavefrontSize == 64)
@@ -2097,7 +2097,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     std::string LLDPath = ErrorOrPath.get();
     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, getDeviceId(),
-         "Using `%s` to link JITed amdgcn ouput.", LLDPath.c_str());
+         "Using `%s` to link JITed amdgcn output.", LLDPath.c_str());
 
     std::string MCPU = "-plugin-opt=mcpu=" + getComputeUnitKind();
     StringRef Args[] = {LLDPath,
@@ -2158,15 +2158,15 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   }
 
   /// We want to set up the RPC server for host services to the GPU if it is
-  /// availible.
+  /// available.
   bool shouldSetupRPCServer() const override { return true; }
 
-  /// The RPC interface should have enough space for all availible parallelism.
+  /// The RPC interface should have enough space for all available parallelism.
   uint64_t requestedRPCPortCount() const override {
     return getHardwareParallelism();
   }
 
-  /// Get the stream of the asynchronous info sructure or get a new one.
+  /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper,
                   AMDGPUStreamTy *&Stream) {
     // Get the stream (if any) from the async info.
@@ -2716,7 +2716,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
       Status =
          Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, TmpBool);
       if (Status == HSA_STATUS_SUCCESS)
-        Info.add<InfoLevel3>("Accessable by all", TmpBool);
+        Info.add<InfoLevel3>("Accessible by all", TmpBool);
     }
 
     Info.add("ISAs");
@@ -2895,7 +2895,7 @@ private:
 
   /// Envar specifying the maximum size in bytes where the memory copies are
   /// asynchronous operations. Up to this transfer size, the memory copies are
-  /// asychronous operations pushed to the corresponding stream. For larger
+  /// asynchronous operations pushed to the corresponding stream. For larger
   /// transfers, they are synchronous transfers.
   UInt32Envar OMPX_MaxAsyncCopyBytes;
 
@@ -21,7 +21,7 @@ if (NOT LLVM_LINK_LLVM_DYLIB)
   endforeach()
 endif()
 
-# Include the RPC server from the `libc` project if availible.
+# Include the RPC server from the `libc` project if available.
 include(FindLibcCommonUtils)
 target_link_libraries(PluginCommon PRIVATE llvm-libc-common-utilities)
 if(TARGET llvmlibc_rpc_server AND ${LIBOMPTARGET_GPU_LIBC_SUPPORT})
@@ -1,4 +1,4 @@
-//===- GlobalHandler.h - Target independent global & enviroment handling --===//
+//===- GlobalHandler.h - Target independent global & environment handling -===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -100,7 +100,7 @@ public:
 
 /// Helper class to do the heavy lifting when it comes to moving globals between
 /// host and device. Through the GenericDeviceTy we access memcpy DtoH and HtoD,
-/// which means the only things specialized by the subclass is the retrival of
+/// which means the only things specialized by the subclass is the retrieval of
 /// global metadata (size, addr) from the device.
 /// \see getGlobalMetadataFromDevice
 class GenericGlobalHandlerTy {
|
@ -59,7 +59,7 @@ private:
|
|||||||
/// Compile the bitcode image \p Image and generate the binary image that can
|
/// Compile the bitcode image \p Image and generate the binary image that can
|
||||||
/// be loaded to the target device of the triple \p Triple architecture \p
|
/// be loaded to the target device of the triple \p Triple architecture \p
|
||||||
/// MCpu. \p PostProcessing will be called after codegen to handle cases such
|
/// MCpu. \p PostProcessing will be called after codegen to handle cases such
|
||||||
/// as assember as an external tool.
|
/// as assembler as an external tool.
|
||||||
Expected<const __tgt_device_image *>
|
Expected<const __tgt_device_image *>
|
||||||
compile(const __tgt_device_image &Image, const std::string &ComputeUnitKind,
|
compile(const __tgt_device_image &Image, const std::string &ComputeUnitKind,
|
||||||
PostProcessingFn PostProcessing);
|
PostProcessingFn PostProcessing);
|
||||||
|
@ -92,14 +92,14 @@ struct AsyncInfoWrapperTy {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Synchronize with the __tgt_async_info's pending operations if it's the
|
/// Synchronize with the __tgt_async_info's pending operations if it's the
|
||||||
/// internal async info. The error associated to the aysnchronous operations
|
/// internal async info. The error associated to the asynchronous operations
|
||||||
/// issued in this queue must be provided in \p Err. This function will update
|
/// issued in this queue must be provided in \p Err. This function will update
|
||||||
/// the error parameter with the result of the synchronization if it was
|
/// the error parameter with the result of the synchronization if it was
|
||||||
/// actually executed. This function must be called before destroying the
|
/// actually executed. This function must be called before destroying the
|
||||||
/// object and only once.
|
/// object and only once.
|
||||||
void finalize(Error &Err);
|
void finalize(Error &Err);
|
||||||
|
|
||||||
/// Register \p Ptr as an associated alloction that is freed after
|
/// Register \p Ptr as an associated allocation that is freed after
|
||||||
/// finalization.
|
/// finalization.
|
||||||
void freeAllocationAfterSynchronization(void *Ptr) {
|
void freeAllocationAfterSynchronization(void *Ptr) {
|
||||||
AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
|
AsyncInfoPtr->AssociatedAllocations.push_back(Ptr);
|
||||||
@@ -456,7 +456,7 @@ private:
 };
 
 /// Class representing a map of host pinned allocations. We track these pinned
-/// allocations, so memory tranfers invloving these buffers can be optimized.
+/// allocations, so memory transfers involving these buffers can be optimized.
 class PinnedAllocationMapTy {
 
   /// Struct representing a map entry.
@@ -482,7 +482,7 @@ class PinnedAllocationMapTy {
     /// becomes zero.
     mutable size_t References;
 
-    /// Create an entry with the host and device acessible pointers, the buffer
+    /// Create an entry with the host and device accessible pointers, the buffer
     /// size, and a boolean indicating whether the buffer was locked externally.
     EntryTy(void *HstPtr, void *DevAccessiblePtr, size_t Size,
             bool ExternallyLocked)
@@ -517,7 +517,7 @@ class PinnedAllocationMapTy {
   /// Indicate whether mapped host buffers should be locked automatically.
   bool LockMappedBuffers;
 
-  /// Indicate whether failures when locking mapped buffers should be ingored.
+  /// Indicate whether failures when locking mapped buffers should be ignored.
   bool IgnoreLockMappedFailures;
 
   /// Find an allocation that intersects with \p HstPtr pointer. Assume the
@@ -1122,7 +1122,7 @@ struct GenericPluginTy {
   /// Get the reference to the device with a certain device id.
   GenericDeviceTy &getDevice(int32_t DeviceId) {
     assert(isValidDeviceId(DeviceId) && "Invalid device id");
-    assert(Devices[DeviceId] && "Device is unitialized");
+    assert(Devices[DeviceId] && "Device is uninitialized");
 
     return *Devices[DeviceId];
   }
@@ -1270,7 +1270,7 @@ public:
   int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
                         int64_t Size);
 
-  /// Copy data from the given device asynchornously.
+  /// Copy data from the given device asynchronously.
   int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr,
                               int64_t Size, __tgt_async_info *AsyncInfoPtr);
 
@@ -1308,7 +1308,7 @@ public:
   int32_t wait_event(int32_t DeviceId, void *EventPtr,
                      __tgt_async_info *AsyncInfoPtr);
 
-  /// Syncrhonize execution until an event is done.
+  /// Synchronize execution until an event is done.
   int32_t sync_event(int32_t DeviceId, void *EventPtr);
 
   /// Remove the event from the plugin.
@@ -1327,7 +1327,7 @@ public:
   /// Sets the offset into the devices for use by OMPT.
   int32_t set_device_identifier(int32_t UserId, int32_t DeviceId);
 
-  /// Returns if the plugin can support auotmatic copy.
+  /// Returns if the plugin can support automatic copy.
   int32_t use_auto_zero_copy(int32_t DeviceId);
 
   /// Look up a global symbol in the given binary.
@@ -34,7 +34,7 @@ class DeviceImageTy;
 } // namespace plugin
 
 /// A generic class implementing the interface between the RPC server provided
-/// by the 'libc' project and 'libomptarget'. If the RPC server is not availible
+/// by the 'libc' project and 'libomptarget'. If the RPC server is not available
 /// these routines will perform no action.
 struct RPCServerTy {
 public:
@@ -48,7 +48,7 @@ public:
   llvm::Error startThread();
 
   /// Check if this device image is using an RPC server. This checks for the
-  /// precense of an externally visible symbol in the device image that will
+  /// presence of an externally visible symbol in the device image that will
   /// be present whenever RPC code is called.
   llvm::Expected<bool> isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
                                         plugin::GenericGlobalHandlerTy &Handler,
@@ -67,7 +67,7 @@ Error GenericGlobalHandlerTy::moveGlobalBetweenDeviceAndHost(
     return Err;
   }
 
-  DP("Succesfully %s %u bytes associated with global symbol '%s' %s the "
+  DP("Successfully %s %u bytes associated with global symbol '%s' %s the "
      "device "
     "(%p -> %p).\n",
     Device2Host ? "read" : "write", HostGlobal.getSize(),
@@ -738,7 +738,7 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
       OMPX_SharedMemorySize("LIBOMPTARGET_SHARED_MEMORY_SIZE"),
       // Do not initialize the following two envars since they depend on the
       // device initialization. These cannot be consulted until the device is
-      // initialized correctly. We intialize them in GenericDeviceTy::init().
+      // initialized correctly. We initialize them in GenericDeviceTy::init().
       OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
       // By default, the initial number of streams and events is 1.
       OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
@@ -1040,7 +1040,7 @@ Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
 
 Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
                                       DeviceImageTy &Image) {
-  // The plugin either does not need an RPC server or it is unavailible.
+  // The plugin either does not need an RPC server or it is unavailable.
   if (!shouldSetupRPCServer())
     return Plugin::success();
 
@@ -1325,16 +1325,16 @@ Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
 }
 
 Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
-  return Plugin::error("Device does not suppport VA Management");
+  return Plugin::error("Device does not support VA Management");
 }
 
 Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
-  return Plugin::error("Device does not suppport VA Management");
+  return Plugin::error("Device does not support VA Management");
 }
 
 Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
   return Plugin::error(
-      "Mising getDeviceMemorySize impelmentation (required by RR-heuristic");
+      "Missing getDeviceMemorySize implementation (required by RR-heuristic");
 }
 
 Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
@@ -1814,7 +1814,7 @@ int32_t GenericPluginTy::initialize_record_replay(int32_t DeviceId,
 
   if (auto Err = RecordReplay->init(&Device, MemorySize, VAddr, Status,
                                     SaveOutput, ReqPtrArgOffset)) {
-    REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
+    REPORT("WARNING RR did not initialize RR-properly with %lu bytes"
            "(Error: %s)\n",
           MemorySize, toString(std::move(Err)).data());
     RecordReplay->setStatus(RecordReplayTy::RRStatusTy::RRDeactivated);
@@ -1961,7 +1961,7 @@ int32_t GenericPluginTy::data_retrieve_async(int32_t DeviceId, void *HstPtr,
   auto Err =
       getDevice(DeviceId).dataRetrieve(HstPtr, TgtPtr, Size, AsyncInfoPtr);
   if (Err) {
-    REPORT("Faliure to copy data from device to host. Pointers: host "
+    REPORT("Failure to copy data from device to host. Pointers: host "
           "= " DPxMOD ", device = " DPxMOD ", size = %" PRId64 ": %s\n",
          DPxPTR(HstPtr), DPxPTR(TgtPtr), Size,
          toString(std::move(Err)).data());
@@ -395,7 +395,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
 
   virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
                                        DeviceImageTy &Image) override {
-    // Check for the presense of global destructors at initialization time. This
+    // Check for the presence of global destructors at initialization time. This
     // is required when the image may be deallocated before destructors are run.
     GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
     if (Handler.isSymbolInImage(*this, Image, "nvptx$device$fini"))
@@ -495,15 +495,15 @@ struct CUDADeviceTy : public GenericDeviceTy {
   }
 
   /// We want to set up the RPC server for host services to the GPU if it is
-  /// availible.
+  /// available.
   bool shouldSetupRPCServer() const override { return true; }
 
-  /// The RPC interface should have enough space for all availible parallelism.
+  /// The RPC interface should have enough space for all available parallelism.
   uint64_t requestedRPCPortCount() const override {
     return getHardwareParallelism();
   }
 
-  /// Get the stream of the asynchronous info sructure or get a new one.
+  /// Get the stream of the asynchronous info structure or get a new one.
   Error getStream(AsyncInfoWrapperTy &AsyncInfoWrapper, CUstream &Stream) {
     // Get the stream (if any) from the async info.
     Stream = AsyncInfoWrapper.getQueueAs<CUstream>();
@@ -675,7 +675,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (Size >= Free) {
       *Addr = nullptr;
       return Plugin::error(
-          "Canot map memory size larger than the available device memory");
+          "Cannot map memory size larger than the available device memory");
     }
 
     // currently NVidia only supports pinned device types
@@ -38,8 +38,8 @@ typedef enum {
   FFI_BAD_ARGTYPE
 } ffi_status;
 
-// These are target depenent so we set them manually for each ABI by referencing
-// the FFI source.
+// These are target dependent so we set them manually for each ABI by
+// referencing the FFI source.
 typedef enum ffi_abi {
 #if (defined(_M_X64) || defined(__x86_64__))
   FFI_DEFAULT_ABI = 2, // FFI_UNIX64.
@@ -185,7 +185,7 @@ EXTERN int omp_target_is_present(const void *Ptr, int DeviceNum) {
   // omp_target_is_present tests whether a host pointer refers to storage that
   // is mapped to a given device. However, due to the lack of the storage size,
   // only check 1 byte. Cannot set size 0 which checks whether the pointer (zero
-  // lengh array) is mapped instead of the referred storage.
+  // length array) is mapped instead of the referred storage.
   TargetPointerResultTy TPR =
       DeviceOrErr->getMappingInfo().getTgtPtrBegin(const_cast<void *>(Ptr), 1,
                                                    /*UpdateRefCount=*/false,
@@ -256,7 +256,7 @@ EXTERN int omp_target_memcpy(void *Dst, const void *Src, size_t Length,
     FATAL_MESSAGE(DstDevice, "%s",
                   toString(DstDeviceOrErr.takeError()).c_str());
   // First try to use D2D memcpy which is more efficient. If fails, fall back
-  // to unefficient way.
+  // to inefficient way.
   if (SrcDeviceOrErr->isDataExchangable(*DstDeviceOrErr)) {
     AsyncInfoTy AsyncInfo(*SrcDeviceOrErr);
     Rc = SrcDeviceOrErr->dataExchange(SrcAddr, *DstDeviceOrErr, DstAddr,
@@ -149,7 +149,7 @@ LookupResult MappingInfoTy::lookupMapping(HDTTMapAccessorTy &HDTTMap,
   // std::prev(upper)->HDTT.HstPtrBegin <= hp < upper->HDTT.HstPtrBegin
   if (Upper != HDTTMap->begin()) {
     LR.TPR.setEntry(std::prev(Upper)->HDTT, OwnedTPR);
-    // the left side of extended address range is satisified.
+    // the left side of extended address range is satisfied.
     // hp >= LR.TPR.getEntry()->HstPtrBegin || hp >=
     // LR.TPR.getEntry()->HstPtrBase
     LR.Flags.IsContained = HP < LR.TPR.getEntry()->HstPtrEnd ||
@@ -158,7 +158,7 @@ LookupResult MappingInfoTy::lookupMapping(HDTTMapAccessorTy &HDTTMap,
 
   if (!LR.Flags.IsContained && Upper != HDTTMap->end()) {
     LR.TPR.setEntry(Upper->HDTT, OwnedTPR);
-    // the right side of extended address range is satisified.
+    // the right side of extended address range is satisfied.
     // hp < LR.TPR.getEntry()->HstPtrEnd || hp < LR.TPR.getEntry()->HstPtrBase
     LR.Flags.IsContained = HP >= LR.TPR.getEntry()->HstPtrBase;
   }
@@ -189,7 +189,7 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
     if (Entry.Flags == OMP_REGISTER_REQUIRES)
       PM->addRequirements(Entry.Data);
 
-  // Extract the exectuable image and extra information if availible.
+  // Extract the executable image and extra information if available.
   for (int32_t i = 0; i < Desc->NumDeviceImages; ++i)
     PM->addDeviceImage(*Desc, Desc->DeviceImages[i]);
 
@@ -273,7 +273,7 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) {
 
   // Auto Zero-Copy can only be currently triggered when the system is an
   // homogeneous APU architecture without attached discrete GPUs.
-  // If all devices suggest to use it, change requirment flags to trigger
+  // If all devices suggest to use it, change requirement flags to trigger
   // zero-copy behavior when mapping memory.
   if (UseAutoZeroCopy)
     addRequirements(OMPX_REQ_AUTO_ZERO_COPY);
@@ -196,7 +196,7 @@ int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
      DPxPTR(HstPtr), Size);
 
   if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, Size)) {
-    REPORT("Notifiying about data mapping failed.\n");
+    REPORT("Notifying about data mapping failed.\n");
     return OFFLOAD_FAIL;
   }
   return OFFLOAD_SUCCESS;
@@ -206,7 +206,7 @@ int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) {
   DP("Notifying about an unmapping: HstPtr=" DPxMOD "\n", DPxPTR(HstPtr));
 
   if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr)) {
-    REPORT("Notifiying about data unmapping failed.\n");
+    REPORT("Notifying about data unmapping failed.\n");
    return OFFLOAD_FAIL;
   }
   return OFFLOAD_SUCCESS;
@@ -39,7 +39,7 @@ using namespace llvm::omp::target::ompt;
 //
 // The return bool indicates if the offload is to the host device
 // There are three possible results:
-// - Return false if the taregt device is ready for offload
+// - Return false if the target device is ready for offload
 // - Return true without reporting a runtime error if offload is
 //   disabled, perhaps because the initial device was specified.
 // - Report a runtime error and return true.
@@ -366,8 +366,8 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
 
   int Rc = OFFLOAD_SUCCESS;
   Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo);
-  { // required to show syncronization
-    TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: syncronize", "", Loc);
+  { // required to show synchronization
+    TIMESCOPE_WITH_DETAILS_AND_IDENT("Runtime: synchronize", "", Loc);
     if (Rc == OFFLOAD_SUCCESS)
       Rc = AsyncInfo.synchronize();
 
@@ -1409,7 +1409,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
   // API, we need the begin address itself, i.e. &A[N], as the API operates on
   // begin addresses, not bases. That's why we pass args and offsets as two
   // separate entities so that each plugin can do what it needs. This behavior
-  // was introdued via https://reviews.llvm.org/D33028 and commit 1546d319244c.
+  // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
   SmallVector<void *> TgtArgs;
   SmallVector<ptrdiff_t> TgtOffsets;

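The base/begin distinction that comment leans on can be made concrete with a small sketch; `A`, `N`, and the offset arithmetic below are illustrative, not code from the runtime.

```cpp
#include <cstddef>

// For a mapped section A[N:M], the base is the array object itself while the
// begin address points at the first mapped element; the difference between
// the two is the offset the plugins receive alongside the argument.
void baseVsBegin() {
  int A[64];
  const int N = 8;
  void *Base = &A;                                      // base address
  void *Begin = &A[N];                                  // begin address
  std::ptrdiff_t Offset = (char *)Begin - (char *)Base; // == N * sizeof(int)
  (void)Offset;
}
```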
@@ -1431,7 +1431,7 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,

   // Clang might pass more values via the ArgPtrs to the runtime than we pass
   // on to the kernel.
-  // TOOD: Next time we adjust the KernelArgsTy we should introduce a new
+  // TODO: Next time we adjust the KernelArgsTy we should introduce a new
   //       NumKernelArgs field.
   KernelArgs.NumArgs = TgtArgs.size();
 }
@@ -1,6 +1,6 @@
 // RUN: %libomptarget-compile-and-run-generic

-// Test case for omp_target_memcpy_async, oringally from GCC
+// Test case for omp_target_memcpy_async, originally from GCC

 #include "stdio.h"
 #include <omp.h>
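For orientation, a minimal, hedged sketch of the API this test covers (not the GCC test body; the buffer names are made up). The call takes destination/source offsets, destination/source device numbers, and an optional dependence-object list.

```cpp
#include <omp.h>

int main(void) {
  int src[128] = {0};
  int dev = omp_get_default_device();
  void *dst = omp_target_alloc(sizeof src, dev);
  // Enqueue an asynchronous host-to-device copy; completion is awaited
  // below with a taskwait.
  omp_target_memcpy_async(dst, src, sizeof src, /*dst_offset=*/0,
                          /*src_offset=*/0, dev, omp_get_initial_device(),
                          /*depobj_count=*/0, /*depobj_list=*/NULL);
#pragma omp taskwait
  omp_target_free(dst, dev);
  return 0;
}
```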
@@ -54,7 +54,7 @@ int test_omp_aligned_alloc_on_device() {
 int main() {
   int errors = 0;
   if (test_omp_aligned_alloc_on_device())
-    printf("FAILE\n");
+    printf("FAILED\n");
   else
     // CHECK: PASSED
     printf("PASSED\n");
@@ -1,4 +1,4 @@
-! Offloading test with runtine call to ompx_dump_mapping_tables Fortran array
+! Offloading test with a runtime call to ompx_dump_mapping_tables; a Fortran array
 ! writes some values and checks that the variable mapped to the device correctly
 ! receives the updates made on the device.
 ! REQUIRES: flang
@@ -3,7 +3,7 @@
 ! REQUIRES: flang, amdgpu

 ! This fails only because it needs the Fortran runtime built for the device. If this
-! is avaialbe, this test succeeds when run.
+! is available, this test succeeds when run.
 ! XFAIL: *

 ! RUN: %libomptarget-compile-fortran-generic
@@ -1,10 +1,10 @@
 ! Small regression test that checks that we do not cause a runtime map error in
 ! cases where we are required to allocate a local variable for the Fortran
-! descriptor to store into and then load from it, done so by re-using the
+! descriptor to store into and then load from it, done so by reusing the
 ! temporary local variable across all maps related to the mapped variable and
 ! associated local variable to make sure that each map does as it is intended
 ! to do with the original data. This prevents blobs of local descriptor data
-! remaining attatched on device long after it's supposed to, which can cause
+! remaining attached on device long after it's supposed to, which can cause
 ! weird map issues later in subsequent function invocations. However, it
 ! doesn't avoid a user shooting themselves in the foot by mapping data via
 ! enter and then not providing a corresponding exit.
@@ -1,5 +1,5 @@
 ! Offloading test checking interaction of an explicit derived type mapping when
-! mapped to target and assinging one derived type to another
+! mapped to target and assigning one derived type to another
 ! REQUIRES: flang, amdgpu

 ! RUN: %libomptarget-compile-fortran-run-and-check-generic
@@ -1,5 +1,5 @@
 ! Offloading test checking interaction of an implicit derived type mapping when
-! mapped to target and assinging one derived type to another
+! mapped to target and assigning one derived type to another
 ! REQUIRES: flang, amdgpu

 ! RUN: %libomptarget-compile-fortran-run-and-check-generic
@@ -1,5 +1,5 @@
 ! Offloading test checking interaction of an explicit derived type mapping when
-! mapped to target and assinging one derived type to another
+! mapped to target and assigning one derived type to another
 ! REQUIRES: flang, amdgpu

 ! RUN: %libomptarget-compile-fortran-run-and-check-generic
@@ -5,7 +5,7 @@
 // REQUIRES: gpu

 // This test validates that the OpenMP target reductions to find a maximum work
-// as indended for a few common data types.
+// as intended for a few common data types.

 #include <algorithm>
 #include <cassert>
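Both this test and the minimum variant below exercise the same kernel shape: a target-team reduction. A hedged sketch of such a max reduction (`a`, `n`, and the helper name are illustrative):

```cpp
#include <limits.h>

// Hedged sketch of a target max-reduction over an int array; the reduction
// clause combines each thread's partial maximum on the device.
int max_on_device(const int *a, int n) {
  int m = INT_MIN;
#pragma omp target teams distribute parallel for reduction(max : m)           \
    map(to : a[:n]) map(tofrom : m)
  for (int i = 0; i < n; ++i)
    if (a[i] > m)
      m = a[i];
  return m;
}
```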
@@ -5,7 +5,7 @@
 // REQUIRES: gpu

 // This test validates that the OpenMP target reductions to find a minimum work
-// as indended for a few common data types.
+// as intended for a few common data types.

 #include <algorithm>
 #include <cassert>
@@ -32,7 +32,7 @@ int main() {
   dat.datum[dat.arr[0][0]] = 0;

   /// The struct is mapped with type 0x0 when the pointer fields are mapped.
-  /// The struct is also map explicitely by the user. The second mapping by
+  /// The struct is also mapped explicitly by the user. The second mapping by
   /// the user must not overwrite the mapping set up for the pointer fields
   /// when mapping the struct happens after the mapping of the pointers.

@@ -17,7 +17,7 @@ int main(int argc, char *argv[]) {
   int rc = omp_target_associate_ptr(&x, x_dev, sizeof x, 0, dev);
   assert(!rc && "expected omp_target_associate_ptr to succeed");

-  // To determine whether x needs to be transfered, the runtime cannot simply
+  // To determine whether x needs to be transferred, the runtime cannot simply
   // check whether unified shared memory is enabled and the 'close' modifier is
   // specified. It must check whether x was previously placed in device memory
   // by, for example, omp_target_associate_ptr.
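To make that check concrete: once a host pointer is associated with device storage, a later map, even with the 'close' modifier, must reuse the association rather than allocate and transfer fresh device memory. A hedged sketch along the lines of the test (variable names follow the hunk above):

```cpp
#include <assert.h>
#include <omp.h>

int main(void) {
  int x = 42;
  int dev = omp_get_default_device();
  void *x_dev = omp_target_alloc(sizeof x, dev);
  // Tie host x to the device allocation; the runtime records the association.
  int rc = omp_target_associate_ptr(&x, x_dev, sizeof x,
                                    /*device_offset=*/0, dev);
  assert(!rc);
  // 'close' alone must not force a new device copy: x is already associated.
#pragma omp target map(close, tofrom : x)
  { x += 1; }
  omp_target_disassociate_ptr(&x, dev);
  omp_target_free(x_dev, dev);
  return 0;
}
```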
@@ -23,7 +23,7 @@ int main(int argc, char *argv[]) {
     s.x = 11;
     s.y = 21;
   }
-  // To determine whether x needs to be transfered or deleted, the runtime
+  // To determine whether x needs to be transferred or deleted, the runtime
   // cannot simply check whether unified shared memory is enabled and the
   // 'close' modifier is specified. It must check whether x was previously
   // placed in device memory by, for example, a 'close' modifier that isn't
@@ -74,7 +74,7 @@ int main(int argc, char **argv) {
   unsigned NumThreads =
       (NumThreadsOpt > 0 ? NumThreadsOpt : NumThreadsJson.value());
   // TODO: Print a warning if the number of teams/threads is explicitly set in
-  // the kernel info but overriden through command line options.
+  // the kernel info but overridden through command-line options.
   auto LoopTripCount =
       JsonKernelInfo->getAsObject()->getInteger("LoopTripCount");
   auto KernelFunc = JsonKernelInfo->getAsObject()->getString("Name");
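The accessors in this hunk come from llvm/Support/JSON.h. A hedged sketch of the parse-and-read pattern (`Buffer` and the function name are illustrative; the optional-returning accessors match recent LLVM, where absent fields surface as empty optionals rather than crashes):

```cpp
#include "llvm/Support/JSON.h"
#include "llvm/Support/raw_ostream.h"
#include <optional>

void readKernelInfo(llvm::StringRef Buffer) {
  // Parse the kernel-info blob; Expected carries either a Value or an Error.
  llvm::Expected<llvm::json::Value> Parsed = llvm::json::parse(Buffer);
  if (!Parsed) {
    llvm::errs() << "malformed kernel info JSON\n";
    llvm::consumeError(Parsed.takeError());
    return;
  }
  if (llvm::json::Object *Obj = Parsed->getAsObject()) {
    std::optional<int64_t> LoopTripCount = Obj->getInteger("LoopTripCount");
    std::optional<llvm::StringRef> Name = Obj->getString("Name");
    (void)LoopTripCount;
    (void)Name;
  }
}
```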