[OFFLOAD] Build DeviceRTL with SPIRV backend (#174675)
This PR adds the configuration needed to build DeviceRTL with the SPIRV backend. It is primarily used by the Level Zero plugin for Intel GPUs. --------- Co-authored-by: Joseph Huber <huberjn@outlook.com>
This commit is contained in:
parent
0c64017c99
commit
8db9774118
@ -158,8 +158,8 @@ else()
|
||||
endif()
|
||||
|
||||
# Use the current compiler target to determine the appropriate runtime to build.
|
||||
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx" OR
|
||||
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx")
|
||||
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx|^spirv64" OR
|
||||
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx|^spirv64")
|
||||
add_subdirectory(device)
|
||||
else()
|
||||
add_subdirectory(module)
|
||||
|
||||
@ -33,6 +33,7 @@ list(APPEND compile_options -fno-rtti)
|
||||
list(APPEND compile_options -fno-exceptions)
|
||||
list(APPEND compile_options -fconvergent-functions)
|
||||
list(APPEND compile_options -Wno-unknown-cuda-version)
|
||||
|
||||
if(LLVM_DEFAULT_TARGET_TRIPLE)
|
||||
list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
|
||||
endif()
|
||||
@ -52,6 +53,9 @@ elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
|
||||
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
|
||||
set(target_name "nvptx")
|
||||
list(APPEND compile_options --cuda-feature=+ptx63)
|
||||
elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv64" OR
|
||||
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv64")
|
||||
set(target_name "spirv")
|
||||
endif()
|
||||
|
||||
# Trick to combine these into a bitcode file via the linker's LTO pass.
|
||||
@ -74,8 +78,15 @@ target_include_directories(libompdevice PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../libc
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
|
||||
target_compile_options(libompdevice PRIVATE ${compile_options})
|
||||
target_link_options(libompdevice PRIVATE
|
||||
"-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
|
||||
# Pick the link flags for libompdevice: SPIRV targets link with
# "-nostdlib -emit-llvm", every other target uses the relocatable LTO link
# that emits LLVM IR.
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" OR
   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
  target_link_options(libompdevice PRIVATE "-nostdlib" "-emit-llvm")
else()
  target_link_options(libompdevice PRIVATE
                      "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
endif()
|
||||
|
||||
if(LLVM_DEFAULT_TARGET_TRIPLE)
|
||||
target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
|
||||
endif()
|
||||
|
||||
@ -131,7 +131,17 @@ struct IdentTy {
|
||||
|
||||
using __kmpc_impl_lanemask_t = LaneMaskTy;
|
||||
|
||||
using ParallelRegionFnTy = void *;
|
||||
// The SPIRV backend keeps function pointers in address space 9, while the
// runtime hands them around as ordinary void * values. Spelling the address
// space out in the alias avoids pointer-cast errors at compile time.
#if defined(__SPIRV__)
using FnPtrTy = void [[clang::address_space(9)]] *;
#else
using FnPtrTy = void *;
#endif

// Outlined parallel regions are referred to through the generic function
// pointer alias.
using ParallelRegionFnTy = FnPtrTy;
|
||||
|
||||
using CriticalNameTy = int32_t[8];
|
||||
|
||||
|
||||
@ -219,7 +219,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
|
||||
__builtin_unreachable();
|
||||
}
|
||||
|
||||
[[gnu::always_inline, gnu::flatten]] inline void *&
|
||||
[[gnu::always_inline, gnu::flatten]] inline FnPtrTy &
|
||||
lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
|
||||
switch (Kind) {
|
||||
case state::VK_ParallelRegionFn:
|
||||
|
||||
@ -42,7 +42,20 @@ enum MemScopeTy {
|
||||
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
|
||||
V inc(Ty *Address, V Val, atomic::OrderingTy Ordering,
|
||||
MemScopeTy MemScope = MemScopeTy::device) {
|
||||
#if defined(__SPIRV__)
|
||||
uint32_t Old;
|
||||
while (true) {
|
||||
Old = load(Address, Ordering, MemScope);
|
||||
if (Old >= Val) {
|
||||
if (cas(Address, Old, 0u, Ordering, Ordering, MemScope))
|
||||
break;
|
||||
} else if (cas(Address, Old, Old + 1, Ordering, Ordering, MemScope))
|
||||
break;
|
||||
}
|
||||
return Old;
|
||||
#else
|
||||
return __scoped_atomic_fetch_uinc(Address, Val, Ordering, MemScope);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
|
||||
|
||||
@ -23,7 +23,7 @@ using namespace allocator;
|
||||
// Provide a default implementation of malloc / free for AMDGPU and SPIRV
|
||||
// platforms built without 'libc' support.
|
||||
extern "C" {
|
||||
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
|
||||
#if (defined(__AMDGPU__) || defined(__SPIRV__)) && !defined(OMPTARGET_HAS_LIBC)
|
||||
[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
|
||||
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
|
||||
#else
|
||||
|
||||
@ -68,7 +68,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
|
||||
|
||||
// Invoke an outlined parallel function unwrapping arguments (up to 32).
|
||||
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
|
||||
int32_t bound_tid, void *fn,
|
||||
int32_t bound_tid, FnPtrTy fn,
|
||||
void **args, int64_t nargs) {
|
||||
switch (nargs) {
|
||||
#include "generated_microtask_cases.gen"
|
||||
@ -84,7 +84,7 @@ extern "C" {
|
||||
|
||||
[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
|
||||
int32_t num_threads,
|
||||
void *fn, void **args,
|
||||
FnPtrTy fn, void **args,
|
||||
const int64_t nargs) {
|
||||
uint32_t TId = mapping::getThreadIdInBlock();
|
||||
uint32_t NumThreads = determineNumberOfThreads(num_threads);
|
||||
@ -142,8 +142,8 @@ extern "C" {
|
||||
|
||||
[[clang::always_inline]] void
|
||||
__kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
|
||||
int32_t num_threads, int proc_bind, void *fn,
|
||||
void *wrapper_fn, void **args, int64_t nargs,
|
||||
int32_t num_threads, int proc_bind, FnPtrTy fn,
|
||||
FnPtrTy wrapper_fn, void **args, int64_t nargs,
|
||||
int32_t nt_strict) {
|
||||
uint32_t TId = mapping::getThreadIdInBlock();
|
||||
|
||||
@ -261,7 +261,7 @@ __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
|
||||
1u, true, ident,
|
||||
/*ForceTeamState=*/true);
|
||||
state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
|
||||
(void *)nullptr, true, ident,
|
||||
(FnPtrTy) nullptr, true, ident,
|
||||
/*ForceTeamState=*/true);
|
||||
state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
|
||||
/*ForceTeamState=*/true);
|
||||
|
||||
@ -186,6 +186,37 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
|
||||
#endif
|
||||
///}
|
||||
|
||||
#if defined(__SPIRV__)
|
||||
// TODO: not implemented for SPIRV yet; traps if it is ever reached.
void namedBarrierInit() {
  __builtin_trap();
}
|
||||
// TODO: not implemented for SPIRV yet; traps if it is ever reached.
void namedBarrier() {
  __builtin_trap();
}
|
||||
|
||||
/// Release \p Lock by atomically storing 0 (sequentially consistent) into the
/// lock storage, which is accessed as an int32_t.
void unsetLock(omp_lock_t *Lock) {
  int32_t *LockWord = reinterpret_cast<int32_t *>(Lock);
  atomic::store(LockWord, 0, atomic::seq_cst);
}
|
||||
/// Atomically add 0 to the int32_t lock word of \p Lock and return whatever
/// atomic::add yields (presumably the current lock value).
/// NOTE(review): this only observes the lock and never acquires it; confirm
/// that this is what callers expect.
int testLock(omp_lock_t *Lock) {
  int32_t *LockWord = reinterpret_cast<int32_t *>(Lock);
  return atomic::add(LockWord, 0, atomic::seq_cst);
}
|
||||
/// Initialise \p Lock by putting it into the released state.
void initLock(omp_lock_t *Lock) {
  unsetLock(Lock);
}
|
||||
/// Destroy \p Lock; this simply puts it back into the released state.
void destroyLock(omp_lock_t *Lock) {
  unsetLock(Lock);
}
|
||||
/// Acquire \p Lock by spinning until its int32_t lock word has been switched
/// from 0 to 1 by this caller.
void setLock(omp_lock_t *Lock) {
  int32_t *LockWord = reinterpret_cast<int32_t *>(Lock);
  const int32_t Unlocked = 0;
  for (;;) {
    // Only attempt the exchange after the word has been observed as free.
    if (atomic::load(LockWord, atomic::seq_cst) != Unlocked)
      continue;
    // The exchange can still fail if another thread took the lock in between;
    // in that case go back to spinning.
    if (atomic::cas(LockWord, Unlocked, 1, atomic::seq_cst, atomic::seq_cst))
      return;
  }
}
|
||||
|
||||
/// Release the lock guarding a critical region; same as unsetLock.
void unsetCriticalLock(omp_lock_t *Lock) {
  unsetLock(Lock);
}
|
||||
/// Acquire the lock guarding a critical region; same as setLock.
void setCriticalLock(omp_lock_t *Lock) {
  setLock(Lock);
}
|
||||
/// Aligned thread synchronisation for SPIRV: forwards to
/// synchronize::threads with the requested \p Ordering.
void syncThreadsAligned(atomic::OrderingTy Ordering) {
  synchronize::threads(Ordering);
}
|
||||
#endif
|
||||
|
||||
} // namespace impl
|
||||
|
||||
void synchronize::init(bool IsSPMD) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user