[OFFLOAD] Build DeviceRTL with SPIRV backend (#174675)

This PR adds configuration to build DeviceRTL with the SPIRV backend. It is
primarily used by the level-zero plugin for Intel GPUs.

---------

Co-authored-by: Joseph Huber <huberjn@outlook.com>
This commit is contained in:
fineg74 2026-01-26 16:09:28 -08:00 committed by GitHub
parent 0c64017c99
commit 8db9774118
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 77 additions and 12 deletions

View File

@ -158,8 +158,8 @@ else()
endif()
# Use the current compiler target to determine the appropriate runtime to build.
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx")
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx|^spirv64" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx|^spirv64")
add_subdirectory(device)
else()
add_subdirectory(module)

View File

@ -33,6 +33,7 @@ list(APPEND compile_options -fno-rtti)
list(APPEND compile_options -fno-exceptions)
list(APPEND compile_options -fconvergent-functions)
list(APPEND compile_options -Wno-unknown-cuda-version)
if(LLVM_DEFAULT_TARGET_TRIPLE)
list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
endif()
@ -52,6 +53,9 @@ elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
set(target_name "nvptx")
list(APPEND compile_options --cuda-feature=+ptx63)
elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv64" OR
"${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv64")
set(target_name "spirv")
endif()
# Trick to combine these into a bitcode file via the linker's LTO pass.
@ -74,8 +78,15 @@ target_include_directories(libompdevice PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../../libc
${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
target_compile_options(libompdevice PRIVATE ${compile_options})
target_link_options(libompdevice PRIVATE
"-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
# Pick the link options for libompdevice based on the target triple: SPIR-V
# targets get "-nostdlib -emit-llvm"; every other target goes through the
# relocatable LTO link that emits LLVM bitcode.
if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" OR
   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
  target_link_options(libompdevice PRIVATE
    "-nostdlib"
    "-emit-llvm")
else()
  target_link_options(libompdevice PRIVATE
    "-flto"
    "-r"
    "-nostdlib"
    "-Wl,--lto-emit-llvm")
endif()
if(LLVM_DEFAULT_TARGET_TRIPLE)
target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
endif()

View File

@ -131,7 +131,17 @@ struct IdentTy {
using __kmpc_impl_lanemask_t = LaneMaskTy;
using ParallelRegionFnTy = void *;
// FnPtrTy is the type used to carry outlined-function pointers through the
// runtime as untyped pointers.
#if defined(__SPIRV__)
// When compiling for SPIRV, function pointers live in address space 9. They
// are still passed around as void pointers, so the alias carries the address
// space annotation to keep the compiler from rejecting the casts.
using FnPtrTy = void [[clang::address_space(9)]] *;
#else
// Every other target uses a plain void pointer.
using FnPtrTy = void *;
#endif
using ParallelRegionFnTy = FnPtrTy;
using CriticalNameTy = int32_t[8];

View File

@ -219,7 +219,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
__builtin_unreachable();
}
[[gnu::always_inline, gnu::flatten]] inline void *&
[[gnu::always_inline, gnu::flatten]] inline FnPtrTy &
lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
switch (Kind) {
case state::VK_ParallelRegionFn:

View File

@ -42,7 +42,20 @@ enum MemScopeTy {
/// Atomically perform a wrapping increment on \p Address: if the stored value
/// is greater than or equal to \p Val it is replaced by 0, otherwise it is
/// incremented by one. Returns the value observed before the update.
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
V inc(Ty *Address, V Val, atomic::OrderingTy Ordering,
      MemScopeTy MemScope = MemScopeTy::device) {
#if defined(__SPIRV__)
  // SPIRV path: emulate the wrapping increment with a load followed by a
  // compare-and-swap, retrying until the swap succeeds.
  // NOTE(review): the temporaries are fixed at uint32_t regardless of V;
  // presumably inc is only used on 32-bit counters here - confirm.
  uint32_t Observed;
  uint32_t Next;
  do {
    Observed = load(Address, Ordering, MemScope);
    // Wrap to zero once the bound is reached, otherwise step by one.
    Next = (Observed >= Val) ? 0u : Observed + 1;
  } while (!cas(Address, Observed, Next, Ordering, Ordering, MemScope));
  return Observed;
#else
  return __scoped_atomic_fetch_uinc(Address, Val, Ordering, MemScope);
#endif
}
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>

View File

@ -23,7 +23,7 @@ using namespace allocator;
// Provide a default implementation of malloc / free for AMDGPU and SPIRV
// platforms built without 'libc' support.
extern "C" {
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
#if (defined(__AMDGPU__) || defined(__SPIRV__)) && !defined(OMPTARGET_HAS_LIBC)
[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
#else

View File

@ -68,7 +68,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
// Invoke an outlined parallel function unwrapping arguments (up to 32).
[[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
int32_t bound_tid, void *fn,
int32_t bound_tid, FnPtrTy fn,
void **args, int64_t nargs) {
switch (nargs) {
#include "generated_microtask_cases.gen"
@ -84,7 +84,7 @@ extern "C" {
[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
int32_t num_threads,
void *fn, void **args,
FnPtrTy fn, void **args,
const int64_t nargs) {
uint32_t TId = mapping::getThreadIdInBlock();
uint32_t NumThreads = determineNumberOfThreads(num_threads);
@ -142,8 +142,8 @@ extern "C" {
[[clang::always_inline]] void
__kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
int32_t num_threads, int proc_bind, void *fn,
void *wrapper_fn, void **args, int64_t nargs,
int32_t num_threads, int proc_bind, FnPtrTy fn,
FnPtrTy wrapper_fn, void **args, int64_t nargs,
int32_t nt_strict) {
uint32_t TId = mapping::getThreadIdInBlock();
@ -261,7 +261,7 @@ __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
1u, true, ident,
/*ForceTeamState=*/true);
state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
(void *)nullptr, true, ident,
(FnPtrTy) nullptr, true, ident,
/*ForceTeamState=*/true);
state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
/*ForceTeamState=*/true);

View File

@ -186,6 +186,37 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
#endif
///}
#if defined(__SPIRV__)
// Named barriers are not implemented for SPIRV yet; both entry points trap
// if they are ever reached.
void namedBarrierInit() {
  // TODO
  __builtin_trap();
}
void namedBarrier() {
  // TODO
  __builtin_trap();
}
// Release the lock by atomically storing 0 into its 32-bit lock word with
// sequentially consistent ordering.
void unsetLock(omp_lock_t *Lock) {
  int32_t *LockWord = (int32_t *)Lock;
  atomic::store(LockWord, 0, atomic::seq_cst);
}
// Return the result of an atomic add of 0 on the 32-bit lock word, which
// leaves the stored value unchanged.
// NOTE(review): this never attempts to take the lock; confirm that callers
// expect a pure query rather than a try-acquire.
int testLock(omp_lock_t *Lock) {
  int32_t *LockWord = (int32_t *)Lock;
  return atomic::add(LockWord, 0, atomic::seq_cst);
}
// Initializing a lock puts it into the released state.
void initLock(omp_lock_t *Lock) {
  unsetLock(Lock);
}
// Destroying a lock likewise leaves it in the released state.
void destroyLock(omp_lock_t *Lock) {
  unsetLock(Lock);
}
// Acquire the lock by spinning until its 32-bit lock word has been switched
// from 0 to 1 by this caller.
void setLock(omp_lock_t *Lock) {
  int32_t *LockWord = (int32_t *)Lock;
  for (;;) {
    // Only attempt the compare-and-swap after a plain load has observed the
    // lock word as 0.
    if (atomic::load(LockWord, atomic::seq_cst) != 0)
      continue;
    // Try to switch the word from 0 to 1; a failed swap goes around again.
    if (atomic::cas(LockWord, 0, 1, atomic::seq_cst, atomic::seq_cst))
      return;
  }
}
// Critical-section locks reuse the plain lock implementation unchanged.
void unsetCriticalLock(omp_lock_t *Lock) {
  unsetLock(Lock);
}
void setCriticalLock(omp_lock_t *Lock) {
  setLock(Lock);
}
// The aligned variant on SPIRV forwards directly to synchronize::threads,
// passing the caller's ordering through unchanged.
void syncThreadsAligned(atomic::OrderingTy Ordering) {
  synchronize::threads(Ordering);
}
#endif
} // namespace impl
void synchronize::init(bool IsSPMD) {