[OFFLOAD] Build DeviceRTL with SPIRV backend (#174675)

This PR adds the configuration needed to build DeviceRTL with the SPIR-V backend. It is primarily used by the Level Zero plugin for Intel GPUs.

---------

Co-authored-by: Joseph Huber <huberjn@outlook.com>
2026-01-26 16:09:28 -08:00 · 2026-01-26 16:09:28 -08:00 · 8db9774118
commit 8db9774118
parent 0c64017c99
8 changed files with 77 additions and 12 deletions
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@ -158,8 +158,8 @@ else()
 endif()

 # Use the current compiler target to determine the appropriate runtime to build.
-if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx" OR
-   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx")
+if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx|^spirv64" OR
+   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx|^spirv64")
  add_subdirectory(device)
 else()
  add_subdirectory(module)
--- a/openmp/device/CMakeLists.txt
+++ b/openmp/device/CMakeLists.txt
@ -33,6 +33,7 @@ list(APPEND compile_options -fno-rtti)
 list(APPEND compile_options -fno-exceptions)
 list(APPEND compile_options -fconvergent-functions)
 list(APPEND compile_options -Wno-unknown-cuda-version)
+
 if(LLVM_DEFAULT_TARGET_TRIPLE)
  list(APPEND compile_options --target=${LLVM_DEFAULT_TARGET_TRIPLE})
 endif()
@ -52,6 +53,9 @@ elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^nvptx" OR
       "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^nvptx")
  set(target_name "nvptx")
  list(APPEND compile_options --cuda-feature=+ptx63)
+elseif("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv64" OR
+       "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv64")
+  set(target_name "spirv") 
 endif()

 # Trick to combine these into a bitcode file via the linker's LTO pass.
@ -74,8 +78,15 @@ target_include_directories(libompdevice PRIVATE
                           ${CMAKE_CURRENT_SOURCE_DIR}/../../libc
                           ${CMAKE_CURRENT_SOURCE_DIR}/../../offload/include)
 target_compile_options(libompdevice PRIVATE ${compile_options})
-target_link_options(libompdevice PRIVATE
-                    "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
+# Pick the link-step flags by device target: SPIR-V triples link with
+# "-nostdlib" "-emit-llvm" only, while every other target keeps the relocatable
+# LTO link that emits LLVM bitcode from the linker.
+if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^spirv" OR
+   "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^spirv")
+  target_link_options(libompdevice PRIVATE "-nostdlib" "-emit-llvm")
+else()
+  target_link_options(libompdevice PRIVATE
+                      "-flto" "-r" "-nostdlib" "-Wl,--lto-emit-llvm")
+endif()
+
 if(LLVM_DEFAULT_TARGET_TRIPLE)
  target_link_options(libompdevice PRIVATE "--target=${LLVM_DEFAULT_TARGET_TRIPLE}")
 endif()
--- a/openmp/device/include/DeviceTypes.h
+++ b/openmp/device/include/DeviceTypes.h
@ -131,7 +131,17 @@ struct IdentTy {

 using __kmpc_impl_lanemask_t = LaneMaskTy;

-using ParallelRegionFnTy = void *;
+// Type used by the runtime for type-erased function pointers.
+// On the SPIR-V backend function pointers live in address space 9; because
+// they are otherwise passed around as plain `void *`, the alias carries the
+// address-space annotation there so that no invalid casts are needed. Every
+// other target keeps an ordinary `void *`.
+#ifndef __SPIRV__
+using FnPtrTy = void *;
+#else
+using FnPtrTy = void [[clang::address_space(9)]] *;
+#endif
+
+using ParallelRegionFnTy = FnPtrTy;

 using CriticalNameTy = int32_t[8];

--- a/openmp/device/include/State.h
+++ b/openmp/device/include/State.h
@ -219,7 +219,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
  __builtin_unreachable();
 }

-[[gnu::always_inline, gnu::flatten]] inline void *&
+[[gnu::always_inline, gnu::flatten]] inline FnPtrTy &
 lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
--- a/openmp/device/include/Synchronization.h
+++ b/openmp/device/include/Synchronization.h
@ -42,7 +42,20 @@ enum MemScopeTy {
 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
 V inc(Ty *Address, V Val, atomic::OrderingTy Ordering,
      MemScopeTy MemScope = MemScopeTy::device) {
+#if defined(__SPIRV__)
+  // SPIR-V path: emulate the wrapping increment with a load /
+  // compare-and-swap retry loop. The stored value becomes 0 once the old value
+  // has reached Val and old + 1 otherwise; the old value is returned.
+  // All temporaries use the value type V (not a fixed uint32_t) so that the
+  // emulation neither truncates the loaded value nor hands mismatched operand
+  // types to cas() when the template is instantiated for another width.
+  V Old;
+  while (true) {
+    Old = load(Address, Ordering, MemScope);
+    const V Next = Old >= Val ? V(0) : V(Old + 1);
+    // Retry if another thread changed *Address between the load and the cas.
+    if (cas(Address, Old, Next, Ordering, Ordering, MemScope))
+      break;
+  }
+  return Old;
+#else
   return __scoped_atomic_fetch_uinc(Address, Val, Ordering, MemScope);
+#endif
 }

 template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@ -23,7 +23,7 @@ using namespace allocator;
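-// Provide a default implementation of malloc / free for AMDGPU platforms built
-// without 'libc' support.
+// Provide a default implementation of malloc / free for AMDGPU and SPIR-V
+// platforms built without 'libc' support.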
 extern "C" {
-#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+#if (defined(__AMDGPU__) || defined(__SPIRV__)) && !defined(OMPTARGET_HAS_LIBC)
 [[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
 [[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
 #else
--- a/openmp/device/src/Parallelism.cpp
+++ b/openmp/device/src/Parallelism.cpp
@ -68,7 +68,7 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {

 // Invoke an outlined parallel function unwrapping arguments (up to 32).
 [[clang::always_inline]] void invokeMicrotask(int32_t global_tid,
-                                              int32_t bound_tid, void *fn,
+                                              int32_t bound_tid, FnPtrTy fn,
                                              void **args, int64_t nargs) {
  switch (nargs) {
 #include "generated_microtask_cases.gen"
@ -84,7 +84,7 @@ extern "C" {

 [[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident,
                                                   int32_t num_threads,
-                                                   void *fn, void **args,
+                                                   FnPtrTy fn, void **args,
                                                   const int64_t nargs) {
  uint32_t TId = mapping::getThreadIdInBlock();
  uint32_t NumThreads = determineNumberOfThreads(num_threads);
@ -142,8 +142,8 @@ extern "C" {

 [[clang::always_inline]] void
 __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
-                   int32_t num_threads, int proc_bind, void *fn,
-                   void *wrapper_fn, void **args, int64_t nargs,
+                   int32_t num_threads, int proc_bind, FnPtrTy fn,
+                   FnPtrTy wrapper_fn, void **args, int64_t nargs,
                   int32_t nt_strict) {
  uint32_t TId = mapping::getThreadIdInBlock();

@ -261,7 +261,7 @@ __kmpc_parallel_60(IdentTy *ident, int32_t, int32_t if_expr,
                                          1u, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ParallelRegionFnRAII(state::ParallelRegionFn, wrapper_fn,
-                                          (void *)nullptr, true, ident,
+                                          (FnPtrTy) nullptr, true, ident,
                                          /*ForceTeamState=*/true);
    state::ValueRAII ActiveLevelRAII(icv::ActiveLevel, 1u, 0u, true, ident,
                                     /*ForceTeamState=*/true);
--- a/openmp/device/src/Synchronization.cpp
+++ b/openmp/device/src/Synchronization.cpp
@ -186,6 +186,37 @@ void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
 #endif
 ///}

+#if defined(__SPIRV__)
+// Named barriers are not implemented for SPIR-V yet: both entry points trap
+// if they are ever reached.
+void namedBarrierInit() { __builtin_trap(); } // TODO
+void namedBarrier() { __builtin_trap(); }     // TODO
+
+/// Release \p Lock by atomically storing 0 into its 32-bit lock word with
+/// sequentially consistent ordering.
+void unsetLock(omp_lock_t *Lock) {
+  int32_t *LockWord = (int32_t *)Lock;
+  atomic::store(LockWord, 0, atomic::seq_cst);
+}
+/// Try to acquire \p Lock without blocking by atomically swapping the lock
+/// word from 0 (unlocked) to 1 (locked). Returns 1 if this call took the lock
+/// and 0 if it was already held.
+/// A plain read of the lock word (e.g. an atomic add of 0) is not enough: it
+/// never acquires the lock and reports "held" as a non-zero result.
+/// NOTE(review): assumes this backs omp_test_lock, whose contract is "true
+/// iff the lock was set by this call" -- confirm against the caller.
+int testLock(omp_lock_t *Lock) {
+  int32_t Expected = 0;
+  return atomic::cas((int32_t *)Lock, Expected, 1, atomic::seq_cst,
+                     atomic::seq_cst);
+}
+// Initialising and destroying a lock both just put it into the unlocked
+// state by forwarding to unsetLock.
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
+/// Acquire \p Lock, spinning until it is obtained. Each attempt first reads
+/// the lock word and only issues the 0 -> 1 compare-and-swap when the word was
+/// observed as 0 (unlocked).
+void setLock(omp_lock_t *Lock) {
+  int32_t *LockWord = (int32_t *)Lock;
+  for (;;) {
+    int32_t Unlocked = 0;
+    // Skip the compare-and-swap while the lock is visibly held.
+    if (atomic::load(LockWord, atomic::seq_cst) != Unlocked)
+      continue;
+    if (atomic::cas(LockWord, Unlocked, 1, atomic::seq_cst, atomic::seq_cst))
+      return;
+  }
+}
+
+// Critical sections reuse the plain lock implementation.
+void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }
+void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }
+// Forwards to synchronize::threads with the requested ordering.
+void syncThreadsAligned(atomic::OrderingTy Ordering) {
+  synchronize::threads(Ordering);
+}
+#endif
+
 } // namespace impl

 void synchronize::init(bool IsSPMD) {