[OpenMP] Port the OpenMP device runtime to direct C++ compilation (#123673)
Summary: This removes the use of OpenMP offloading to build the device runtime. The main benefit is that we no longer need to rely on offloading semantics to build a device-only runtime. Things like variants are no longer needed and can just be simple ifdefs. In the future, I will remove most of the special handling here and fold it into calls to the `<gpuintrin.h>` functions instead.

Additionally, I will rework the compilation to make this a separate runtime. The current plan is to keep this setup, but have enabling OpenMP and offloading either automatically add it or print a warning if it is missing. This will allow us to use a normal CMake workflow and delete all the weird "let's pull the clang binary out of the build" business:

```
-DRUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES=offload
-DLLVM_RUNTIME_TARGETS=amdgcn-amd-amdhsa
```

After that, linking the OpenMP device runtime will be `-Xoffload-linker -lomp`, i.e. no more fat binary business.

Only look at the most recent commit, since this includes the two dependencies (the fix to AMDGPUEmitPrintfBinding and the PointerToMember bug).
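To illustrate the variant-to-ifdef point, here is a minimal sketch. It reuses the `getWarpSize` helper that appears in the diff below, but it is not the literal change from this patch:

```cpp
#include <cstdint>

// Before: under OpenMP offloading, target-specific bodies were selected with
// declare variant, e.g.
//
//   #pragma omp begin declare variant match(device = {arch(amdgcn)})
//   uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
//   #pragma omp end declare variant

// After: the runtime is compiled directly as C++ for one GPU target at a time,
// so a plain preprocessor guard is enough.
#ifdef __AMDGPU__
uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
#endif
```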
This commit is contained in: parent 455cedc805, commit bb7ab2557c
```diff
@@ -95,11 +95,10 @@ set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
 list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")

 # Set flags for LLVM Bitcode compilation.
-set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
-             ${clang_opt_flags} --offload-device-only
-             -nocudalib -nogpulib -nogpuinc -nostdlibinc
-             -fopenmp -fopenmp-cuda-mode
-             -Wno-unknown-cuda-version -Wno-openmp-target
+set(bc_flags -c -flto -std=c++17 -fvisibility=hidden
+             ${clang_opt_flags} -nogpulib -nostdlibinc
+             -fno-rtti -fno-exceptions -fconvergent-functions
+             -Wno-unknown-cuda-version
              -DOMPTARGET_DEVICE_RUNTIME
              -I${include_directory}
              -I${devicertl_base_directory}/../include
@@ -123,8 +122,7 @@ function(compileDeviceRTLLibrary target_name target_triple)
   add_custom_command(OUTPUT ${outfile}
     COMMAND ${CLANG_TOOL}
     ${bc_flags}
-    -fopenmp-targets=${target_triple}
-    -Xopenmp-target=${target_triple} -march=
+    --target=${target_triple}
     ${target_bc_flags}
     -MD -MF ${depfile}
     ${infile} -o ${outfile}
@@ -242,10 +240,8 @@ function(compileDeviceRTLLibrary target_name target_triple)
   set(ide_target_name omptarget-ide-${target_name})
   add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
   target_compile_options(${ide_target_name} PRIVATE
-    -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
-    -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
-    -foffload-lto -fvisibility=hidden --offload-device-only
-    -nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
+    -fvisibility=hidden --target=${target_triple}
+    -nogpulib -nostdlibinc -Wno-unknown-cuda-version
   )
   target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512)
   target_include_directories(${ide_target_name} PRIVATE
```
```diff
@@ -17,8 +17,6 @@
 // Forward declaration.
 struct KernelEnvironmentTy;

-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {

 namespace allocator {
@@ -44,6 +42,4 @@ extern "C" {
 [[gnu::weak]] void free(void *Ptr);
 }

-#pragma omp end declare target
-
 #endif
```
```diff
@@ -99,14 +99,7 @@ struct TaskDescriptorTy {
   TaskFnTy TaskFn;
 };

-#pragma omp begin declare variant match(device = {arch(amdgcn)})
-using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
-
-#pragma omp begin declare variant match( \
-    device = {arch(amdgcn)}, implementation = {extension(match_none)})
-using LaneMaskTy = uint64_t;
-#pragma omp end declare variant
+using LaneMaskTy = uint64_t;

 namespace lanes {
 enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
@@ -163,8 +156,7 @@ typedef enum omp_allocator_handle_t {
 #define OMP_PRAGMA(STR) __PRAGMA(omp STR)

 #define SHARED(NAME) \
-  NAME [[clang::loader_uninitialized]]; \
-  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+  [[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];

 // TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
 // now that's not the case.
```
```diff
@@ -15,8 +15,6 @@
 #include "DeviceTypes.h"
 #include "Shared/Utils.h"

-#pragma omp begin declare target device_type(nohost)
-
 namespace utils {

 template <typename T> struct type_identity {
@@ -95,6 +93,4 @@ bool isThreadLocalMemPtr(void *Ptr);

 } // namespace utils

-#pragma omp end declare target
-
 #endif
```
```diff
@@ -24,12 +24,8 @@ enum {
   DIM_Z = 2,
 };

-#pragma omp begin declare target device_type(nohost)
-
 inline constexpr uint32_t MaxThreadsPerTeam = 1024;

-#pragma omp end declare target
-
 /// Initialize the mapping machinery.
 void init(bool IsSPMD);

```
```diff
@@ -22,8 +22,6 @@
 // Forward declaration.
 struct KernelEnvironmentTy;

-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {

 namespace memory {
@@ -88,8 +86,7 @@ struct TeamStateTy {
   ParallelRegionFnTy ParallelRegionFnVar;
 };

-extern TeamStateTy TeamState;
-#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
+extern TeamStateTy [[clang::address_space(3)]] TeamState;

 struct ThreadStateTy {

@@ -115,8 +112,7 @@ struct ThreadStateTy {
   }
 };

-extern ThreadStateTy **ThreadStates;
-#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
+extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;

 /// Initialize the state machinery. Must be called by all threads.
 void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
@@ -378,6 +374,4 @@ inline state::Value<uint32_t, state::VK_RunSched> RunSched;

 } // namespace ompx

-#pragma omp end declare target
-
 #endif
```
```diff
@@ -15,8 +15,6 @@
 #include "DeviceTypes.h"
 #include "DeviceUtils.h"

-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {
 namespace atomic {

@@ -220,6 +218,4 @@ void system(atomic::OrderingTy Ordering);

 } // namespace ompx

-#pragma omp end declare target
-
 #endif
```
```diff
@@ -12,8 +12,6 @@
 #ifndef OMPTARGET_WORKSHARE_H
 #define OMPTARGET_WORKSHARE_H

-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {

 namespace workshare {
@@ -25,6 +23,4 @@ void init(bool IsSPMD);

 } // namespace ompx

-#pragma omp end declare target
-
 #endif
```
```diff
@@ -19,8 +19,6 @@

 using namespace ompx;

-#pragma omp begin declare target device_type(nohost)
-
 [[gnu::used, gnu::retain, gnu::weak,
   gnu::visibility(
       "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
@@ -77,5 +75,3 @@ void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
 void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }

 ///}
-
-#pragma omp end declare target
```
```diff
@@ -17,8 +17,6 @@

 using namespace ompx;

-#pragma omp begin declare target device_type(nohost)
-
 // Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
 [[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
 [[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
@@ -85,5 +83,3 @@ bool config::mayUseNestedParallelism() {
     return false;
   return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
 }
-
-#pragma omp end declare target
```
```diff
@@ -21,8 +21,6 @@

 using namespace ompx;

-#pragma omp begin declare target device_type(nohost)
-
 extern "C" {
 void __assert_assume(bool condition) { __builtin_assume(condition); }

@@ -44,5 +42,3 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
   __builtin_trap();
 }
 }
-
-#pragma omp end declare target
```
```diff
@@ -15,14 +15,10 @@
 #include "Interface.h"
 #include "Mapping.h"

-#pragma omp begin declare target device_type(nohost)
-
 using namespace ompx;

 namespace impl {

-bool isSharedMemPtr(const void *Ptr) { return false; }
-
 void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
   static_assert(sizeof(unsigned long) == 8, "");
   *LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
@@ -42,7 +38,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred);
 /// AMDGCN Implementation
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__

 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
   int Self = mapping::getThreadIdInWarp();
@@ -66,15 +62,13 @@ bool isSharedMemPtr(const void *Ptr) {
   return __builtin_amdgcn_is_shared(
       (const __attribute__((address_space(0))) void *)Ptr);
 }
-#pragma omp end declare variant
+#endif
 ///}

 /// NVPTX Implementation
 ///
 ///{
-#pragma omp begin declare variant match( \
-    device = {arch(nvptx, nvptx64)}, \
-    implementation = {extension(match_any)})
+#ifdef __NVPTX__

 int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
   return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
@@ -91,7 +85,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) {

 bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }

-#pragma omp end declare variant
+#endif
 ///}
 } // namespace impl

@@ -137,5 +131,3 @@ int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
   return utils::shuffleDown(lanes::All, Val, Delta, Width);
 }
 }
-
-#pragma omp end declare target
```
```diff
@@ -25,8 +25,6 @@

 using namespace ompx;

-#pragma omp begin declare target device_type(nohost)
-
 static void
 inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
                     KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
@@ -155,5 +153,3 @@ void __kmpc_target_deinit() {

 int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
 }
-
-#pragma omp end declare target
```
```diff
@@ -8,8 +8,6 @@

 #include "LibC.h"

-#pragma omp begin declare target device_type(nohost)
-
 #if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
 extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; }
 #else
@@ -48,5 +46,3 @@ namespace ompx {
   return ::vprintf(Format, vlist);
 }
 } // namespace ompx
-
-#pragma omp end declare target
```
```diff
@@ -15,8 +15,6 @@
 #include "Interface.h"
 #include "State.h"

-#pragma omp begin declare target device_type(nohost)
-
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"

 using namespace ompx;
@@ -24,24 +22,10 @@ using namespace ompx;
 namespace ompx {
 namespace impl {

-// Forward declarations defined to be defined for AMDGCN and NVPTX.
-LaneMaskTy activemask();
-LaneMaskTy lanemaskLT();
-LaneMaskTy lanemaskGT();
-uint32_t getThreadIdInWarp();
-uint32_t getThreadIdInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInBlock(int32_t Dim);
-uint32_t getNumberOfThreadsInKernel();
-uint32_t getBlockIdInKernel(int32_t Dim);
-uint32_t getNumberOfBlocksInKernel(int32_t Dim);
-uint32_t getWarpIdInBlock();
-uint32_t getNumberOfWarpsInBlock();
-uint32_t getWarpSize();
-
 /// AMDGCN Implementation
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__

 uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }

@@ -128,15 +112,13 @@ uint32_t getNumberOfWarpsInBlock() {
   return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
 }

-#pragma omp end declare variant
+#endif
 ///}

 /// NVPTX Implementation
 ///
 ///{
-#pragma omp begin declare variant match( \
-    device = {arch(nvptx, nvptx64)}, \
-    implementation = {extension(match_any)})
+#ifdef __NVPTX__

 uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
   switch (Dim) {
@@ -214,7 +196,7 @@ uint32_t getNumberOfWarpsInBlock() {
          mapping::getWarpSize();
 }

-#pragma omp end declare variant
+#endif
 ///}

 } // namespace impl
@@ -376,7 +358,7 @@ float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
 }

 long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
-  return utils::shuffleDown(mask, var, delta, width);
+  return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width);
 }

 double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
```
```diff
@@ -17,19 +17,13 @@

 #include "Debug.h"

-#pragma omp begin declare target device_type(nohost)
-
 namespace ompx {
 namespace impl {

-double getWTick();
-
-double getWTime();
-
 /// AMDGCN Implementation
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__

 double getWTick() {
   // The number of ticks per second for the AMDGPU clock varies by card and can
@@ -42,14 +36,12 @@ double getWTime() {
   return static_cast<double>(__builtin_readsteadycounter()) * getWTick();
 }

-#pragma omp end declare variant
+#endif

 /// NVPTX Implementation
 ///
 ///{
-#pragma omp begin declare variant match( \
-    device = {arch(nvptx, nvptx64)}, \
-    implementation = {extension(match_any)})
+#ifdef __NVPTX__

 double getWTick() {
   // Timer precision is 1ns
@@ -61,7 +53,7 @@ double getWTime() {
   return static_cast<double>(nsecs) * getWTick();
 }

-#pragma omp end declare variant
+#endif

 /// Lookup a device-side function using a host pointer /p HstPtr using the table
 /// provided by the device plugin. The table is an ordered pair of host and
@@ -171,4 +163,3 @@ unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) {
 }

 ///}
-#pragma omp end declare target
```
```diff
@@ -43,8 +43,6 @@

 using namespace ompx;

-#pragma omp begin declare target device_type(nohost)
-
 namespace {

 uint32_t determineNumberOfThreads(int32_t NumThreadsClause) {
@@ -311,5 +309,3 @@ void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams,

 void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {}
 }
-
-#pragma omp end declare target
```
```diff
@@ -8,8 +8,6 @@

 #include "Profiling.h"

-#pragma omp begin declare target device_type(nohost)
-
 extern "C" {

 // Provides empty implementations for certain functions in compiler-rt
@@ -18,5 +16,3 @@ void __llvm_profile_register_function(void *Ptr) {}
 void __llvm_profile_register_names_function(void *Ptr, long int I) {}
 void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {}
 }
-
-#pragma omp end declare target
```
```diff
@@ -22,8 +22,6 @@ using namespace ompx;

 namespace {

-#pragma omp begin declare target device_type(nohost)
-
 void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) {
   for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) {
     shflFct(reduce_data, /*LaneId - not used= */ 0,
@@ -316,5 +314,3 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
 void *__kmpc_reduction_get_fixed_buffer() {
   return state::getKernelLaunchEnvironment().ReductionBuffer;
 }
-
-#pragma omp end declare target
```
```diff
@@ -23,16 +23,13 @@

 using namespace ompx;

-#pragma omp begin declare target device_type(nohost)
-
 /// Memory implementation
 ///
 ///{

 /// External symbol to access dynamic shared memory.
-[[gnu::aligned(
-    allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[];
-#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
+[[gnu::aligned(allocator::ALIGNMENT)]] extern unsigned char
+    [[clang::address_space(3)]] DynamicSharedBuffer[];

 /// The kernel environment passed to the init method by the compiler.
 static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
@@ -452,13 +449,10 @@ void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
 /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
 constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;

-[[clang::loader_uninitialized]] static void
-    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
-#pragma omp allocate(SharedMemVariableSharingSpace) \
-    allocator(omp_pteam_mem_alloc)
-[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
-#pragma omp allocate(SharedMemVariableSharingSpacePtr) \
-    allocator(omp_pteam_mem_alloc)
+[[clang::loader_uninitialized]] static void *[[clang::address_space(
+    3)]] SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
+[[clang::loader_uninitialized]] static void **[[clang::address_space(
+    3)]] SharedMemVariableSharingSpacePtr;

 void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
   if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
@@ -481,4 +475,3 @@ void __kmpc_get_shared_variables(void ***GlobalArgs) {
   *GlobalArgs = SharedMemVariableSharingSpacePtr;
 }
 }
-#pragma omp end declare target
```
```diff
@@ -19,8 +19,6 @@
 #include "Mapping.h"
 #include "State.h"

-#pragma omp begin declare target device_type(nohost)
-
 using namespace ompx;

 namespace impl {
@@ -28,34 +26,12 @@ namespace impl {
 /// Atomics
 ///
 ///{
-/// NOTE: This function needs to be implemented by every target.
-uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope);
 ///}
-
-// Forward declarations defined to be defined for AMDGCN and NVPTX.
-uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
-                   atomic::MemScopeTy MemScope);
-void namedBarrierInit();
-void namedBarrier();
-void fenceTeam(atomic::OrderingTy Ordering);
-void fenceKernel(atomic::OrderingTy Ordering);
-void fenceSystem(atomic::OrderingTy Ordering);
-void syncWarp(__kmpc_impl_lanemask_t);
-void syncThreads(atomic::OrderingTy Ordering);
-void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); }
-void unsetLock(omp_lock_t *);
-int testLock(omp_lock_t *);
-void initLock(omp_lock_t *);
-void destroyLock(omp_lock_t *);
-void setLock(omp_lock_t *);
-void unsetCriticalLock(omp_lock_t *);
-void setCriticalLock(omp_lock_t *);

 /// AMDGCN Implementation
 ///
 ///{
-#pragma omp begin declare variant match(device = {arch(amdgcn)})
+#ifdef __AMDGPU__

 uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
                    atomic::MemScopeTy MemScope) {
@@ -202,15 +178,13 @@ void setCriticalLock(omp_lock_t *Lock) {
   }
 }

-#pragma omp end declare variant
+#endif
 ///}

 /// NVPTX Implementation
 ///
 ///{
-#pragma omp begin declare variant match( \
-    device = {arch(nvptx, nvptx64)}, \
-    implementation = {extension(match_any)})
+#ifdef __NVPTX__

 uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering,
                    atomic::MemScopeTy MemScope) {
@@ -283,7 +257,7 @@ void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); }

 void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); }

-#pragma omp end declare variant
+#endif
 ///}

 } // namespace impl
@@ -401,5 +375,3 @@ void ompx_sync_block_divergent(int Ordering) {
   impl::syncThreads(atomic::OrderingTy(Ordering));
 }
 } // extern "C"
-
-#pragma omp end declare target
```
```diff
@@ -20,8 +20,6 @@

 using namespace ompx;

-#pragma omp begin declare target device_type(nohost)
-
 extern "C" {

 TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
@@ -29,7 +27,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t,
                                         size_t SharedValuesSize,
                                         TaskFnTy TaskFn) {
   auto TaskSizeInclPrivateValuesPadded =
-      utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *)));
+      utils::roundUp(TaskSizeInclPrivateValues, sizeof(void *));
   auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize;
   TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal(
       TaskSizeTotal, "explicit task descriptor");
@@ -103,5 +101,3 @@ int omp_in_final(void) {

 int omp_get_max_task_priority(void) { return 0; }
 }
-
-#pragma omp end declare target
```
```diff
@@ -44,8 +44,6 @@ struct DynamicScheduleTracker {
 #define NOT_FINISHED 1
 #define LAST_CHUNK 2

-#pragma omp begin declare target device_type(nohost)
-
 // TODO: This variable is a hack inherited from the old runtime.
 static uint64_t SHARED(Cnt);

@@ -935,5 +933,3 @@ OMP_LOOP_ENTRY(_4u, uint32_t)
 OMP_LOOP_ENTRY(_8, int64_t)
 OMP_LOOP_ENTRY(_8u, uint64_t)
 }
-
-#pragma omp end declare target
```