[OpenMP] Replace use of target address space with <gpuintrin.h> local (#126119)
Summary: This definition is more portable, since `<gpuintrin.h>` defines the correct address-space value for each target. I got rid of the helper macros mostly because the address space is now a type and therefore easy to use directly, and because being explicit about which variables are left uninitialized (`undef` or `poison`) is good.
This commit is contained in:
parent
70906f0514
commit
ed9107f2d7
@ -12,9 +12,15 @@
|
||||
#ifndef OMPTARGET_TYPES_H
|
||||
#define OMPTARGET_TYPES_H
|
||||
|
||||
#include <gpuintrin.h>
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
template <typename T> using Private = __gpu_private T;
|
||||
template <typename T> using Constant = __gpu_constant T;
|
||||
template <typename T> using Local = __gpu_local T;
|
||||
template <typename T> using Global = __gpu_local T;
|
||||
|
||||
enum omp_proc_bind_t {
|
||||
omp_proc_bind_false = 0,
|
||||
omp_proc_bind_true = 1,
|
||||
@ -155,19 +161,6 @@ typedef enum omp_allocator_handle_t {
|
||||
#define __PRAGMA(STR) _Pragma(#STR)
|
||||
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
|
||||
|
||||
#define SHARED(NAME) \
|
||||
[[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];
|
||||
|
||||
// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
|
||||
// now that's not the case.
|
||||
#define THREAD_LOCAL(NAME) \
|
||||
[[clang::address_space(5)]] NAME [[clang::loader_uninitialized]]
|
||||
|
||||
// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it
|
||||
// does?
|
||||
#define CONSTANT(NAME) \
|
||||
[[clang::address_space(4)]] NAME [[clang::loader_uninitialized]]
|
||||
|
||||
///}
|
||||
|
||||
#endif
|
||||
|
@ -86,7 +86,7 @@ struct TeamStateTy {
|
||||
ParallelRegionFnTy ParallelRegionFnVar;
|
||||
};
|
||||
|
||||
extern TeamStateTy [[clang::address_space(3)]] TeamState;
|
||||
extern Local<TeamStateTy> TeamState;
|
||||
|
||||
struct ThreadStateTy {
|
||||
|
||||
@ -112,7 +112,7 @@ struct ThreadStateTy {
|
||||
}
|
||||
};
|
||||
|
||||
extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;
|
||||
extern Local<ThreadStateTy **> ThreadStates;
|
||||
|
||||
/// Initialize the state machinery. Must be called by all threads.
|
||||
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
||||
|
@ -28,8 +28,8 @@ using namespace ompx;
|
||||
// This variable should be visible to the plugin so we override the default
|
||||
// hidden visibility.
|
||||
[[gnu::used, gnu::retain, gnu::weak,
|
||||
gnu::visibility("protected")]] DeviceEnvironmentTy
|
||||
CONSTANT(__omp_rtl_device_environment);
|
||||
gnu::visibility(
|
||||
"protected")]] Constant<DeviceEnvironmentTy> __omp_rtl_device_environment;
|
||||
|
||||
uint32_t config::getAssumeTeamsOversubscription() {
|
||||
return __omp_rtl_assume_teams_oversubscription;
|
||||
|
@ -308,7 +308,7 @@ uint32_t mapping::getNumberOfProcessorElements() {
|
||||
|
||||
// TODO: This is a workaround for initialization coming from kernels outside of
|
||||
// the TU. We will need to solve this more correctly in the future.
|
||||
[[gnu::weak]] int SHARED(IsSPMDMode);
|
||||
[[gnu::weak, clang::loader_uninitialized]] Local<int> IsSPMDMode;
|
||||
|
||||
void mapping::init(bool IsSPMD) {
|
||||
if (mapping::isInitialThreadInLevel0(IsSPMD))
|
||||
|
@ -71,16 +71,16 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
|
||||
if (NumThreads == 1)
|
||||
return 1;
|
||||
|
||||
//
|
||||
// This reduce function handles reduction within a team. It handles
|
||||
// parallel regions in both L1 and L2 parallelism levels. It also
|
||||
// supports Generic, SPMD, and NoOMP modes.
|
||||
//
|
||||
// 1. Reduce within a warp.
|
||||
// 2. Warp master copies value to warp 0 via shared memory.
|
||||
// 3. Warp 0 reduces to a single value.
|
||||
// 4. The reduced value is available in the thread that returns 1.
|
||||
//
|
||||
//
|
||||
// This reduce function handles reduction within a team. It handles
|
||||
// parallel regions in both L1 and L2 parallelism levels. It also
|
||||
// supports Generic, SPMD, and NoOMP modes.
|
||||
//
|
||||
// 1. Reduce within a warp.
|
||||
// 2. Warp master copies value to warp 0 via shared memory.
|
||||
// 3. Warp 0 reduces to a single value.
|
||||
// 4. The reduced value is available in the thread that returns 1.
|
||||
//
|
||||
|
||||
#if __has_builtin(__nvvm_reflect)
|
||||
if (__nvvm_reflect("__CUDA_ARCH") >= 700) {
|
||||
@ -196,8 +196,8 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
|
||||
uint32_t NumThreads = omp_get_num_threads();
|
||||
uint32_t TeamId = omp_get_team_num();
|
||||
uint32_t NumTeams = omp_get_num_teams();
|
||||
static unsigned SHARED(Bound);
|
||||
static unsigned SHARED(ChunkTeamCount);
|
||||
[[clang::loader_uninitialized]] static Local<unsigned> Bound;
|
||||
[[clang::loader_uninitialized]] static Local<unsigned> ChunkTeamCount;
|
||||
|
||||
// Block progress for teams greater than the current upper
|
||||
// limit. We always only allow a number of teams less or equal
|
||||
|
@ -28,15 +28,17 @@ using namespace ompx;
|
||||
///{
|
||||
|
||||
/// External symbol to access dynamic shared memory.
|
||||
[[gnu::aligned(allocator::ALIGNMENT)]] extern unsigned char
|
||||
[[clang::address_space(3)]] DynamicSharedBuffer[];
|
||||
[[gnu::aligned(
|
||||
allocator::ALIGNMENT)]] extern Local<unsigned char> DynamicSharedBuffer[];
|
||||
|
||||
/// The kernel environment passed to the init method by the compiler.
|
||||
static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr);
|
||||
[[clang::loader_uninitialized]] static Local<KernelEnvironmentTy *>
|
||||
KernelEnvironmentPtr;
|
||||
|
||||
/// The kernel launch environment passed as argument to the kernel by the
|
||||
/// runtime.
|
||||
static KernelLaunchEnvironmentTy *SHARED(KernelLaunchEnvironmentPtr);
|
||||
[[clang::loader_uninitialized]] static Local<KernelLaunchEnvironmentTy *>
|
||||
KernelLaunchEnvironmentPtr;
|
||||
|
||||
///}
|
||||
|
||||
@ -108,7 +110,8 @@ static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256,
|
||||
"Shared scratchpad of this size not supported yet.");
|
||||
|
||||
/// The allocation of a single shared memory scratchpad.
|
||||
static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack);
|
||||
[[clang::loader_uninitialized]] static Local<SharedMemorySmartStackTy>
|
||||
SharedMemorySmartStack;
|
||||
|
||||
void SharedMemorySmartStackTy::init(bool IsSPMD) {
|
||||
Usage[mapping::getThreadIdInBlock()] = 0;
|
||||
@ -220,8 +223,10 @@ void state::TeamStateTy::assertEqual(TeamStateTy &Other) const {
|
||||
ASSERT(HasThreadState == Other.HasThreadState, nullptr);
|
||||
}
|
||||
|
||||
state::TeamStateTy SHARED(ompx::state::TeamState);
|
||||
state::ThreadStateTy **SHARED(ompx::state::ThreadStates);
|
||||
[[clang::loader_uninitialized]] Local<state::TeamStateTy>
|
||||
ompx::state::TeamState;
|
||||
[[clang::loader_uninitialized]] Local<state::ThreadStateTy **>
|
||||
ompx::state::ThreadStates;
|
||||
|
||||
namespace {
|
||||
|
||||
@ -449,10 +454,10 @@ void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); }
|
||||
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
|
||||
constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64;
|
||||
|
||||
[[clang::loader_uninitialized]] static void *[[clang::address_space(
|
||||
3)]] SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
|
||||
[[clang::loader_uninitialized]] static void **[[clang::address_space(
|
||||
3)]] SharedMemVariableSharingSpacePtr;
|
||||
[[clang::loader_uninitialized]] static Local<void *>
|
||||
SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
|
||||
[[clang::loader_uninitialized]] static Local<void **>
|
||||
SharedMemVariableSharingSpacePtr;
|
||||
|
||||
void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) {
|
||||
if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
|
||||
|
@ -69,7 +69,7 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering,
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t SHARED(namedBarrierTracker);
|
||||
[[clang::loader_uninitialized]] Local<uint32_t> namedBarrierTracker;
|
||||
|
||||
void namedBarrierInit() {
|
||||
// Don't have global ctors, and shared memory is not zero init
|
||||
|
@ -45,7 +45,7 @@ struct DynamicScheduleTracker {
|
||||
#define LAST_CHUNK 2
|
||||
|
||||
// TODO: This variable is a hack inherited from the old runtime.
|
||||
static uint64_t SHARED(Cnt);
|
||||
[[clang::loader_uninitialized]] static Local<uint64_t> Cnt;
|
||||
|
||||
template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
@ -457,7 +457,8 @@ template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
|
||||
//
|
||||
// __kmpc_dispatch_deinit
|
||||
//
|
||||
static DynamicScheduleTracker **SHARED(ThreadDST);
|
||||
[[clang::loader_uninitialized]] static Local<DynamicScheduleTracker **>
|
||||
ThreadDST;
|
||||
|
||||
// Create a new DST, link the current one, and define the new as current.
|
||||
static DynamicScheduleTracker *pushDST() {
|
||||
|
Loading…
x
Reference in New Issue
Block a user