Johannes Doerfert 330d8983d2
[Offload] Move /openmp/libomptarget to /offload (#75125)
In a nutshell, this moves our libomptarget code to populate the offload
subproject.

With this commit, users need to enable the new LLVM/Offload subproject
as a runtime in their CMake configuration, for example as shown below.
No further changes are expected for downstream code.
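A minimal sketch of such a configuration (paths, generator, and the other
options are placeholders; adjust them to your setup) adds `offload` next to
`openmp` in `LLVM_ENABLE_RUNTIMES`:

```
cmake -S llvm -B build -G Ninja \
  -DLLVM_ENABLE_PROJECTS="clang" \
  -DLLVM_ENABLE_RUNTIMES="openmp;offload"
```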

Tests and other components still depend on OpenMP and have also not been
renamed. The results below are for a build in which OpenMP and Offload
are enabled runtimes. In addition to the pure `git mv`, we needed to
adjust some CMake files. Nothing is intended to change semantics.

```
ninja check-offload
```
Works for the X86 and AMDGPU offload tests.

```
ninja check-openmp
```
Still works, but no longer builds the offload tests.

```
ls install/lib
```
Shows all expected libraries, including:
- `libomptarget.devicertl.a`
- `libomptarget-nvptx-sm_90.bc`
- `libomptarget.rtl.amdgpu.so` -> `libomptarget.rtl.amdgpu.so.18git`
- `libomptarget.so` -> `libomptarget.so.18git`

Fixes: https://github.com/llvm/llvm-project/issues/75124

---------

Co-authored-by: Saiyedul Islam <Saiyedul.Islam@amd.com>
2024-04-22 09:51:33 -07:00

//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_STATE_H
#define OMPTARGET_STATE_H
#include "Shared/Environment.h"
#include "Debug.h"
#include "Mapping.h"
#include "Types.h"
#include "Utils.h"
// Forward declaration.
struct KernelEnvironmentTy;
#pragma omp begin declare target device_type(nohost)
namespace ompx {
namespace memory {
/// Alloca \p Size bytes in shared memory, if possible, for \p Reason.
///
/// Note: See the restrictions on __kmpc_alloc_shared for proper usage.
void *allocShared(uint64_t Size, const char *Reason);
/// Free \p Ptr, allocated via allocShared, for \p Reason.
///
/// Note: See the restrictions on __kmpc_free_shared for proper usage.
void freeShared(void *Ptr, uint64_t Bytes, const char *Reason);
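// Usage sketch (illustration only; `Bytes` and the reason string are
// placeholders): a shared allocation is released with the matching size, e.g.
//   void *Buf = memory::allocShared(Bytes, "scratch buffer");
//   // ... use Buf within the team ...
//   memory::freeShared(Buf, Bytes, "scratch buffer");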
/// Alloca \p Size bytes in global memory, if possible, for \p Reason.
void *allocGlobal(uint64_t Size, const char *Reason);
/// Return a pointer to the dynamic shared memory buffer.
void *getDynamicBuffer();
/// Free \p Ptr, allocated via allocGlobal, for \p Reason.
void freeGlobal(void *Ptr, const char *Reason);
} // namespace memory
namespace state {
inline constexpr uint32_t SharedScratchpadSize = SHARED_SCRATCHPAD_SIZE;
struct ICVStateTy {
  uint32_t NThreadsVar;
  uint32_t LevelVar;
  uint32_t ActiveLevelVar;
  uint32_t Padding0Val;
  uint32_t MaxActiveLevelsVar;
  uint32_t RunSchedVar;
  uint32_t RunSchedChunkVar;

  bool operator==(const ICVStateTy &Other) const;

  void assertEqual(const ICVStateTy &Other) const;
};
struct TeamStateTy {
  void init(bool IsSPMD);

  bool operator==(const TeamStateTy &) const;

  void assertEqual(TeamStateTy &Other) const;

  /// ICVs
  ///
  /// Preallocated storage for ICV values that are used if the threads have not
  /// set a custom default. The latter is supported but unlikely and slow(er).
  ///
  ///{
  ICVStateTy ICVState;
  ///}

  uint32_t ParallelTeamSize;
  uint32_t HasThreadState;
  ParallelRegionFnTy ParallelRegionFnVar;
};
extern TeamStateTy TeamState;
#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
struct ThreadStateTy {
  /// ICVs have preallocated storage in the TeamStateTy which is used if a
  /// thread has not set a custom value. The latter is supported but unlikely.
  /// When it happens we will allocate dynamic memory to hold the values of all
  /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an
  /// ICV struct to hold them all. This is slower than alternatives but allows
  /// users to pay only for what they use.
  ///
  state::ICVStateTy ICVState;

  ThreadStateTy *PreviousThreadState;

  void init() {
    ICVState = TeamState.ICVState;
    PreviousThreadState = nullptr;
  }

  void init(ThreadStateTy *PreviousTS) {
    ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState;
    PreviousThreadState = PreviousTS;
  }
};
extern ThreadStateTy **ThreadStates;
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
          KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
/// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that
/// holds for all instances of the kernel. The latter is dynamic and provides
/// per-launch information.
KernelEnvironmentTy &getKernelEnvironment();
KernelLaunchEnvironmentTy &getKernelLaunchEnvironment();
/// TODO
enum ValueKind {
  VK_NThreads,
  VK_Level,
  VK_ActiveLevel,
  VK_MaxActiveLevels,
  VK_RunSched,
  // ---
  VK_RunSchedChunk,
  VK_ParallelRegionFn,
  VK_ParallelTeamSize,
  VK_HasThreadState,
};
/// TODO
void enterDataEnvironment(IdentTy *Ident);
/// TODO
void exitDataEnvironment();
/// TODO
struct DateEnvironmentRAII {
  DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); }
  ~DateEnvironmentRAII() { exitDataEnvironment(); }
};
/// TODO
void resetStateForThread(uint32_t TId);
inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var,
                                        IdentTy *Ident, bool ForceTeamState) {
  if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() ||
                 !TeamState.HasThreadState))
    return TeamState.ICVState.*Var;
  uint32_t TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(!ThreadStates[TId])) {
    ThreadStates[TId] = reinterpret_cast<ThreadStateTy *>(memory::allocGlobal(
        sizeof(ThreadStateTy), "ICV modification outside data environment"));
    ASSERT(ThreadStates[TId] != nullptr, "Nullptr returned by malloc!");
    TeamState.HasThreadState = true;
    ThreadStates[TId]->init();
  }
  return ThreadStates[TId]->ICVState.*Var;
}
inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var,
                            bool ForceTeamState) {
  auto TId = mapping::getThreadIdInBlock();
  if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() &&
                   TeamState.HasThreadState && ThreadStates[TId]))
    return ThreadStates[TId]->ICVState.*Var;
  return TeamState.ICVState.*Var;
}
[[gnu::always_inline, gnu::flatten]] inline uint32_t &
lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
  switch (Kind) {
  case state::VK_NThreads:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::NThreadsVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::NThreadsVar, Ident,
                                 ForceTeamState);
  case state::VK_Level:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::LevelVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::LevelVar, Ident, ForceTeamState);
  case state::VK_ActiveLevel:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::ActiveLevelVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar, Ident,
                                 ForceTeamState);
  case state::VK_MaxActiveLevels:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::MaxActiveLevelsVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar, Ident,
                                 ForceTeamState);
  case state::VK_RunSched:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::RunSchedVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::RunSchedVar, Ident,
                                 ForceTeamState);
  case state::VK_RunSchedChunk:
    if (IsReadonly)
      return lookupImpl(&ICVStateTy::RunSchedChunkVar, ForceTeamState);
    return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar, Ident,
                                 ForceTeamState);
  case state::VK_ParallelTeamSize:
    return TeamState.ParallelTeamSize;
  case state::VK_HasThreadState:
    return TeamState.HasThreadState;
  default:
    break;
  }
  __builtin_unreachable();
}
[[gnu::always_inline, gnu::flatten]] inline void *&
lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) {
  switch (Kind) {
  case state::VK_ParallelRegionFn:
    return TeamState.ParallelRegionFnVar;
  default:
    break;
  }
  __builtin_unreachable();
}
/// A class without actual state used to provide a nice interface to lookup and
/// update ICV values we can declare in global scope.
template <typename Ty, ValueKind Kind> struct Value {
  [[gnu::flatten, gnu::always_inline]] operator Ty() {
    return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
                  /*ForceTeamState=*/false);
  }

  [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) {
    set(Other, /*IdentTy=*/nullptr);
    return *this;
  }

  [[gnu::flatten, gnu::always_inline]] Value &operator++() {
    inc(1, /*IdentTy=*/nullptr);
    return *this;
  }

  [[gnu::flatten, gnu::always_inline]] Value &operator--() {
    inc(-1, /*IdentTy=*/nullptr);
    return *this;
  }

  [[gnu::flatten, gnu::always_inline]] void
  assert_eq(const Ty &V, IdentTy *Ident = nullptr,
            bool ForceTeamState = false) {
    ASSERT(lookup(/*IsReadonly=*/true, Ident, ForceTeamState) == V, nullptr);
  }

private:
  [[gnu::flatten, gnu::always_inline]] Ty &
  lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) {
    Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState);
    return t;
  }

  [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) {
    return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) +=
            UpdateVal);
  }

  [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) {
    return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) =
            UpdateVal);
  }

  template <typename VTy, typename Ty2> friend struct ValueRAII;
};
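// Usage sketch (illustration only): the Value wrappers declared further below,
// e.g. icv::Level, read and write the backing ICV storage through the lookup
// machinery above:
//   uint32_t CurLevel = icv::Level; // read, via operator Ty()
//   icv::Level = CurLevel + 1;      // write, via operator=
//   ++icv::Level;                   // increment, via operator++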
/// A lookup class without actual state used to provide a nice interface to
/// lookup and update ICV values we can declare in global scope.
template <typename Ty, ValueKind Kind> struct PtrValue {
  [[gnu::flatten, gnu::always_inline]] operator Ty() {
    return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr,
                  /*ForceTeamState=*/false);
  }

  [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) {
    set(Other);
    return *this;
  }

private:
  Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) {
    return lookupPtr(Kind, IsReadonly, ForceTeamState);
  }

  Ty &set(Ty UpdateVal) {
    return (lookup(/*IsReadonly=*/false, /*IdentTy=*/nullptr,
                   /*ForceTeamState=*/false) = UpdateVal);
  }

  template <typename VTy, typename Ty2> friend struct ValueRAII;
};
template <typename VTy, typename Ty> struct ValueRAII {
  ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident,
            bool ForceTeamState = false)
      : Ptr(Active ? &V.lookup(/*IsReadonly=*/false, Ident, ForceTeamState)
                   : (Ty *)utils::UndefPtr),
        Val(OldValue), Active(Active) {
    if (!Active)
      return;
    ASSERT(*Ptr == OldValue, "ValueRAII initialization with wrong old value!");
    *Ptr = NewValue;
  }
  ~ValueRAII() {
    if (Active)
      *Ptr = Val;
  }

private:
  Ty *Ptr;
  Ty Val;
  bool Active;
};
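// Usage sketch (illustration only; NumThreads and Ident are placeholders):
// ValueRAII installs a new value for the lifetime of a scope and restores the
// old one on destruction, e.g.
//   {
//     state::ValueRAII TeamSizeRAII(state::ParallelTeamSize,
//                                   /*NewValue=*/NumThreads,
//                                   /*OldValue=*/1u, /*Active=*/true, Ident);
//     // ... state::ParallelTeamSize reads as NumThreads here ...
//   } // the old value (1u) is restored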
/// TODO
inline state::Value<uint32_t, state::VK_RunSchedChunk> RunSchedChunk;
/// TODO
inline state::Value<uint32_t, state::VK_ParallelTeamSize> ParallelTeamSize;
/// TODO
inline state::Value<uint32_t, state::VK_HasThreadState> HasThreadState;
/// TODO
inline state::PtrValue<ParallelRegionFnTy, state::VK_ParallelRegionFn>
    ParallelRegionFn;
void runAndCheckState(void(Func(void)));
void assumeInitialState(bool IsSPMD);
/// Return the value of the ParallelTeamSize ICV.
int getEffectivePTeamSize();
} // namespace state
namespace icv {
/// TODO
inline state::Value<uint32_t, state::VK_NThreads> NThreads;
/// TODO
inline state::Value<uint32_t, state::VK_Level> Level;
/// The `active-level` describes which of the parallel levels counted by the
/// `level-var` is active. There can only be one.
///
/// active-level-var is 1 if ActiveLevelVar is not 0, otherwise it is 0.
inline state::Value<uint32_t, state::VK_ActiveLevel> ActiveLevel;
/// TODO
inline state::Value<uint32_t, state::VK_MaxActiveLevels> MaxActiveLevels;
/// TODO
inline state::Value<uint32_t, state::VK_RunSched> RunSched;
} // namespace icv
} // namespace ompx
#pragma omp end declare target
#endif