[OpenMP] Rework handling of global ctor/dtors in OpenMP (#71739)

Summary:
This patch reworks how we handle global constructors and destructors in
OpenMP. Previously, we emitted individual kernels that were each registered
and called separately. In order to provide more generic support, this
patch moves all handling of this to the target backend and the runtime
plugin. This has the benefit of supporting the GNU extensions for
constructors and destructors, removing a class of failures related to
shared library destruction order, and allowing targets other than OpenMP
to use the same support without needing to change the frontend.

This is primarily done by calling kernels that the backend emits to
iterate over a list of ctor / dtor functions. For x64, this is automatic and
we get it for free with the standard `dlopen` handling. For AMDGPU, we
emit `amdgcn.device.init` and `amdgcn.device.fini` kernels which
handle everything automatically and simply need to be called. For NVPTX,
a patch https://github.com/llvm/llvm-project/pull/71549 provides the
kernels to call, but the runtime needs to set up the array manually by
pulling out all the known constructor / destructor functions.
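
Conceptually, each of these kernels simply walks an array of constructor or
destructor function pointers. Below is a minimal sketch of the device-side
logic only, assuming the standard `__init_array_start` / `__init_array_end`
symbols that the runtime populates for NVPTX; the real kernels are produced
by the 'amdgpu-lower-ctor-dtor' and 'nvptx-lower-ctor-dtor' backend passes:

```
// Sketch only; not the literal code the backend emits.
using InitFnTy = void (*)();
extern InitFnTy __init_array_start[];
extern InitFnTy __init_array_end[];

extern "C" void device_init() {
  // Invoke each registered constructor in priority order.
  for (InitFnTy *F = __init_array_start; F != __init_array_end; ++F)
    (*F)();
}
```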

One concession this patch requires is that, for GPU targets in OpenMP
offloading, we will use `llvm.global_dtors` instead of `atexit`. This is
because `atexit` is a separate runtime function that does not mesh well
with the handling we're trying to do here. This should be equivalent in all
cases except those where the destructor must be registered dynamically, such
as a function-local static:

```
void foo();
struct S { ~S() { foo(); } };
void foo() {
  static S s;
}
```

However, this pattern is already broken in many other ways on the GPU, so
this change does not regress any support; it simply increases the scope of
what we can handle.
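
By contrast, the destructor of a namespace-scope global needs no `atexit`
registration and is handled by the new path; the AMDGPU test below checks
exactly this shape via `__dtor_A` and `llvm.global_dtors`:

```
struct S { ~S() {} };
S s; // dtor stub ends up in 'llvm.global_dtors' rather than 'atexit'
```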

This changes the handling of ctors / dtors for offloading entries, so the
runtime now prints an informational message about the deprecation if the old
format is used. Support for the old format will be removed completely in a
later release.
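
With the old format, libomptarget now prints a message of roughly this form
(taken from the `MESSAGE` calls in the runtime change below; the exact
prefix depends on the `MESSAGE` macro):

```
WARNING: Calling deprecated constructor for entry <name> will be removed in a future release
```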

Depends on: https://github.com/llvm/llvm-project/pull/71549
Joseph Huber 2023-11-10 14:53:53 -06:00, committed by GitHub
parent 133bcacecf
commit 237adfca4e
20 changed files with 317 additions and 216 deletions

View File

@ -597,6 +597,9 @@ public:
return !requiresStrictPrototypes() && !OpenCL;
}
/// Returns true if the language supports calling the 'atexit' function.
bool hasAtExit() const { return !(OpenMP && OpenMPIsTargetDevice); }
/// Returns true if implicit int is part of the language requirements.
bool isImplicitIntRequired() const { return !CPlusPlus && !C99; }

View File

@ -327,6 +327,15 @@ void CodeGenFunction::registerGlobalDtorWithAtExit(const VarDecl &VD,
registerGlobalDtorWithAtExit(dtorStub);
}
/// Register a global destructor using the LLVM 'llvm.global_dtors' global.
void CodeGenFunction::registerGlobalDtorWithLLVM(const VarDecl &VD,
llvm::FunctionCallee Dtor,
llvm::Constant *Addr) {
// Create a function which calls the destructor.
llvm::Function *dtorStub = createAtExitStub(VD, Dtor, Addr);
CGM.AddGlobalDtor(dtorStub);
}
void CodeGenFunction::registerGlobalDtorWithAtExit(llvm::Constant *dtorStub) {
// extern "C" int atexit(void (*f)(void));
assert(dtorStub->getType() ==
@ -519,10 +528,6 @@ CodeGenModule::EmitCXXGlobalVarDeclInitFunc(const VarDecl *D,
D->hasAttr<CUDASharedAttr>()))
return;
if (getLangOpts().OpenMP &&
getOpenMPRuntime().emitDeclareTargetVarDefinition(D, Addr, PerformInit))
return;
// Check if we've already initialized this decl.
auto I = DelayedCXXInitPosition.find(D);
if (I != DelayedCXXInitPosition.end() && I->second == ~0U)

View File

@ -1747,136 +1747,6 @@ llvm::Function *CGOpenMPRuntime::emitThreadPrivateVarDefinition(
return nullptr;
}
bool CGOpenMPRuntime::emitDeclareTargetVarDefinition(const VarDecl *VD,
llvm::GlobalVariable *Addr,
bool PerformInit) {
if (CGM.getLangOpts().OMPTargetTriples.empty() &&
!CGM.getLangOpts().OpenMPIsTargetDevice)
return false;
std::optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD);
if (!Res || *Res == OMPDeclareTargetDeclAttr::MT_Link ||
((*Res == OMPDeclareTargetDeclAttr::MT_To ||
*Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
HasRequiresUnifiedSharedMemory))
return CGM.getLangOpts().OpenMPIsTargetDevice;
VD = VD->getDefinition(CGM.getContext());
assert(VD && "Unknown VarDecl");
if (!DeclareTargetWithDefinition.insert(CGM.getMangledName(VD)).second)
return CGM.getLangOpts().OpenMPIsTargetDevice;
QualType ASTTy = VD->getType();
SourceLocation Loc = VD->getCanonicalDecl()->getBeginLoc();
// Produce the unique prefix to identify the new target regions. We use
// the source location of the variable declaration which we know to not
// conflict with any target region.
llvm::TargetRegionEntryInfo EntryInfo =
getEntryInfoFromPresumedLoc(CGM, OMPBuilder, Loc, VD->getName());
SmallString<128> Buffer, Out;
OMPBuilder.OffloadInfoManager.getTargetRegionEntryFnName(Buffer, EntryInfo);
const Expr *Init = VD->getAnyInitializer();
if (CGM.getLangOpts().CPlusPlus && PerformInit) {
llvm::Constant *Ctor;
llvm::Constant *ID;
if (CGM.getLangOpts().OpenMPIsTargetDevice) {
// Generate function that re-emits the declaration's initializer into
// the threadprivate copy of the variable VD
CodeGenFunction CtorCGF(CGM);
const CGFunctionInfo &FI = CGM.getTypes().arrangeNullaryFunction();
llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FI);
llvm::Function *Fn = CGM.CreateGlobalInitOrCleanUpFunction(
FTy, Twine(Buffer, "_ctor"), FI, Loc, false,
llvm::GlobalValue::WeakODRLinkage);
Fn->setVisibility(llvm::GlobalValue::ProtectedVisibility);
if (CGM.getTriple().isAMDGCN())
Fn->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
auto NL = ApplyDebugLocation::CreateEmpty(CtorCGF);
CtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, Fn, FI,
FunctionArgList(), Loc, Loc);
auto AL = ApplyDebugLocation::CreateArtificial(CtorCGF);
llvm::Constant *AddrInAS0 = Addr;
if (Addr->getAddressSpace() != 0)
AddrInAS0 = llvm::ConstantExpr::getAddrSpaceCast(
Addr, llvm::PointerType::get(CGM.getLLVMContext(), 0));
CtorCGF.EmitAnyExprToMem(Init,
Address(AddrInAS0, Addr->getValueType(),
CGM.getContext().getDeclAlign(VD)),
Init->getType().getQualifiers(),
/*IsInitializer=*/true);
CtorCGF.FinishFunction();
Ctor = Fn;
ID = Fn;
} else {
Ctor = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
llvm::GlobalValue::PrivateLinkage,
llvm::Constant::getNullValue(CGM.Int8Ty), Twine(Buffer, "_ctor"));
ID = Ctor;
}
// Register the information for the entry associated with the constructor.
Out.clear();
auto CtorEntryInfo = EntryInfo;
CtorEntryInfo.ParentName = Twine(Buffer, "_ctor").toStringRef(Out);
OMPBuilder.OffloadInfoManager.registerTargetRegionEntryInfo(
CtorEntryInfo, Ctor, ID,
llvm::OffloadEntriesInfoManager::OMPTargetRegionEntryCtor);
}
if (VD->getType().isDestructedType() != QualType::DK_none) {
llvm::Constant *Dtor;
llvm::Constant *ID;
if (CGM.getLangOpts().OpenMPIsTargetDevice) {
// Generate function that emits destructor call for the threadprivate
// copy of the variable VD
CodeGenFunction DtorCGF(CGM);
const CGFunctionInfo &FI = CGM.getTypes().arrangeNullaryFunction();
llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FI);
llvm::Function *Fn = CGM.CreateGlobalInitOrCleanUpFunction(
FTy, Twine(Buffer, "_dtor"), FI, Loc, false,
llvm::GlobalValue::WeakODRLinkage);
Fn->setVisibility(llvm::GlobalValue::ProtectedVisibility);
if (CGM.getTriple().isAMDGCN())
Fn->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
auto NL = ApplyDebugLocation::CreateEmpty(DtorCGF);
DtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, Fn, FI,
FunctionArgList(), Loc, Loc);
// Create a scope with an artificial location for the body of this
// function.
auto AL = ApplyDebugLocation::CreateArtificial(DtorCGF);
llvm::Constant *AddrInAS0 = Addr;
if (Addr->getAddressSpace() != 0)
AddrInAS0 = llvm::ConstantExpr::getAddrSpaceCast(
Addr, llvm::PointerType::get(CGM.getLLVMContext(), 0));
DtorCGF.emitDestroy(Address(AddrInAS0, Addr->getValueType(),
CGM.getContext().getDeclAlign(VD)),
ASTTy, DtorCGF.getDestroyer(ASTTy.isDestructedType()),
DtorCGF.needsEHCleanup(ASTTy.isDestructedType()));
DtorCGF.FinishFunction();
Dtor = Fn;
ID = Fn;
} else {
Dtor = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
llvm::GlobalValue::PrivateLinkage,
llvm::Constant::getNullValue(CGM.Int8Ty), Twine(Buffer, "_dtor"));
ID = Dtor;
}
// Register the information for the entry associated with the destructor.
Out.clear();
auto DtorEntryInfo = EntryInfo;
DtorEntryInfo.ParentName = Twine(Buffer, "_dtor").toStringRef(Out);
OMPBuilder.OffloadInfoManager.registerTargetRegionEntryInfo(
DtorEntryInfo, Dtor, ID,
llvm::OffloadEntriesInfoManager::OMPTargetRegionEntryDtor);
}
return CGM.getLangOpts().OpenMPIsTargetDevice;
}
void CGOpenMPRuntime::emitDeclareTargetFunction(const FunctionDecl *FD,
llvm::GlobalValue *GV) {
std::optional<OMPDeclareTargetDeclAttr *> ActiveAttr =

View File

@ -1089,14 +1089,6 @@ public:
SourceLocation Loc, bool PerformInit,
CodeGenFunction *CGF = nullptr);
/// Emit a code for initialization of declare target variable.
/// \param VD Declare target variable.
/// \param Addr Address of the global variable \a VD.
/// \param PerformInit true if initialization expression is not constant.
virtual bool emitDeclareTargetVarDefinition(const VarDecl *VD,
llvm::GlobalVariable *Addr,
bool PerformInit);
/// Emit code for handling declare target functions in the runtime.
/// \param FD Declare target function.
/// \param Addr Address of the global \a FD.

View File

@ -4536,6 +4536,11 @@ public:
void registerGlobalDtorWithAtExit(const VarDecl &D, llvm::FunctionCallee fn,
llvm::Constant *addr);
/// Registers the dtor using 'llvm.global_dtors' for platforms that do not
/// support an 'atexit()' function.
void registerGlobalDtorWithLLVM(const VarDecl &D, llvm::FunctionCallee fn,
llvm::Constant *addr);
/// Call atexit() with function dtorStub.
void registerGlobalDtorWithAtExit(llvm::Constant *dtorStub);

View File

@ -1570,6 +1570,13 @@ public:
const VarDecl *D,
ForDefinition_t IsForDefinition = NotForDefinition);
// FIXME: Hardcoding priority here is gross.
void AddGlobalCtor(llvm::Function *Ctor, int Priority = 65535,
unsigned LexOrder = ~0U,
llvm::Constant *AssociatedData = nullptr);
void AddGlobalDtor(llvm::Function *Dtor, int Priority = 65535,
bool IsDtorAttrFunc = false);
private:
llvm::Constant *GetOrCreateLLVMFunction(
StringRef MangledName, llvm::Type *Ty, GlobalDecl D, bool ForVTable,
@ -1641,13 +1648,6 @@ private:
void EmitPointerToInitFunc(const VarDecl *VD, llvm::GlobalVariable *Addr,
llvm::Function *InitFunc, InitSegAttr *ISA);
// FIXME: Hardcoding priority here is gross.
void AddGlobalCtor(llvm::Function *Ctor, int Priority = 65535,
unsigned LexOrder = ~0U,
llvm::Constant *AssociatedData = nullptr);
void AddGlobalDtor(llvm::Function *Dtor, int Priority = 65535,
bool IsDtorAttrFunc = false);
/// EmitCtorList - Generates a global array of functions and priorities using
/// the given list and name. This array will have appending linkage and is
/// suitable for use as a LLVM constructor or destructor array. Clears Fns.

View File

@ -2794,6 +2794,14 @@ void ItaniumCXXABI::registerGlobalDtor(CodeGenFunction &CGF, const VarDecl &D,
if (D.isNoDestroy(CGM.getContext()))
return;
// OpenMP offloading supports C++ constructors and destructors but we do not
// always have 'atexit' available. Instead lower these to use the LLVM global
// destructors which we can handle directly in the runtime. Note that this is
// not strictly 1-to-1 with using `atexit` because we no longer tear down
// globals in reverse order of when they were constructed.
if (!CGM.getLangOpts().hasAtExit() && !D.isStaticLocal())
return CGF.registerGlobalDtorWithLLVM(D, dtor, addr);
// emitGlobalDtorWithCXAAtExit will emit a call to either __cxa_thread_atexit
// or __cxa_atexit depending on whether this VarDecl is a thread-local storage
// or not. CXAAtExit controls only __cxa_atexit, so use it if it is enabled.

View File

@ -35,7 +35,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
#pragma omp end declare target
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_fabsf_f32_l14_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init
// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca float, align 4, addrspace(5)
@ -49,7 +49,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_fabs_f32_l15_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.1
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I_I:%.*]] = alloca float, align 4, addrspace(5)
@ -69,7 +69,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_sinf_f32_l17_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.2
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca float, align 4, addrspace(5)
@ -83,7 +83,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_sin_f32_l18_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.3
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I_I:%.*]] = alloca float, align 4, addrspace(5)
@ -103,7 +103,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_cosf_f32_l20_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.4
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca float, align 4, addrspace(5)
@ -117,7 +117,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_cos_f32_l21_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.5
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I_I:%.*]] = alloca float, align 4, addrspace(5)
@ -137,7 +137,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_fmaf_f32_l23_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.6
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca float, align 4, addrspace(5)
@ -159,7 +159,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_fma_f32_l24_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.7
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I_I:%.*]] = alloca float, align 4, addrspace(5)
@ -195,7 +195,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_min_f32_l27_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.8
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca float, align 4, addrspace(5)
@ -213,7 +213,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_max_f32_l28_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.9
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca float, align 4, addrspace(5)
@ -231,7 +231,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_fmin_f32_l30_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.10
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CALL:%.*]] = call noundef float @_Z4fminff(float noundef 2.000000e+00, float noundef -4.000000e+00) #[[ATTR4:[0-9]+]]
@ -239,7 +239,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_fmax_f32_l31_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.11
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CALL:%.*]] = call noundef float @_Z4fmaxff(float noundef 2.000000e+00, float noundef -4.000000e+00) #[[ATTR4]]
@ -247,7 +247,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_fminf_f32_l33_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.12
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca float, align 4, addrspace(5)
@ -265,7 +265,7 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_constexpr_fmaxf_f32_l34_ctor
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init.13
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[RETVAL_I:%.*]] = alloca float, align 4, addrspace(5)
@ -282,3 +282,23 @@ const float constexpr_fmaxf_f32 = fmaxf(2.0f, -4.0f);
// CHECK-NEXT: store float [[TMP2]], ptr addrspacecast (ptr addrspace(1) @_ZL19constexpr_fmaxf_f32 to ptr), align 4
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_amdgcn_openmp_device_math_constexpr.cpp
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: call void @__cxx_global_var_init()
// CHECK-NEXT: call void @__cxx_global_var_init.1()
// CHECK-NEXT: call void @__cxx_global_var_init.2()
// CHECK-NEXT: call void @__cxx_global_var_init.3()
// CHECK-NEXT: call void @__cxx_global_var_init.4()
// CHECK-NEXT: call void @__cxx_global_var_init.5()
// CHECK-NEXT: call void @__cxx_global_var_init.6()
// CHECK-NEXT: call void @__cxx_global_var_init.7()
// CHECK-NEXT: call void @__cxx_global_var_init.8()
// CHECK-NEXT: call void @__cxx_global_var_init.9()
// CHECK-NEXT: call void @__cxx_global_var_init.10()
// CHECK-NEXT: call void @__cxx_global_var_init.11()
// CHECK-NEXT: call void @__cxx_global_var_init.12()
// CHECK-NEXT: call void @__cxx_global_var_init.13()
// CHECK-NEXT: ret void
//

View File

@ -1,4 +1,4 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
@ -20,7 +20,11 @@ S A;
#pragma omp end declare target
#endif
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_A_l19_ctor
//.
// CHECK: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_amdgcn_target_global_constructor.cpp, ptr null }]
// CHECK: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @__dtor_A, ptr null }]
//.
// CHECK-LABEL: define {{[^@]+}}@__cxx_global_var_init
// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: call void @_ZN1SC1Ev(ptr noundef nonnull align 4 dereferenceable(4) addrspacecast (ptr addrspace(1) @A to ptr)) #[[ATTR3:[0-9]+]]
@ -38,13 +42,6 @@ S A;
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_A_l19_dtor
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: call void @_ZN1SD1Ev(ptr noundef nonnull align 4 dereferenceable(4) addrspacecast (ptr addrspace(1) @A to ptr)) #[[ATTR4:[0-9]+]]
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_ZN1SD1Ev
// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]]) unnamed_addr #[[ATTR1]] comdat align 2 {
// CHECK-NEXT: entry:
@ -52,7 +49,14 @@ S A;
// CHECK-NEXT: [[THIS_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[THIS_ADDR]] to ptr
// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR_ASCAST]], align 8
// CHECK-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4]]
// CHECK-NEXT: call void @_ZN1SD2Ev(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]]) #[[ATTR4:[0-9]+]]
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@__dtor_A
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: call void @_ZN1SD1Ev(ptr addrspacecast (ptr addrspace(1) @A to ptr))
// CHECK-NEXT: ret void
//
//
@ -78,3 +82,9 @@ S A;
// CHECK-NEXT: call void @_Z3foov() #[[ATTR3]]
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@_GLOBAL__sub_I_amdgcn_target_global_constructor.cpp
// CHECK-SAME: () #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: call void @__cxx_global_var_init()
// CHECK-NEXT: ret void

View File

@ -52,7 +52,6 @@
// CHECK-DAG: define {{.*}}i32 @{{.*}}{{foo|bar|baz2|baz3|FA|f_method}}{{.*}}()
// CHECK-DAG: define {{.*}}void @{{.*}}TemplateClass{{.*}}(ptr {{[^,]*}} %{{.*}})
// CHECK-DAG: define {{.*}}i32 @{{.*}}TemplateClass{{.*}}f_method{{.*}}(ptr {{[^,]*}} %{{.*}})
// CHECK-DAG: define {{.*}}void @__omp_offloading_{{.*}}_globals_l[[@LINE+89]]_ctor()
#ifndef HEADER
#define HEADER

View File

@ -20,6 +20,9 @@
// HOST-DAG: @[[C_ADDR:.+]] = internal global i32 0,
// HOST-DAG: @[[CD_ADDR:.+]] ={{( protected | dso_local)?}} global %struct.S zeroinitializer,
// DEVICE-DAG: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @[[CTOR:.+]], ptr null }]
// DEVICE-DAG: @llvm.global_dtors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @[[DTOR:.+]], ptr null }]
#pragma omp declare target
int foo() { return 0; }
#pragma omp end declare target
@ -43,12 +46,6 @@ int caz() { return 0; }
static int c = foo() + bar() + baz();
#pragma omp declare target (c)
// HOST-DAG: @[[C_CTOR:__omp_offloading_.+_c_l44_ctor]] = private constant i8 0
// DEVICE-DAG: define weak_odr protected void [[C_CTOR:@__omp_offloading_.+_c_l44_ctor]]()
// DEVICE-DAG: call noundef i32 [[FOO]]()
// DEVICE-DAG: call noundef i32 [[BAR]]()
// DEVICE-DAG: call noundef i32 [[BAZ]]()
// DEVICE-DAG: ret void
struct S {
int a;
@ -60,26 +57,7 @@ struct S {
#pragma omp declare target
S cd = doo() + car() + caz() + baz();
#pragma omp end declare target
// HOST-DAG: @[[CD_CTOR:__omp_offloading_.+_cd_l61_ctor]] = private constant i8 0
// DEVICE-DAG: define weak_odr protected void [[CD_CTOR:@__omp_offloading_.+_cd_l61_ctor]]()
// DEVICE-DAG: call noundef i32 [[DOO]]()
// DEVICE-DAG: call noundef i32 [[CAR]]()
// DEVICE-DAG: call noundef i32 [[CAZ]]()
// DEVICE-DAG: ret void
// HOST-DAG: @[[CD_DTOR:__omp_offloading_.+_cd_l61_dtor]] = private constant i8 0
// DEVICE-DAG: define weak_odr protected void [[CD_DTOR:@__omp_offloading_.+_cd_l61_dtor]]()
// DEVICE-DAG: call void
// DEVICE-DAG: ret void
// HOST-DAG: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [{{[0-9]+}} x i8] c"[[CD_ADDR]]\00"
// HOST-DAG: @.omp_offloading.entry.[[CD_ADDR]] = weak{{.*}} constant %struct.__tgt_offload_entry { ptr @[[CD_ADDR]], ptr @.omp_offloading.entry_name{{.*}}, i64 4, i32 0, i32 0 }, section "omp_offloading_entries", align 1
// HOST-DAG: @.omp_offloading.entry_name{{.*}} = internal unnamed_addr constant [{{[0-9]+}} x i8] c"[[C_CTOR]]\00"
// HOST-DAG: @.omp_offloading.entry.[[C_CTOR]] = weak{{.*}} constant %struct.__tgt_offload_entry { ptr @[[C_CTOR]], ptr @.omp_offloading.entry_name{{.*}}, i64 0, i32 2, i32 0 }, section "omp_offloading_entries", align 1
// HOST-DAG: @.omp_offloading.entry_name{{.*}}= internal unnamed_addr constant [{{[0-9]+}} x i8] c"[[CD_CTOR]]\00"
// HOST-DAG: @.omp_offloading.entry.[[CD_CTOR]] = weak{{.*}} constant %struct.__tgt_offload_entry { ptr @[[CD_CTOR]], ptr @.omp_offloading.entry_name{{.*}}, i64 0, i32 2, i32 0 }, section "omp_offloading_entries", align 1
// HOST-DAG: @.omp_offloading.entry_name{{.*}}= internal unnamed_addr constant [{{[0-9]+}} x i8] c"[[CD_DTOR]]\00"
// HOST-DAG: @.omp_offloading.entry.[[CD_DTOR]] = weak{{.*}} constant %struct.__tgt_offload_entry { ptr @[[CD_DTOR]], ptr @.omp_offloading.entry_name{{.*}}, i64 0, i32 4, i32 0 }, section "omp_offloading_entries", align 1
int maini1() {
int a;
#pragma omp target map(tofrom : a)
@ -100,10 +78,5 @@ int maini1() {
// HOST-DAG: !{i32 1, !"[[CD_ADDR]]", i32 0, i32 {{[0-9]+}}}
// HOST-DAG: !{i32 1, !"[[C_ADDR]]", i32 0, i32 {{[0-9]+}}}
// DEVICE: !nvvm.annotations
// DEVICE-DAG: !{ptr [[C_CTOR]], !"kernel", i32 1}
// DEVICE-DAG: !{ptr [[CD_CTOR]], !"kernel", i32 1}
// DEVICE-DAG: !{ptr [[CD_DTOR]], !"kernel", i32 1}
#endif // HEADER

View File

@ -265,10 +265,6 @@ public:
enum OMPTargetRegionEntryKind : uint32_t {
/// Mark the entry as target region.
OMPTargetRegionEntryTargetRegion = 0x0,
/// Mark the entry as a global constructor.
OMPTargetRegionEntryCtor = 0x02,
/// Mark the entry as a global destructor.
OMPTargetRegionEntryDtor = 0x04,
};
/// Target region entries info.

View File

@ -789,14 +789,17 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
if (M.alias_size() && (STI.getPTXVersion() < 63 || STI.getSmVersion() < 30))
report_fatal_error(".alias requires PTX version >= 6.3 and sm_30");
// OpenMP supports NVPTX global constructors and destructors.
bool IsOpenMP = M.getModuleFlag("openmp") != nullptr;
if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_ctors")) &&
!LowerCtorDtor) {
!LowerCtorDtor && !IsOpenMP) {
report_fatal_error(
"Module has a nontrivial global ctor, which NVPTX does not support.");
return true; // error
}
if (!isEmptyXXStructor(M.getNamedGlobal("llvm.global_dtors")) &&
!LowerCtorDtor) {
!LowerCtorDtor && !IsOpenMP) {
report_fatal_error(
"Module has a nontrivial global dtor, which NVPTX does not support.");
return true; // error

View File

@ -1914,6 +1914,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
DeviceImageTy &Image) override {
return callGlobalCtorDtorCommon(Plugin, Image, "amdgcn.device.init");
}
virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
DeviceImageTy &Image) override {
return callGlobalCtorDtorCommon(Plugin, Image, "amdgcn.device.fini");
}
const uint64_t getStreamBusyWaitMicroseconds() const {
return OMPX_StreamBusyWait;
}
@ -2627,6 +2637,38 @@ private:
using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
/// Common method to invoke a single threaded constructor or destructor
/// kernel by name.
Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
const char *Name) {
// Perform a quick check for the named kernel in the image. The kernel
// should be created by the 'amdgpu-lower-ctor-dtor' pass.
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
GlobalTy Global(Name, sizeof(void *));
if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) {
consumeError(std::move(Err));
return Plugin::success();
}
// Allocate and construct the AMDGPU kernel.
AMDGPUKernelTy AMDGPUKernel(Name);
if (auto Err = AMDGPUKernel.init(*this, Image))
return std::move(Err);
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
KernelArgsTy KernelArgs = {};
if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
/*NumBlocks=*/1ul, KernelArgs,
/*Args=*/nullptr, AsyncInfoWrapper))
return std::move(Err);
Error Err = Plugin::success();
AsyncInfoWrapper.finalize(Err);
return std::move(Err);
}
/// Envar for controlling the number of HSA queues per device. High number of
/// queues may degrade performance.
UInt32Envar OMPX_NumQueues;

View File

@ -91,11 +91,6 @@ class GenericGlobalHandlerTy {
/// Map to store the ELF object files that have been loaded.
llvm::DenseMap<int32_t, ELF64LEObjectFile> ELFObjectFiles;
/// Get the cached ELF64LEObjectFile previously created for a specific
/// device image or create it if it did not exist.
const ELF64LEObjectFile *
getOrCreateELFObjectFile(const GenericDeviceTy &Device, DeviceImageTy &Image);
/// Extract the global's information from the ELF image, section, and symbol.
virtual Error getGlobalMetadataFromELF(const DeviceImageTy &Image,
const ELF64LE::Sym &Symbol,
@ -119,6 +114,11 @@ class GenericGlobalHandlerTy {
public:
virtual ~GenericGlobalHandlerTy() {}
/// Get the cached ELF64LEObjectFile previously created for a specific
/// device image or create it if it did not exist.
const ELF64LEObjectFile *
getOrCreateELFObjectFile(const GenericDeviceTy &Device, DeviceImageTy &Image);
/// Get the address and size of a global in the image. Address and size are
/// returned in \p ImageGlobal; the global name is passed in \p ImageGlobal.
Error getGlobalMetadataFromImage(GenericDeviceTy &Device,

View File

@ -717,6 +717,9 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
}
Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
for (DeviceImageTy *Image : LoadedImages)
if (auto Err = callGlobalDestructors(Plugin, *Image))
return std::move(Err);
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
@ -839,6 +842,10 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
}
#endif
// Call any global constructors present on the device.
if (auto Err = callGlobalConstructors(Plugin, *Image))
return std::move(Err);
// Return the pointer to the table of entries.
return Image->getOffloadEntryTable();
}

View File

@ -671,6 +671,20 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error synchronize(__tgt_async_info *AsyncInfo);
virtual Error synchronizeImpl(__tgt_async_info &AsyncInfo) = 0;
/// Invokes any global constructors on the device, if present and required
/// by the target.
virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
DeviceImageTy &Image) {
return Error::success();
}
/// Invokes any global destructors on the device, if present and required
/// by the target.
virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
DeviceImageTy &Image) {
return Error::success();
}
/// Query for the completion of the pending operations on the __tgt_async_info
/// structure in a non-blocking manner.
Error queryAsync(__tgt_async_info *AsyncInfo);

View File

@ -377,6 +377,16 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Plugin::success();
}
virtual Error callGlobalConstructors(GenericPluginTy &Plugin,
DeviceImageTy &Image) override {
return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/true);
}
virtual Error callGlobalDestructors(GenericPluginTy &Plugin,
DeviceImageTy &Image) override {
return callGlobalCtorDtorCommon(Plugin, Image, /*IsCtor=*/false);
}
/// Allocate and construct a CUDA kernel.
Expected<GenericKernelTy &>
constructKernel(const __tgt_offload_entry &KernelEntry) override {
@ -1038,6 +1048,106 @@ private:
using CUDAStreamManagerTy = GenericDeviceResourceManagerTy<CUDAStreamRef>;
using CUDAEventManagerTy = GenericDeviceResourceManagerTy<CUDAEventRef>;
Error callGlobalCtorDtorCommon(GenericPluginTy &Plugin, DeviceImageTy &Image,
bool IsCtor) {
const char *KernelName = IsCtor ? "nvptx$device$init" : "nvptx$device$fini";
// Perform a quick check for the named kernel in the image. The kernel
// should be created by the 'nvptx-lower-ctor-dtor' pass.
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
GlobalTy Global(KernelName, sizeof(void *));
if (auto Err = Handler.getGlobalMetadataFromImage(*this, Image, Global)) {
consumeError(std::move(Err));
return Plugin::success();
}
// The Nvidia backend cannot handle creating the ctor / dtor array
// automatically so we must create it ourselves. The backend will emit
// several globals that contain function pointers we can call. These are
// prefixed with a known name due to Nvidia's lack of section support.
const ELF64LEObjectFile *ELFObj =
Handler.getOrCreateELFObjectFile(*this, Image);
if (!ELFObj)
return Plugin::error("Unable to create ELF object for image %p",
Image.getStart());
// Search for all symbols that contain a constructor or destructor.
SmallVector<std::pair<StringRef, uint16_t>> Funcs;
for (ELFSymbolRef Sym : ELFObj->symbols()) {
auto NameOrErr = Sym.getName();
if (!NameOrErr)
return NameOrErr.takeError();
if (!NameOrErr->starts_with(IsCtor ? "__init_array_object_"
: "__fini_array_object_"))
continue;
uint16_t Priority;
if (NameOrErr->rsplit('_').second.getAsInteger(10, Priority))
return Plugin::error("Invalid priority for constructor or destructor");
Funcs.emplace_back(*NameOrErr, Priority);
}
// Sort the created array to be in priority order.
llvm::sort(Funcs, [=](auto x, auto y) { return x.second < y.second; });
// Allocate a buffer to store all of the known constructor / destructor
// functions in so we can iterate them on the device.
void *Buffer =
allocate(Funcs.size() * sizeof(void *), nullptr, TARGET_ALLOC_DEVICE);
if (!Buffer)
return Plugin::error("Failed to allocate memory for global buffer");
auto *GlobalPtrStart = reinterpret_cast<uintptr_t *>(Buffer);
auto *GlobalPtrStop = reinterpret_cast<uintptr_t *>(Buffer) + Funcs.size();
SmallVector<void *> FunctionPtrs(Funcs.size());
std::size_t Idx = 0;
for (auto [Name, Priority] : Funcs) {
GlobalTy FunctionAddr(Name.str(), sizeof(void *), &FunctionPtrs[Idx++]);
if (auto Err = Handler.readGlobalFromDevice(*this, Image, FunctionAddr))
return std::move(Err);
}
// Copy the local buffer to the device.
if (auto Err = dataSubmit(GlobalPtrStart, FunctionPtrs.data(),
FunctionPtrs.size() * sizeof(void *), nullptr))
return std::move(Err);
// Copy the created buffer to the appropriate symbols so the kernel can
// iterate through them.
GlobalTy StartGlobal(IsCtor ? "__init_array_start" : "__fini_array_start",
sizeof(void *), &GlobalPtrStart);
if (auto Err = Handler.writeGlobalToDevice(*this, Image, StartGlobal))
return std::move(Err);
GlobalTy StopGlobal(IsCtor ? "__init_array_end" : "__fini_array_end",
sizeof(void *), &GlobalPtrStop);
if (auto Err = Handler.writeGlobalToDevice(*this, Image, StopGlobal))
return std::move(Err);
CUDAKernelTy CUDAKernel(KernelName);
if (auto Err = CUDAKernel.init(*this, Image))
return std::move(Err);
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
KernelArgsTy KernelArgs = {};
if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
/*NumBlocks=*/1ul, KernelArgs, nullptr,
AsyncInfoWrapper))
return std::move(Err);
Error Err = Plugin::success();
AsyncInfoWrapper.finalize(Err);
if (free(Buffer, TARGET_ALLOC_DEVICE) != OFFLOAD_SUCCESS)
return Plugin::error("Failed to free memory for global buffer");
return std::move(Err);
}
/// Stream manager for CUDA streams.
CUDAStreamManagerTy CUDAStreamManager;

View File

@ -313,12 +313,18 @@ static void registerGlobalCtorsDtorsForImage(__tgt_bin_desc *Desc,
DP("Adding ctor " DPxMOD " to the pending list.\n",
DPxPTR(Entry->addr));
Device.PendingCtorsDtors[Desc].PendingCtors.push_back(Entry->addr);
MESSAGE("WARNING: Calling deprecated constructor for entry %s will be "
"removed in a future release \n",
Entry->name);
} else if (Entry->flags & OMP_DECLARE_TARGET_DTOR) {
// Dtors are pushed in reverse order so they are executed from end
// to beginning when unregistering the library!
DP("Adding dtor " DPxMOD " to the pending list.\n",
DPxPTR(Entry->addr));
Device.PendingCtorsDtors[Desc].PendingDtors.push_front(Entry->addr);
MESSAGE("WARNING: Calling deprecated destructor for entry %s will be "
"removed in a future release \n",
Entry->name);
}
if (Entry->flags & OMP_DECLARE_TARGET_LINK) {
@ -544,7 +550,8 @@ void RTLsTy::unregisterLib(__tgt_bin_desc *Desc) {
if (Device.PendingCtorsDtors[Desc].PendingCtors.empty()) {
AsyncInfoTy AsyncInfo(Device);
for (auto &Dtor : Device.PendingCtorsDtors[Desc].PendingDtors) {
int Rc = target(nullptr, Device, Dtor, CTorDTorKernelArgs, AsyncInfo);
int Rc =
target(nullptr, Device, Dtor, CTorDTorKernelArgs, AsyncInfo);
if (Rc != OFFLOAD_SUCCESS) {
DP("Running destructor " DPxMOD " failed.\n", DPxPTR(Dtor));
}

View File

@ -0,0 +1,37 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
// REQUIRES: libc
#include <stdio.h>
#pragma omp begin declare target device_type(nohost)
// CHECK: void ctor1()
// CHECK: void ctor2()
// CHECK: void ctor3()
[[gnu::constructor(101)]] void ctor1() { puts(__PRETTY_FUNCTION__); }
[[gnu::constructor(102)]] void ctor2() { puts(__PRETTY_FUNCTION__); }
[[gnu::constructor(103)]] void ctor3() { puts(__PRETTY_FUNCTION__); }
struct S {
S() { puts(__PRETTY_FUNCTION__); }
~S() { puts(__PRETTY_FUNCTION__); }
};
// CHECK: S::S()
// CHECK: S::~S()
S s;
// CHECK: void dtor3()
// CHECK: void dtor2()
// CHECK: void dtor1()
[[gnu::destructor(101)]] void dtor1() { puts(__PRETTY_FUNCTION__); }
[[gnu::destructor(103)]] void dtor3() { puts(__PRETTY_FUNCTION__); }
[[gnu::destructor(102)]] void dtor2() { puts(__PRETTY_FUNCTION__); }
#pragma omp end declare target
int main() {
#pragma omp target
;
}