[flang][cuda] Support non-allocatable module-level managed variables (#188526)

Add support for non-allocatable module-level CUDA managed variables
using pointer indirection through a companion global in
__nv_managed_data__. The CUDA runtime populates this pointer with the
unified memory address via __cudaRegisterManagedVar and
__cudaInitModule.

1. Create a .managed.ptr companion global in the __nv_managed_data__
section and register it with _FortranACUFRegisterManagedVariable
(CUFAddConstructor.cpp)
2. Call __cudaInitModule after registration to populate the managed
pointer (registration.cpp)
3. Annotate managed globals in gpu.module with nvvm.managed for PTX
.attribute(.managed) generation (cuda-code-gen.mlir)
4. Suppress cuf.data_transfer for assignments to/from non-allocatable
module managed variables, since cudaMemcpy would target the shadow
address rather than the actual unified memory (tools.h)
5. Preserve cuf.data_transfer for device_var = managed_var assignments
where explicit transfer is still required
This commit is contained in:
Zhen Wang 2026-03-31 09:27:08 -07:00 committed by GitHub
parent 0e0a0458ce
commit c4e6cf0abf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 293 additions and 23 deletions

View File

@ -27,6 +27,7 @@ extern void __cudaRegisterVar(void **fatCubinHandle, char *hostVar,
extern void __cudaRegisterManagedVar(void **fatCubinHandle,
void **hostVarPtrAddress, char *deviceAddress, const char *deviceName,
int ext, size_t size, int constant, int global);
extern char __cudaInitModule(void **fatCubinHandle);
void *RTDECL(CUFRegisterModule)(void *data) {
void **fatHandle{__cudaRegisterFatBinary(data)};
@ -50,6 +51,8 @@ void RTDEF(CUFRegisterManagedVariable)(
__cudaRegisterManagedVar(module, varSym, varName, varName, 0, size, 0, 0);
}
/// Runtime entry point that forwards the registered module handle to the
/// CUDA runtime's __cudaInitModule. The status value returned by
/// __cudaInitModule is intentionally not inspected.
void RTDEF(CUFInitModule)(void **module) {
  (void)__cudaInitModule(module);
}
} // extern "C"
} // namespace Fortran::runtime::cuda

View File

@ -1311,6 +1311,28 @@ inline bool IsCUDAManagedOrUnifiedSymbol(const Symbol &sym) {
return false;
}
// Returns true when `sym` ultimately resolves to a module-level CUDA
// managed/unified variable that is neither ALLOCATABLE nor POINTER.
//
// Such variables use pointer indirection through a companion global in
// __nv_managed_data__. Explicit data transfers (cudaMemcpy) must be avoided
// for these variables since they would target the shadow address rather
// than the actual unified memory address.
//
// ALLOCATABLE and POINTER variables are both excluded: the lowering only
// sets up the companion pointer for globals that are not descriptor (box)
// based, so descriptor-based managed variables must keep going through the
// regular data-transfer analysis.
// NOTE(review): assumes module-level POINTER variables are lowered as
// descriptors, like ALLOCATABLE ones -- confirm against the lowering.
inline bool IsNonAllocatableModuleCUDAManagedSymbol(const Symbol &sym) {
  const Symbol &ultimate{sym.GetUltimate()};
  if (!IsCUDAManagedOrUnifiedSymbol(ultimate)) {
    return false;
  }
  // Descriptor-based variables do not get a companion pointer global.
  if (ultimate.attrs().test(semantics::Attr::ALLOCATABLE) ||
      ultimate.attrs().test(semantics::Attr::POINTER)) {
    return false;
  }
  // Only variables owned directly by a module scope qualify.
  return ultimate.owner().IsModule();
}
// Returns true if at least one of the CUDA symbols collected from `expr`
// is a non-allocatable module-level managed/unified variable, as decided by
// IsNonAllocatableModuleCUDAManagedSymbol.
template <typename A>
inline bool HasNonAllocatableModuleCUDAManagedSymbols(const A &expr) {
  for (const Symbol &candidate : CollectCudaSymbols(expr)) {
    if (IsNonAllocatableModuleCUDAManagedSymbol(candidate)) {
      return true;
    }
  }
  return false;
}
// Get the number of distinct symbols with CUDA device
// attribute in the expression.
template <typename A> inline int GetNbOfCUDADeviceSymbols(const A &expr) {
@ -1350,17 +1372,21 @@ inline bool IsCUDADataTransfer(const A &lhs, const B &rhs) {
int rhsNbManagedSymbols{GetNbOfCUDAManagedOrUnifiedSymbols(rhs)};
int rhsNbSymbols{GetNbOfCUDADeviceSymbols(rhs)};
if (HasNonAllocatableModuleCUDAManagedSymbols(lhs))
return false;
if (lhsNbManagedSymbols >= 1 && lhs.Rank() > 0 && rhsNbSymbols == 0 &&
rhsNbManagedSymbols == 0 && (IsVariable(rhs) || IsConstantExpr(rhs))) {
return true; // Managed arrays initialization is performed on the device.
}
// Special cases performed on the host:
// - Only managed or unified symbols are involved on RHS and LHS.
// - LHS is managed or unified and the RHS is host only.
// Cases where no explicit data transfer is needed:
// - Both sides involve only managed/unified symbols (host-accessible).
// - LHS is host-only and RHS has only managed/unified symbols.
// - LHS is managed/unified and RHS is host-only.
if ((lhsNbManagedSymbols >= 1 && rhsNbManagedSymbols == rhsNbSymbols) ||
(lhsNbManagedSymbols == 0 && rhsNbManagedSymbols >= 1 &&
rhsNbManagedSymbols == rhsNbSymbols) ||
(lhsNbManagedSymbols == 0 && !HasCUDADeviceAttrs(lhs) &&
rhsNbManagedSymbols >= 1 && rhsNbManagedSymbols == rhsNbSymbols) ||
(lhsNbManagedSymbols >= 1 && rhsNbSymbols == 0)) {
return false;
}

View File

@ -32,6 +32,11 @@ void RTDECL(CUFRegisterVariable)(
void RTDECL(CUFRegisterManagedVariable)(
void **module, void **varSym, char *varName, int64_t size);
/// Initialize a CUDA module after all variables have been registered.
/// Triggers the runtime to populate managed variable pointers with
/// unified memory addresses.
void RTDECL(CUFInitModule)(void **module);
} // extern "C"
} // namespace Fortran::runtime::cuda

View File

@ -3448,6 +3448,15 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
g.setAddrSpace(
static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Constant));
if (gpuMod && global.getDataAttr() &&
*global.getDataAttr() == cuf::DataAttribute::Managed &&
!mlir::isa<fir::BaseBoxType>(global.getType())) {
g.setAddrSpace(
static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Global));
g->setAttr(mlir::NVVM::NVVMDialect::getManagedAttrName(),
mlir::UnitAttr::get(global.getContext()));
}
rewriter.eraseOp(global);
return mlir::success();
}

View File

@ -41,6 +41,40 @@ namespace {
static constexpr llvm::StringRef cudaFortranCtorName{
"__cudaFortranConstructor"};
static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"};
/// Build the companion pointer global for the managed global `globalOp`.
///
/// The new global is named `<globalOp symbol name>.managed.ptr`, has type
/// `!fir.llvm_ptr<i8>`, internal linkage, is placed in the
/// `__nv_managed_data__` section, and is zero-initialized. It is inserted
/// right after `globalOp`; the builder's insertion point is restored on
/// return. The CUDA runtime populates this pointer with the unified memory
/// address when the module is initialized via __cudaInitModule.
///
/// \param builder  builder used to create the operations.
/// \param mod      module providing the MLIR context.
/// \param globalOp managed global for which the companion is created.
/// \returns the newly created pointer global.
static fir::GlobalOp createManagedPointerGlobal(fir::FirOpBuilder &builder,
                                                mlir::ModuleOp mod,
                                                fir::GlobalOp globalOp) {
  mlir::MLIRContext *context = mod.getContext();
  mlir::Location globalLoc = globalOp.getLoc();

  // Name and type of the companion: "<sym>.managed.ptr" : llvm_ptr<i8>.
  std::string companionName =
      (globalOp.getSymName() + managedPtrSuffix).str();
  auto bytePtrTy =
      fir::LLVMPointerType::get(context, mlir::IntegerType::get(context, 8));

  // The section attribute places the pointer in __nv_managed_data__.
  llvm::SmallVector<mlir::NamedAttribute> extraAttrs;
  extraAttrs.push_back(mlir::NamedAttribute(
      mlir::StringAttr::get(context, "section"),
      mlir::StringAttr::get(context, "__nv_managed_data__")));

  // Everything below moves the insertion point; restore it when leaving.
  mlir::OpBuilder::InsertionGuard restoreInsertionPoint(builder);
  builder.setInsertionPointAfter(globalOp);

  // No initial-value attribute: the initializer region below provides it.
  mlir::DenseElementsAttr noInitValue = {};
  auto internalLinkage = builder.createInternalLinkage();
  auto companion = fir::GlobalOp::create(
      builder, globalLoc, companionName, /*isConstant=*/false,
      /*isTarget=*/false, bytePtrTy, noInitValue,
      /*linkName=*/internalLinkage, extraAttrs);

  // Initializer region: yield a null pointer.
  mlir::Block *initBlock = builder.createBlock(&companion.getRegion());
  builder.setInsertionPointToStart(initBlock);
  mlir::Value nullPtr = fir::ZeroOp::create(builder, globalLoc, bytePtrTy);
  fir::HasValueOp::create(builder, globalLoc, nullPtr);
  return companion;
}
struct CUFAddConstructor
: public fir::impl::CUFAddConstructorBase<CUFAddConstructor> {
@ -108,19 +142,15 @@ struct CUFAddConstructor
if (!attr)
continue;
if (attr.getValue() == cuf::DataAttribute::Managed &&
!mlir::isa<fir::BaseBoxType>(globalOp.getType()))
TODO(loc, "registration of non-allocatable managed variables");
bool isNonAllocManagedGlobal =
attr.getValue() == cuf::DataAttribute::Managed &&
!mlir::isa<fir::BaseBoxType>(globalOp.getType());
mlir::func::FuncOp func;
switch (attr.getValue()) {
case cuf::DataAttribute::Device:
case cuf::DataAttribute::Constant:
case cuf::DataAttribute::Managed: {
func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
loc, builder);
auto fTy = func.getFunctionType();
// Global variable name
std::string gblNameStr = globalOp.getSymbol().getValue().str();
gblNameStr += '\0';
@ -141,18 +171,44 @@ struct CUFAddConstructor
}
auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
// Global variable address
mlir::Value addr = fir::AddrOfOp::create(
builder, loc, globalOp.resultType(), globalOp.getSymbol());
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
fir::CallOp::create(builder, loc, func, args);
if (isNonAllocManagedGlobal) {
// Non-allocatable managed globals use pointer indirection:
// a companion pointer in __nv_managed_data__ holds the unified
// memory address, registered via __cudaRegisterManagedVar.
fir::GlobalOp ptrGlobal =
createManagedPointerGlobal(builder, mod, globalOp);
func = fir::runtime::getRuntimeFunc<mkRTKey(
CUFRegisterManagedVariable)>(loc, builder);
auto fTy = func.getFunctionType();
mlir::Value addr = fir::AddrOfOp::create(
builder, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
fir::CallOp::create(builder, loc, func, args);
} else {
func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
loc, builder);
auto fTy = func.getFunctionType();
mlir::Value addr = fir::AddrOfOp::create(
builder, loc, globalOp.resultType(), globalOp.getSymbol());
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
fir::CallOp::create(builder, loc, func, args);
}
} break;
default:
break;
}
}
// Initialize the module after all variables are registered so the
// runtime populates managed variable unified memory pointers.
mlir::func::FuncOp initFunc =
fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc, builder);
auto initFTy = initFunc.getFunctionType();
llvm::SmallVector<mlir::Value> initArgs{
fir::runtime::createArguments(builder, loc, initFTy, registeredMod)};
fir::CallOp::create(builder, loc, initFunc, initArgs);
}
mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{});

View File

@ -13,6 +13,7 @@
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Runtime/CUDA/descriptor.h"
@ -48,6 +49,8 @@ static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter,
return val;
}
static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"};
struct CUFDeviceAddressOpConversion
: public mlir::OpRewritePattern<cuf::DeviceAddressOp> {
using OpRewritePattern::OpRewritePattern;
@ -59,10 +62,25 @@ struct CUFDeviceAddressOpConversion
mlir::LogicalResult
matchAndRewrite(cuf::DeviceAddressOp op,
mlir::PatternRewriter &rewriter) const override {
if (auto global = symTab.lookup<fir::GlobalOp>(
op.getHostSymbol().getRootReference().getValue())) {
auto symName = op.getHostSymbol().getRootReference().getValue();
if (auto global = symTab.lookup<fir::GlobalOp>(symName)) {
auto mod = op->getParentOfType<mlir::ModuleOp>();
mlir::Location loc = op.getLoc();
// For non-allocatable managed globals, CUFAddConstructor created a
// companion pointer global (@sym.managed.ptr) that holds the unified
// memory address. Load from it instead of calling CUFGetDeviceAddress.
std::string ptrGlobalName = (symName + managedPtrSuffix).str();
if (auto ptrGlobal = symTab.lookup<fir::GlobalOp>(ptrGlobalName)) {
auto ptrRef = fir::AddrOfOp::create(
rewriter, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
auto rawPtr = fir::LoadOp::create(rewriter, loc, ptrRef);
auto converted =
fir::ConvertOp::create(rewriter, loc, op.getType(), rawPtr);
rewriter.replaceOp(op, converted);
return success();
}
auto hostAddr = fir::AddrOfOp::create(
rewriter, loc, fir::ReferenceType::get(global.getType()),
op.getHostSymbol());

View File

@ -312,3 +312,19 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
// CHECK-LABEL: gpu.func @_QMkernelsPassign
// CHECK: %[[ADDROF:.*]] = llvm.mlir.addressof @_QMkernelsEinitial_val : !llvm.ptr<4>
// CHECK: %{{.*}} = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<4> to !llvm.ptr
// -----
// Test that non-allocatable managed globals inside gpu.module get
// addr_space = 1 (Global) and the nvvm.managed annotation.
module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
gpu.module @cuda_device_mod {
fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
%0 = fir.zero_bits !fir.array<100xi32>
fir.has_value %0 : !fir.array<100xi32>
}
}
}
// CHECK: llvm.mlir.global external @_QMtestEmanx() {addr_space = 1 : i32, nvvm.managed} : !llvm.array<100 x i32>

View File

@ -29,7 +29,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
// CHECK-DAG: %[[BOX:.*]] = fir.address_of(@_QMmtestsEndev) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
// CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<i8>
// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}})
//
// CHECK: fir.call @_FortranACUFInitModule
// -----
@ -78,3 +78,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i
// CHECK: llvm.func internal @__cudaFortranConstructor()
// CHECK: fir.address_of(@_QMmEa00)
// CHECK: fir.call @_FortranACUFRegisterVariable
// CHECK: fir.call @_FortranACUFInitModule
// -----
// Non-allocatable managed global: should create pointer global in
// __nv_managed_data__ and register with CUFRegisterManagedVariable.
//
// Fortran source:
// module test
// integer*4, managed :: manx(100)
// contains
// attributes(global) subroutine kernel()
// end subroutine
// end module
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f32, dense<32> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
%0 = fir.zero_bits !fir.array<100xi32>
fir.has_value %0 : !fir.array<100xi32>
}
gpu.module @cuda_device_mod {
}
}
// Pointer global should be created with section attribute.
// CHECK: fir.global internal @_QMtestEmanx.managed.ptr {section = "__nv_managed_data__"} : !fir.llvm_ptr<i8>
// CHECK: fir.zero_bits !fir.llvm_ptr<i8>
// Constructor should register with CUFRegisterManagedVariable then init module.
// CHECK: llvm.func internal @__cudaFortranConstructor()
// CHECK: fir.address_of(@_QMtestEmanx.managed.ptr) : !fir.ref<!fir.llvm_ptr<i8>>
// CHECK: fir.call @_FortranACUFRegisterManagedVariable
// CHECK: fir.call @_FortranACUFInitModule

View File

@ -12,3 +12,42 @@ func.func @_QPxa(%arg0: !fir.ref<!fir.array<?xi32>> {cuf.data_attr = #cuf.cuda<d
// CHECK-LABEL: func.func @_QPxa
// CHECK: fir.call @_FortranACUFGetDeviceAddress
// -----
// Non-allocatable managed global with companion pointer global:
// cuf.device_address should load from the pointer global instead of
// calling CUFGetDeviceAddress.
//
// Fortran source:
// module test
// integer*4, managed :: manx(100)
// end module
// subroutine user()
// use test
// manx(1) = 42
// end subroutine
fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
%0 = fir.zero_bits !fir.array<100xi32>
fir.has_value %0 : !fir.array<100xi32>
}
fir.global internal @_QMtestEmanx.managed.ptr {section = "__nv_managed_data__"} : !fir.llvm_ptr<i8> {
%0 = fir.zero_bits !fir.llvm_ptr<i8>
fir.has_value %0 : !fir.llvm_ptr<i8>
}
func.func @_QPuser() {
%c100 = arith.constant 100 : index
%0 = cuf.device_address @_QMtestEmanx -> !fir.ref<!fir.array<100xi32>>
%1 = fir.shape %c100 : (index) -> !fir.shape<1>
%2 = fir.declare %0(%1) {uniq_name = "_QMtestEmanx"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<100xi32>>
return
}
// CHECK-LABEL: func.func @_QPuser
// CHECK-NOT: fir.call @_FortranACUFGetDeviceAddress
// CHECK: %[[PTR_REF:.*]] = fir.address_of(@_QMtestEmanx.managed.ptr) : !fir.ref<!fir.llvm_ptr<i8>>
// CHECK: %[[RAW_PTR:.*]] = fir.load %[[PTR_REF]] : !fir.ref<!fir.llvm_ptr<i8>>
// CHECK: %[[ADDR:.*]] = fir.convert %[[RAW_PTR]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.array<100xi32>>

View File

@ -637,3 +637,39 @@ end subroutine
! CHECK-LABEL: func.func @_QPsub34
! CHECK: cuf.data_transfer %{{.*}} to %{{.*}} {hasManagedOrUnifedSymbols, transfer_kind = #cuf.cuda_transfer<host_device>} : f16, !fir.box<!fir.array<?xf16>>
module managed_mod
integer, managed :: marray(10)
end module
! Test that assignments to a non-allocatable module managed variable, from
! a host array or from a scalar constant, do NOT generate cuf.data_transfer.
subroutine sub35()
use managed_mod
integer :: host_arr(10)
marray = host_arr
marray = 0
end subroutine
! CHECK-LABEL: func.func @_QPsub35()
! CHECK-NOT: cuf.data_transfer
! Test that host_var = managed_module_var does NOT generate cuf.data_transfer
! (managed memory is host-accessible, so direct assignment suffices).
subroutine sub36()
use managed_mod
! host_arr has no CUDA data attribute; marray is the managed module array
! declared in managed_mod.
integer :: host_arr(10)
host_arr = marray
end subroutine
! CHECK-LABEL: func.func @_QPsub36()
! CHECK-NOT: cuf.data_transfer
! Test that device_var = managed_module_var DOES generate cuf.data_transfer
! (device memory requires explicit cudaMemcpy).
subroutine sub37()
use managed_mod
! dev_arr carries the device attribute; marray is the managed module array
! declared in managed_mod.
integer, device :: dev_arr(10)
dev_arr = marray
end subroutine
! CHECK-LABEL: func.func @_QPsub37()
! CHECK: cuf.data_transfer

View File

@ -96,6 +96,9 @@ def NVVM_Dialect : Dialect {
/// nvvm.cluster_dim attributes.
static StringRef getBlocksAreClustersAttrName() { return "nvvm.blocksareclusters"; }
/// Get the name of the attribute used to annotate managed global variables.
/// When this attribute is present on an `llvm.mlir.global`, the NVVM LLVM IR
/// translation appends a `!{<global>, !"managed", i32 1}` entry to the
/// module's `nvvm.annotations` named metadata.
static StringRef getManagedAttrName() { return "nvvm.managed"; }
/// Verify an attribute from this dialect on the argument at 'argIndex' for
/// the region at 'regionIndex' on the given operation. Returns failure if
/// the verification failed, success otherwise. This hook may optionally be

View File

@ -705,11 +705,28 @@ public:
return failure();
}
/// Attaches module-level metadata for functions marked as kernels.
/// Attaches module-level metadata for functions marked as kernels
/// and managed annotations for global variables.
LogicalResult
amendOperation(Operation *op, ArrayRef<llvm::Instruction *> instructions,
NamedAttribute attribute,
LLVM::ModuleTranslation &moduleTranslation) const final {
if (auto globalOp = dyn_cast<LLVM::GlobalOp>(op)) {
if (attribute.getName() == NVVM::NVVMDialect::getManagedAttrName()) {
auto *gv = cast<llvm::GlobalVariable>(
moduleTranslation.lookupGlobal(globalOp));
llvm::Module *m = gv->getParent();
llvm::LLVMContext &ctx = m->getContext();
llvm::NamedMDNode *md = m->getOrInsertNamedMetadata("nvvm.annotations");
md->addOperand(llvm::MDNode::get(
ctx, {llvm::ConstantAsMetadata::get(gv),
llvm::MDString::get(ctx, "managed"),
llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
llvm::Type::getInt32Ty(ctx), 1))}));
}
return success();
}
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)
return failure();

View File

@ -915,3 +915,10 @@ llvm.func @nanosleep(%duration: i32) {
nvvm.nanosleep %duration
llvm.return
}
// -----
// CHECK: @managed_g = addrspace(1) global i32 0
// CHECK: !nvvm.annotations = !{![[MANAGED:[0-9]+]]}
// CHECK: ![[MANAGED]] = !{ptr addrspace(1) @managed_g, !"managed", i32 1}
llvm.mlir.global external @managed_g(0 : i32) {addr_space = 1 : i32, nvvm.managed} : i32