From c4e6cf0abff628f9c018428e0d4beed8788efdf9 Mon Sep 17 00:00:00 2001 From: Zhen Wang <37195552+wangzpgi@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:27:08 -0700 Subject: [PATCH] [flang][cuda] Support non-allocatable module-level managed variables (#188526) Add support for non-allocatable module-level CUDA managed variables using pointer indirection through a companion global in __nv_managed_data__. The CUDA runtime populates this pointer with the unified memory address via __cudaRegisterManagedVar and __cudaInitModule. 1. Create a .managed.ptr companion global in the __nv_managed_data__ section and register it with _FortranACUFRegisterManagedVariable (CUFAddConstructor.cpp) 2. Call __cudaInitModule after registration to populate the managed pointer (registration.cpp) 3. Annotate managed globals in gpu.module with nvvm.managed for PTX .attribute(.managed) generation (cuda-code-gen.mlir) 4. Suppress cuf.data_transfer for assignments to/from non-allocatable module managed variables, since cudaMemcpy would target the shadow address rather than the actual unified memory (tools.h) 5. 
Preserve cuf.data_transfer for device_var = managed_var assignments where explicit transfer is still required --- flang-rt/lib/cuda/registration.cpp | 3 + flang/include/flang/Evaluate/tools.h | 36 ++++++-- .../include/flang/Runtime/CUDA/registration.h | 5 ++ flang/lib/Optimizer/CodeGen/CodeGen.cpp | 9 ++ .../Transforms/CUDA/CUFAddConstructor.cpp | 84 +++++++++++++++---- .../Transforms/CUDA/CUFOpConversionLate.cpp | 22 ++++- flang/test/Fir/CUDA/cuda-code-gen.mlir | 16 ++++ flang/test/Fir/CUDA/cuda-constructor-2.f90 | 37 +++++++- flang/test/Fir/CUDA/cuda-device-address.mlir | 39 +++++++++ flang/test/Lower/CUDA/cuda-data-transfer.cuf | 36 ++++++++ mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 3 + .../Dialect/NVVM/NVVMToLLVMIRTranslation.cpp | 19 ++++- mlir/test/Target/LLVMIR/nvvmir.mlir | 7 ++ 13 files changed, 293 insertions(+), 23 deletions(-) diff --git a/flang-rt/lib/cuda/registration.cpp b/flang-rt/lib/cuda/registration.cpp index 8123220c2624..58077d6a6a52 100644 --- a/flang-rt/lib/cuda/registration.cpp +++ b/flang-rt/lib/cuda/registration.cpp @@ -27,6 +27,7 @@ extern void __cudaRegisterVar(void **fatCubinHandle, char *hostVar, extern void __cudaRegisterManagedVar(void **fatCubinHandle, void **hostVarPtrAddress, char *deviceAddress, const char *deviceName, int ext, size_t size, int constant, int global); +extern char __cudaInitModule(void **fatCubinHandle); void *RTDECL(CUFRegisterModule)(void *data) { void **fatHandle{__cudaRegisterFatBinary(data)}; @@ -50,6 +51,8 @@ void RTDEF(CUFRegisterManagedVariable)( __cudaRegisterManagedVar(module, varSym, varName, varName, 0, size, 0, 0); } +void RTDEF(CUFInitModule)(void **module) { __cudaInitModule(module); } + } // extern "C" } // namespace Fortran::runtime::cuda diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index 963452755064..51dc0582fcde 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -1311,6 +1311,28 @@ inline bool 
IsCUDAManagedOrUnifiedSymbol(const Symbol &sym) { return false; } +// Non-allocatable module-level managed/unified variables use pointer +// indirection through a companion global in __nv_managed_data__. +// Explicit data transfers (cudaMemcpy) must be avoided for these +// variables since they would target the shadow address rather than +// the actual unified memory address. +inline bool IsNonAllocatableModuleCUDAManagedSymbol(const Symbol &sym) { + const Symbol &ultimate = sym.GetUltimate(); + if (!IsCUDAManagedOrUnifiedSymbol(ultimate)) + return false; + if (ultimate.attrs().test(semantics::Attr::ALLOCATABLE)) + return false; + return ultimate.owner().IsModule(); +} + +template +inline bool HasNonAllocatableModuleCUDAManagedSymbols(const A &expr) { + for (const Symbol &sym : CollectCudaSymbols(expr)) + if (IsNonAllocatableModuleCUDAManagedSymbol(sym)) + return true; + return false; +} + // Get the number of distinct symbols with CUDA device // attribute in the expression. template inline int GetNbOfCUDADeviceSymbols(const A &expr) { @@ -1350,17 +1372,21 @@ inline bool IsCUDADataTransfer(const A &lhs, const B &rhs) { int rhsNbManagedSymbols{GetNbOfCUDAManagedOrUnifiedSymbols(rhs)}; int rhsNbSymbols{GetNbOfCUDADeviceSymbols(rhs)}; + if (HasNonAllocatableModuleCUDAManagedSymbols(lhs)) + return false; + if (lhsNbManagedSymbols >= 1 && lhs.Rank() > 0 && rhsNbSymbols == 0 && rhsNbManagedSymbols == 0 && (IsVariable(rhs) || IsConstantExpr(rhs))) { return true; // Managed arrays initialization is performed on the device. } - // Special cases performed on the host: - // - Only managed or unifed symbols are involved on RHS and LHS. - // - LHS is managed or unified and the RHS is host only. + // Cases where no explicit data transfer is needed: + // - Both sides involve only managed/unified symbols (host-accessible). + // - LHS is host-only and RHS has only managed/unified symbols. + // - LHS is managed/unified and RHS is host-only. 
if ((lhsNbManagedSymbols >= 1 && rhsNbManagedSymbols == rhsNbSymbols) || - (lhsNbManagedSymbols == 0 && rhsNbManagedSymbols >= 1 && - rhsNbManagedSymbols == rhsNbSymbols) || + (lhsNbManagedSymbols == 0 && !HasCUDADeviceAttrs(lhs) && + rhsNbManagedSymbols >= 1 && rhsNbManagedSymbols == rhsNbSymbols) || (lhsNbManagedSymbols >= 1 && rhsNbSymbols == 0)) { return false; } diff --git a/flang/include/flang/Runtime/CUDA/registration.h b/flang/include/flang/Runtime/CUDA/registration.h index 15f013432fa0..74dbf9e18907 100644 --- a/flang/include/flang/Runtime/CUDA/registration.h +++ b/flang/include/flang/Runtime/CUDA/registration.h @@ -32,6 +32,11 @@ void RTDECL(CUFRegisterVariable)( void RTDECL(CUFRegisterManagedVariable)( void **module, void **varSym, char *varName, int64_t size); +/// Initialize a CUDA module after all variables have been registered. +/// Triggers the runtime to populate managed variable pointers with +/// unified memory addresses. +void RTDECL(CUFInitModule)(void **module); + } // extern "C" } // namespace Fortran::runtime::cuda diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 25eb6194efa9..2d01463cf604 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -3448,6 +3448,15 @@ struct GlobalOpConversion : public fir::FIROpConversion { g.setAddrSpace( static_cast(mlir::NVVM::NVVMMemorySpace::Constant)); + if (gpuMod && global.getDataAttr() && + *global.getDataAttr() == cuf::DataAttribute::Managed && + !mlir::isa(global.getType())) { + g.setAddrSpace( + static_cast(mlir::NVVM::NVVMMemorySpace::Global)); + g->setAttr(mlir::NVVM::NVVMDialect::getManagedAttrName(), + mlir::UnitAttr::get(global.getContext())); + } + rewriter.eraseOp(global); return mlir::success(); } diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp index baa8e591ee16..9ed76745c261 100644 --- 
a/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp +++ b/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp @@ -41,6 +41,40 @@ namespace { static constexpr llvm::StringRef cudaFortranCtorName{ "__cudaFortranConstructor"}; +static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"}; + +/// Create an 8-byte pointer global in the __nv_managed_data__ section. +/// The CUDA runtime populates this pointer with the unified memory address +/// when the module is initialized via __cudaInitModule. +static fir::GlobalOp createManagedPointerGlobal(fir::FirOpBuilder &builder, + mlir::ModuleOp mod, + fir::GlobalOp globalOp) { + mlir::MLIRContext *ctx = mod.getContext(); + std::string ptrGlobalName = (globalOp.getSymName() + managedPtrSuffix).str(); + auto ptrTy = fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8)); + + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointAfter(globalOp); + + llvm::SmallVector attrs; + attrs.push_back( + mlir::NamedAttribute(mlir::StringAttr::get(ctx, "section"), + mlir::StringAttr::get(ctx, "__nv_managed_data__"))); + + mlir::DenseElementsAttr initAttr = {}; + auto ptrGlobal = fir::GlobalOp::create( + builder, globalOp.getLoc(), ptrGlobalName, /*isConstant=*/false, + /*isTarget=*/false, ptrTy, initAttr, + /*linkName=*/builder.createInternalLinkage(), attrs); + + mlir::Region &region = ptrGlobal.getRegion(); + mlir::Block *block = builder.createBlock(&region); + builder.setInsertionPointToStart(block); + mlir::Value zero = fir::ZeroOp::create(builder, globalOp.getLoc(), ptrTy); + fir::HasValueOp::create(builder, globalOp.getLoc(), zero); + + return ptrGlobal; +} struct CUFAddConstructor : public fir::impl::CUFAddConstructorBase { @@ -108,19 +142,15 @@ struct CUFAddConstructor if (!attr) continue; - if (attr.getValue() == cuf::DataAttribute::Managed && - !mlir::isa(globalOp.getType())) - TODO(loc, "registration of non-allocatable managed variables"); + bool isNonAllocManagedGlobal = +
attr.getValue() == cuf::DataAttribute::Managed && + !mlir::isa(globalOp.getType()); mlir::func::FuncOp func; switch (attr.getValue()) { case cuf::DataAttribute::Device: case cuf::DataAttribute::Constant: case cuf::DataAttribute::Managed: { - func = fir::runtime::getRuntimeFunc( - loc, builder); - auto fTy = func.getFunctionType(); - // Global variable name std::string gblNameStr = globalOp.getSymbol().getValue().str(); gblNameStr += '\0'; @@ -141,18 +171,44 @@ struct CUFAddConstructor } auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size); - // Global variable address - mlir::Value addr = fir::AddrOfOp::create( - builder, loc, globalOp.resultType(), globalOp.getSymbol()); - - llvm::SmallVector args{fir::runtime::createArguments( - builder, loc, fTy, registeredMod, addr, gblName, sizeVal)}; - fir::CallOp::create(builder, loc, func, args); + if (isNonAllocManagedGlobal) { + // Non-allocatable managed globals use pointer indirection: + // a companion pointer in __nv_managed_data__ holds the unified + // memory address, registered via __cudaRegisterManagedVar. 
+ fir::GlobalOp ptrGlobal = + createManagedPointerGlobal(builder, mod, globalOp); + func = fir::runtime::getRuntimeFunc(loc, builder); + auto fTy = func.getFunctionType(); + mlir::Value addr = fir::AddrOfOp::create( + builder, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol()); + llvm::SmallVector args{fir::runtime::createArguments( + builder, loc, fTy, registeredMod, addr, gblName, sizeVal)}; + fir::CallOp::create(builder, loc, func, args); + } else { + func = fir::runtime::getRuntimeFunc( + loc, builder); + auto fTy = func.getFunctionType(); + mlir::Value addr = fir::AddrOfOp::create( + builder, loc, globalOp.resultType(), globalOp.getSymbol()); + llvm::SmallVector args{fir::runtime::createArguments( + builder, loc, fTy, registeredMod, addr, gblName, sizeVal)}; + fir::CallOp::create(builder, loc, func, args); + } } break; default: break; } } + + // Initialize the module after all variables are registered so the + // runtime populates managed variable unified memory pointers. + mlir::func::FuncOp initFunc = + fir::runtime::getRuntimeFunc(loc, builder); + auto initFTy = initFunc.getFunctionType(); + llvm::SmallVector initArgs{ + fir::runtime::createArguments(builder, loc, initFTy, registeredMod)}; + fir::CallOp::create(builder, loc, initFunc, initArgs); } mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{}); diff --git a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp index fe459712a6ba..62f95f5d23c3 100644 --- a/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp +++ b/flang/lib/Optimizer/Transforms/CUDA/CUFOpConversionLate.cpp @@ -13,6 +13,7 @@ #include "flang/Optimizer/Dialect/CUF/CUFOps.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Transforms/Passes.h" #include "flang/Runtime/CUDA/common.h" #include "flang/Runtime/CUDA/descriptor.h" @@ -48,6 +49,8 @@ 
static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter, return val; } +static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"}; + struct CUFDeviceAddressOpConversion : public mlir::OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -59,10 +62,25 @@ struct CUFDeviceAddressOpConversion mlir::LogicalResult matchAndRewrite(cuf::DeviceAddressOp op, mlir::PatternRewriter &rewriter) const override { - if (auto global = symTab.lookup( - op.getHostSymbol().getRootReference().getValue())) { + auto symName = op.getHostSymbol().getRootReference().getValue(); + if (auto global = symTab.lookup(symName)) { auto mod = op->getParentOfType(); mlir::Location loc = op.getLoc(); + + // For non-allocatable managed globals, CUFAddConstructor created a + // companion pointer global (@sym.managed.ptr) that holds the unified + // memory address. Load from it instead of calling CUFGetDeviceAddress. + std::string ptrGlobalName = (symName + managedPtrSuffix).str(); + if (auto ptrGlobal = symTab.lookup(ptrGlobalName)) { + auto ptrRef = fir::AddrOfOp::create( + rewriter, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol()); + auto rawPtr = fir::LoadOp::create(rewriter, loc, ptrRef); + auto converted = + fir::ConvertOp::create(rewriter, loc, op.getType(), rawPtr); + rewriter.replaceOp(op, converted); + return success(); + } + auto hostAddr = fir::AddrOfOp::create( rewriter, loc, fir::ReferenceType::get(global.getType()), op.getHostSymbol()); diff --git a/flang/test/Fir/CUDA/cuda-code-gen.mlir b/flang/test/Fir/CUDA/cuda-code-gen.mlir index e83648f21bdf..fc962f8de503 100644 --- a/flang/test/Fir/CUDA/cuda-code-gen.mlir +++ b/flang/test/Fir/CUDA/cuda-code-gen.mlir @@ -312,3 +312,19 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e // CHECK-LABEL: gpu.func @_QMkernelsPassign // CHECK: %[[ADDROF:.*]] = llvm.mlir.addressof @_QMkernelsEinitial_val : !llvm.ptr<4> // CHECK: %{{.*}} = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<4> to 
!llvm.ptr + +// ----- + +// Test that non-allocatable managed globals inside gpu.module get +// addr_space = 1 (Global) and the nvvm.managed annotation. + +module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} { + gpu.module @cuda_device_mod { + fir.global @_QMtestEmanx {data_attr = #cuf.cuda} : !fir.array<100xi32> { + %0 = fir.zero_bits !fir.array<100xi32> + fir.has_value %0 : !fir.array<100xi32> + } + } +} + +// CHECK: llvm.mlir.global external @_QMtestEmanx() {addr_space = 1 : i32, nvvm.managed} : !llvm.array<100 x i32> diff --git a/flang/test/Fir/CUDA/cuda-constructor-2.f90 b/flang/test/Fir/CUDA/cuda-constructor-2.f90 index f21d8f9c3763..d61ca4849ec3 100644 --- a/flang/test/Fir/CUDA/cuda-constructor-2.f90 +++ b/flang/test/Fir/CUDA/cuda-constructor-2.f90 @@ -29,7 +29,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry>>> // CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref>>>) -> !fir.ref // CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}}) -// +// CHECK: fir.call @_FortranACUFInitModule // ----- @@ -78,3 +78,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec : vector<2xi64>, i // CHECK: llvm.func internal @__cudaFortranConstructor() // CHECK: fir.address_of(@_QMmEa00) // CHECK: fir.call @_FortranACUFRegisterVariable +// CHECK: fir.call @_FortranACUFInitModule + +// ----- + +// 
Non-allocatable managed global: should create pointer global in +// __nv_managed_data__ and register with CUFRegisterManagedVariable. +// +// Fortran source: +// module test +// integer*4, managed :: manx(100) +// contains +// attributes(global) subroutine kernel() +// end subroutine +// end module + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + + fir.global @_QMtestEmanx {data_attr = #cuf.cuda} : !fir.array<100xi32> { + %0 = fir.zero_bits !fir.array<100xi32> + fir.has_value %0 : !fir.array<100xi32> + } + + gpu.module @cuda_device_mod { + } +} + +// Pointer global should be created with section attribute. +// CHECK: fir.global internal @_QMtestEmanx.managed.ptr {section = "__nv_managed_data__"} : !fir.llvm_ptr +// CHECK: fir.zero_bits !fir.llvm_ptr + +// Constructor should register with CUFRegisterManagedVariable then init module. 
+// CHECK: llvm.func internal @__cudaFortranConstructor() +// CHECK: fir.address_of(@_QMtestEmanx.managed.ptr) : !fir.ref> +// CHECK: fir.call @_FortranACUFRegisterManagedVariable +// CHECK: fir.call @_FortranACUFInitModule diff --git a/flang/test/Fir/CUDA/cuda-device-address.mlir b/flang/test/Fir/CUDA/cuda-device-address.mlir index e86208321b8a..a2dae7155786 100644 --- a/flang/test/Fir/CUDA/cuda-device-address.mlir +++ b/flang/test/Fir/CUDA/cuda-device-address.mlir @@ -12,3 +12,42 @@ func.func @_QPxa(%arg0: !fir.ref> {cuf.data_attr = #cuf.cuda} : !fir.array<100xi32> { + %0 = fir.zero_bits !fir.array<100xi32> + fir.has_value %0 : !fir.array<100xi32> +} + +fir.global internal @_QMtestEmanx.managed.ptr {section = "__nv_managed_data__"} : !fir.llvm_ptr { + %0 = fir.zero_bits !fir.llvm_ptr + fir.has_value %0 : !fir.llvm_ptr +} + +func.func @_QPuser() { + %c100 = arith.constant 100 : index + %0 = cuf.device_address @_QMtestEmanx -> !fir.ref> + %1 = fir.shape %c100 : (index) -> !fir.shape<1> + %2 = fir.declare %0(%1) {uniq_name = "_QMtestEmanx"} : (!fir.ref>, !fir.shape<1>) -> !fir.ref> + return +} + +// CHECK-LABEL: func.func @_QPuser +// CHECK-NOT: fir.call @_FortranACUFGetDeviceAddress +// CHECK: %[[PTR_REF:.*]] = fir.address_of(@_QMtestEmanx.managed.ptr) : !fir.ref> +// CHECK: %[[RAW_PTR:.*]] = fir.load %[[PTR_REF]] : !fir.ref> +// CHECK: %[[ADDR:.*]] = fir.convert %[[RAW_PTR]] : (!fir.llvm_ptr) -> !fir.ref> diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf index 66c3a28f9aec..1d0e510c110e 100644 --- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf +++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf @@ -637,3 +637,39 @@ end subroutine ! CHECK-LABEL: func.func @_QPsub34 ! 
CHECK: cuf.data_transfer %{{.*}} to %{{.*}} {hasManagedOrUnifedSymbols, transfer_kind = #cuf.cuda_transfer} : f16, !fir.box> + +module managed_mod + integer, managed :: marray(10) +end module + +subroutine sub35() + use managed_mod + integer :: host_arr(10) + marray = host_arr + marray = 0 +end subroutine + +! CHECK-LABEL: func.func @_QPsub35() +! CHECK-NOT: cuf.data_transfer + +! Test that host_var = managed_module_var does NOT generate cuf.data_transfer +! (managed memory is host-accessible, so direct assignment suffices). +subroutine sub36() + use managed_mod + integer :: host_arr(10) + host_arr = marray +end subroutine + +! CHECK-LABEL: func.func @_QPsub36() +! CHECK-NOT: cuf.data_transfer + +! Test that device_var = managed_module_var DOES generate cuf.data_transfer +! (device memory requires explicit cudaMemcpy). +subroutine sub37() + use managed_mod + integer, device :: dev_arr(10) + dev_arr = marray +end subroutine + +! CHECK-LABEL: func.func @_QPsub37() +! CHECK: cuf.data_transfer diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 0c5dae265e2c..87fd75f5a3e1 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -96,6 +96,9 @@ def NVVM_Dialect : Dialect { /// nvvm.cluster_dim attributes. static StringRef getBlocksAreClustersAttrName() { return "nvvm.blocksareclusters"; } + /// Get the name of the attribute used to annotate managed global variables. + static StringRef getManagedAttrName() { return "nvvm.managed"; } + /// Verify an attribute from this dialect on the argument at 'argIndex' for /// the region at 'regionIndex' on the given operation. Returns failure if /// the verification failed, success otherwise. 
This hook may optionally be diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp index 254be8e63590..08e44949de1c 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp @@ -705,11 +705,28 @@ public: return failure(); } - /// Attaches module-level metadata for functions marked as kernels. + /// Attaches module-level metadata for functions marked as kernels + /// and managed annotations for global variables. LogicalResult amendOperation(Operation *op, ArrayRef instructions, NamedAttribute attribute, LLVM::ModuleTranslation &moduleTranslation) const final { + if (auto globalOp = dyn_cast(op)) { + if (attribute.getName() == NVVM::NVVMDialect::getManagedAttrName()) { + auto *gv = cast( + moduleTranslation.lookupGlobal(globalOp)); + llvm::Module *m = gv->getParent(); + llvm::LLVMContext &ctx = m->getContext(); + llvm::NamedMDNode *md = m->getOrInsertNamedMetadata("nvvm.annotations"); + md->addOperand(llvm::MDNode::get( + ctx, {llvm::ConstantAsMetadata::get(gv), + llvm::MDString::get(ctx, "managed"), + llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( + llvm::Type::getInt32Ty(ctx), 1))})); + } + return success(); + } + auto func = dyn_cast(op); if (!func) return failure(); diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 8a7e9bae4ec2..f6360be2dfcc 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -915,3 +915,10 @@ llvm.func @nanosleep(%duration: i32) { nvvm.nanosleep %duration llvm.return } + +// ----- + +// CHECK: @managed_g = addrspace(1) global i32 0 +// CHECK: !nvvm.annotations = !{![[MANAGED:[0-9]+]]} +// CHECK: ![[MANAGED]] = !{ptr addrspace(1) @managed_g, !"managed", i32 1} +llvm.mlir.global external @managed_g(0 : i32) {addr_space = 1 : i32, nvvm.managed} : i32