[flang][cuda] Support non-allocatable module-level managed variables (#188526)
Add support for non-allocatable module-level CUDA managed variables using pointer indirection through a companion global in __nv_managed_data__. The CUDA runtime populates this pointer with the unified memory address via __cudaRegisterManagedVar and __cudaInitModule.

1. Create a .managed.ptr companion global in the __nv_managed_data__ section and register it with _FortranACUFRegisterManagedVariable (CUFAddConstructor.cpp).
2. Call __cudaInitModule after registration to populate the managed pointer (registration.cpp).
3. Annotate managed globals in gpu.module with nvvm.managed for PTX .attribute(.managed) generation (cuda-code-gen.mlir).
4. Suppress cuf.data_transfer for assignments to/from non-allocatable module managed variables, since cudaMemcpy would target the shadow address rather than the actual unified memory (tools.h).
5. Preserve cuf.data_transfer for device_var = managed_var assignments where explicit transfer is still required.
This commit is contained in:
parent
0e0a0458ce
commit
c4e6cf0abf
@ -27,6 +27,7 @@ extern void __cudaRegisterVar(void **fatCubinHandle, char *hostVar,
|
||||
extern void __cudaRegisterManagedVar(void **fatCubinHandle,
|
||||
void **hostVarPtrAddress, char *deviceAddress, const char *deviceName,
|
||||
int ext, size_t size, int constant, int global);
|
||||
extern char __cudaInitModule(void **fatCubinHandle);
|
||||
|
||||
void *RTDECL(CUFRegisterModule)(void *data) {
|
||||
void **fatHandle{__cudaRegisterFatBinary(data)};
|
||||
@ -50,6 +51,8 @@ void RTDEF(CUFRegisterManagedVariable)(
|
||||
__cudaRegisterManagedVar(module, varSym, varName, varName, 0, size, 0, 0);
|
||||
}
|
||||
|
||||
// Runtime entry point that forwards the module handle to __cudaInitModule.
// The value returned by __cudaInitModule is not inspected here.
void RTDEF(CUFInitModule)(void **module) {
  __cudaInitModule(module);
}
|
||||
|
||||
} // extern "C"
|
||||
|
||||
} // namespace Fortran::runtime::cuda
|
||||
|
||||
@ -1311,6 +1311,28 @@ inline bool IsCUDAManagedOrUnifiedSymbol(const Symbol &sym) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Non-allocatable module-level managed/unified variables use pointer
|
||||
// indirection through a companion global in __nv_managed_data__.
|
||||
// Explicit data transfers (cudaMemcpy) must be avoided for these
|
||||
// variables since they would target the shadow address rather than
|
||||
// the actual unified memory address.
|
||||
inline bool IsNonAllocatableModuleCUDAManagedSymbol(const Symbol &sym) {
|
||||
const Symbol &ultimate = sym.GetUltimate();
|
||||
if (!IsCUDAManagedOrUnifiedSymbol(ultimate))
|
||||
return false;
|
||||
if (ultimate.attrs().test(semantics::Attr::ALLOCATABLE))
|
||||
return false;
|
||||
return ultimate.owner().IsModule();
|
||||
}
|
||||
|
||||
template <typename A>
|
||||
inline bool HasNonAllocatableModuleCUDAManagedSymbols(const A &expr) {
|
||||
for (const Symbol &sym : CollectCudaSymbols(expr))
|
||||
if (IsNonAllocatableModuleCUDAManagedSymbol(sym))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get the number of distinct symbols with CUDA device
|
||||
// attribute in the expression.
|
||||
template <typename A> inline int GetNbOfCUDADeviceSymbols(const A &expr) {
|
||||
@ -1350,17 +1372,21 @@ inline bool IsCUDADataTransfer(const A &lhs, const B &rhs) {
|
||||
int rhsNbManagedSymbols{GetNbOfCUDAManagedOrUnifiedSymbols(rhs)};
|
||||
int rhsNbSymbols{GetNbOfCUDADeviceSymbols(rhs)};
|
||||
|
||||
if (HasNonAllocatableModuleCUDAManagedSymbols(lhs))
|
||||
return false;
|
||||
|
||||
if (lhsNbManagedSymbols >= 1 && lhs.Rank() > 0 && rhsNbSymbols == 0 &&
|
||||
rhsNbManagedSymbols == 0 && (IsVariable(rhs) || IsConstantExpr(rhs))) {
|
||||
return true; // Managed arrays initialization is performed on the device.
|
||||
}
|
||||
|
||||
// Special cases performed on the host:
|
||||
// - Only managed or unified symbols are involved on RHS and LHS.
|
||||
// - LHS is managed or unified and the RHS is host only.
|
||||
// Cases where no explicit data transfer is needed:
|
||||
// - Both sides involve only managed/unified symbols (host-accessible).
|
||||
// - LHS is host-only and RHS has only managed/unified symbols.
|
||||
// - LHS is managed/unified and RHS is host-only.
|
||||
if ((lhsNbManagedSymbols >= 1 && rhsNbManagedSymbols == rhsNbSymbols) ||
|
||||
(lhsNbManagedSymbols == 0 && rhsNbManagedSymbols >= 1 &&
|
||||
rhsNbManagedSymbols == rhsNbSymbols) ||
|
||||
(lhsNbManagedSymbols == 0 && !HasCUDADeviceAttrs(lhs) &&
|
||||
rhsNbManagedSymbols >= 1 && rhsNbManagedSymbols == rhsNbSymbols) ||
|
||||
(lhsNbManagedSymbols >= 1 && rhsNbSymbols == 0)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -32,6 +32,11 @@ void RTDECL(CUFRegisterVariable)(
|
||||
void RTDECL(CUFRegisterManagedVariable)(
|
||||
void **module, void **varSym, char *varName, int64_t size);
|
||||
|
||||
/// Initialize a CUDA module after all variables have been registered.
|
||||
/// Triggers the runtime to populate managed variable pointers with
|
||||
/// unified memory addresses.
|
||||
void RTDECL(CUFInitModule)(void **module);
|
||||
|
||||
} // extern "C"
|
||||
|
||||
} // namespace Fortran::runtime::cuda
|
||||
|
||||
@ -3448,6 +3448,15 @@ struct GlobalOpConversion : public fir::FIROpConversion<fir::GlobalOp> {
|
||||
g.setAddrSpace(
|
||||
static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Constant));
|
||||
|
||||
if (gpuMod && global.getDataAttr() &&
|
||||
*global.getDataAttr() == cuf::DataAttribute::Managed &&
|
||||
!mlir::isa<fir::BaseBoxType>(global.getType())) {
|
||||
g.setAddrSpace(
|
||||
static_cast<unsigned>(mlir::NVVM::NVVMMemorySpace::Global));
|
||||
g->setAttr(mlir::NVVM::NVVMDialect::getManagedAttrName(),
|
||||
mlir::UnitAttr::get(global.getContext()));
|
||||
}
|
||||
|
||||
rewriter.eraseOp(global);
|
||||
return mlir::success();
|
||||
}
|
||||
|
||||
@ -41,6 +41,40 @@ namespace {
|
||||
|
||||
static constexpr llvm::StringRef cudaFortranCtorName{
|
||||
"__cudaFortranConstructor"};
|
||||
static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"};
|
||||
|
||||
/// Create an 8-byte pointer global in the __nv_managed_data__ section.
|
||||
/// The CUDA runtime populates this pointer with the unified memory address
|
||||
/// when the module is initialized via __cudaInitModule.
|
||||
static fir::GlobalOp createManagedPointerGlobal(fir::FirOpBuilder &builder,
|
||||
mlir::ModuleOp mod,
|
||||
fir::GlobalOp globalOp) {
|
||||
mlir::MLIRContext *ctx = mod.getContext();
|
||||
std::string ptrGlobalName = (globalOp.getSymName() + managedPtrSuffix).str();
|
||||
auto ptrTy = fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));
|
||||
|
||||
mlir::OpBuilder::InsertionGuard guard(builder);
|
||||
builder.setInsertionPointAfter(globalOp);
|
||||
|
||||
llvm::SmallVector<mlir::NamedAttribute> attrs;
|
||||
attrs.push_back(
|
||||
mlir::NamedAttribute(mlir::StringAttr::get(ctx, "section"),
|
||||
mlir::StringAttr::get(ctx, "__nv_managed_data__")));
|
||||
|
||||
mlir::DenseElementsAttr initAttr = {};
|
||||
auto ptrGlobal = fir::GlobalOp::create(
|
||||
builder, globalOp.getLoc(), ptrGlobalName, /*isConstant=*/false,
|
||||
/*isTarget=*/false, ptrTy, initAttr,
|
||||
/*linkName=*/builder.createInternalLinkage(), attrs);
|
||||
|
||||
mlir::Region ®ion = ptrGlobal.getRegion();
|
||||
mlir::Block *block = builder.createBlock(®ion);
|
||||
builder.setInsertionPointToStart(block);
|
||||
mlir::Value zero = fir::ZeroOp::create(builder, globalOp.getLoc(), ptrTy);
|
||||
fir::HasValueOp::create(builder, globalOp.getLoc(), zero);
|
||||
|
||||
return ptrGlobal;
|
||||
}
|
||||
|
||||
struct CUFAddConstructor
|
||||
: public fir::impl::CUFAddConstructorBase<CUFAddConstructor> {
|
||||
@ -108,19 +142,15 @@ struct CUFAddConstructor
|
||||
if (!attr)
|
||||
continue;
|
||||
|
||||
if (attr.getValue() == cuf::DataAttribute::Managed &&
|
||||
!mlir::isa<fir::BaseBoxType>(globalOp.getType()))
|
||||
TODO(loc, "registration of non-allocatable managed variables");
|
||||
bool isNonAllocManagedGlobal =
|
||||
attr.getValue() == cuf::DataAttribute::Managed &&
|
||||
!mlir::isa<fir::BaseBoxType>(globalOp.getType());
|
||||
|
||||
mlir::func::FuncOp func;
|
||||
switch (attr.getValue()) {
|
||||
case cuf::DataAttribute::Device:
|
||||
case cuf::DataAttribute::Constant:
|
||||
case cuf::DataAttribute::Managed: {
|
||||
func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
|
||||
loc, builder);
|
||||
auto fTy = func.getFunctionType();
|
||||
|
||||
// Global variable name
|
||||
std::string gblNameStr = globalOp.getSymbol().getValue().str();
|
||||
gblNameStr += '\0';
|
||||
@ -141,18 +171,44 @@ struct CUFAddConstructor
|
||||
}
|
||||
auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
|
||||
|
||||
// Global variable address
|
||||
mlir::Value addr = fir::AddrOfOp::create(
|
||||
builder, loc, globalOp.resultType(), globalOp.getSymbol());
|
||||
|
||||
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
|
||||
builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
|
||||
fir::CallOp::create(builder, loc, func, args);
|
||||
if (isNonAllocManagedGlobal) {
|
||||
// Non-allocatable managed globals use pointer indirection:
|
||||
// a companion pointer in __nv_managed_data__ holds the unified
|
||||
// memory address, registered via __cudaRegisterManagedVar.
|
||||
fir::GlobalOp ptrGlobal =
|
||||
createManagedPointerGlobal(builder, mod, globalOp);
|
||||
func = fir::runtime::getRuntimeFunc<mkRTKey(
|
||||
CUFRegisterManagedVariable)>(loc, builder);
|
||||
auto fTy = func.getFunctionType();
|
||||
mlir::Value addr = fir::AddrOfOp::create(
|
||||
builder, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
|
||||
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
|
||||
builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
|
||||
fir::CallOp::create(builder, loc, func, args);
|
||||
} else {
|
||||
func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
|
||||
loc, builder);
|
||||
auto fTy = func.getFunctionType();
|
||||
mlir::Value addr = fir::AddrOfOp::create(
|
||||
builder, loc, globalOp.resultType(), globalOp.getSymbol());
|
||||
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
|
||||
builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
|
||||
fir::CallOp::create(builder, loc, func, args);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize the module after all variables are registered so the
|
||||
// runtime populates managed variable unified memory pointers.
|
||||
mlir::func::FuncOp initFunc =
|
||||
fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc, builder);
|
||||
auto initFTy = initFunc.getFunctionType();
|
||||
llvm::SmallVector<mlir::Value> initArgs{
|
||||
fir::runtime::createArguments(builder, loc, initFTy, registeredMod)};
|
||||
fir::CallOp::create(builder, loc, initFunc, initArgs);
|
||||
}
|
||||
mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{});
|
||||
|
||||
|
||||
@ -13,6 +13,7 @@
|
||||
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
|
||||
#include "flang/Optimizer/Dialect/FIRDialect.h"
|
||||
#include "flang/Optimizer/Dialect/FIROps.h"
|
||||
#include "flang/Optimizer/Dialect/FIRType.h"
|
||||
#include "flang/Optimizer/Transforms/Passes.h"
|
||||
#include "flang/Runtime/CUDA/common.h"
|
||||
#include "flang/Runtime/CUDA/descriptor.h"
|
||||
@ -48,6 +49,8 @@ static mlir::Value createConvertOp(mlir::PatternRewriter &rewriter,
|
||||
return val;
|
||||
}
|
||||
|
||||
static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"};
|
||||
|
||||
struct CUFDeviceAddressOpConversion
|
||||
: public mlir::OpRewritePattern<cuf::DeviceAddressOp> {
|
||||
using OpRewritePattern::OpRewritePattern;
|
||||
@ -59,10 +62,25 @@ struct CUFDeviceAddressOpConversion
|
||||
mlir::LogicalResult
|
||||
matchAndRewrite(cuf::DeviceAddressOp op,
|
||||
mlir::PatternRewriter &rewriter) const override {
|
||||
if (auto global = symTab.lookup<fir::GlobalOp>(
|
||||
op.getHostSymbol().getRootReference().getValue())) {
|
||||
auto symName = op.getHostSymbol().getRootReference().getValue();
|
||||
if (auto global = symTab.lookup<fir::GlobalOp>(symName)) {
|
||||
auto mod = op->getParentOfType<mlir::ModuleOp>();
|
||||
mlir::Location loc = op.getLoc();
|
||||
|
||||
// For non-allocatable managed globals, CUFAddConstructor created a
|
||||
// companion pointer global (@sym.managed.ptr) that holds the unified
|
||||
// memory address. Load from it instead of calling CUFGetDeviceAddress.
|
||||
std::string ptrGlobalName = (symName + managedPtrSuffix).str();
|
||||
if (auto ptrGlobal = symTab.lookup<fir::GlobalOp>(ptrGlobalName)) {
|
||||
auto ptrRef = fir::AddrOfOp::create(
|
||||
rewriter, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
|
||||
auto rawPtr = fir::LoadOp::create(rewriter, loc, ptrRef);
|
||||
auto converted =
|
||||
fir::ConvertOp::create(rewriter, loc, op.getType(), rawPtr);
|
||||
rewriter.replaceOp(op, converted);
|
||||
return success();
|
||||
}
|
||||
|
||||
auto hostAddr = fir::AddrOfOp::create(
|
||||
rewriter, loc, fir::ReferenceType::get(global.getType()),
|
||||
op.getHostSymbol());
|
||||
|
||||
@ -312,3 +312,19 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
|
||||
// CHECK-LABEL: gpu.func @_QMkernelsPassign
|
||||
// CHECK: %[[ADDROF:.*]] = llvm.mlir.addressof @_QMkernelsEinitial_val : !llvm.ptr<4>
|
||||
// CHECK: %{{.*}} = llvm.addrspacecast %[[ADDROF]] : !llvm.ptr<4> to !llvm.ptr
|
||||
|
||||
// -----
|
||||
|
||||
// Test that non-allocatable managed globals inside gpu.module get
|
||||
// addr_space = 1 (Global) and the nvvm.managed annotation.
|
||||
|
||||
module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
|
||||
gpu.module @cuda_device_mod {
|
||||
fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
|
||||
%0 = fir.zero_bits !fir.array<100xi32>
|
||||
fir.has_value %0 : !fir.array<100xi32>
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: llvm.mlir.global external @_QMtestEmanx() {addr_space = 1 : i32, nvvm.managed} : !llvm.array<100 x i32>
|
||||
|
||||
@ -29,7 +29,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
|
||||
// CHECK-DAG: %[[BOX:.*]] = fir.address_of(@_QMmtestsEndev) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
|
||||
// CHECK-DAG: %[[BOXREF:.*]] = fir.convert %[[BOX]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<i8>
|
||||
// CHECK-DAG: fir.call @_FortranACUFRegisterVariable(%[[MODULE:.*]], %[[BOXREF]], %{{.*}}, %{{.*}})
|
||||
//
|
||||
// CHECK: fir.call @_FortranACUFInitModule
|
||||
|
||||
// -----
|
||||
|
||||
@ -78,3 +78,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i
|
||||
// CHECK: llvm.func internal @__cudaFortranConstructor()
|
||||
// CHECK: fir.address_of(@_QMmEa00)
|
||||
// CHECK: fir.call @_FortranACUFRegisterVariable
|
||||
// CHECK: fir.call @_FortranACUFInitModule
|
||||
|
||||
// -----
|
||||
|
||||
// Non-allocatable managed global: should create pointer global in
|
||||
// __nv_managed_data__ and register with CUFRegisterManagedVariable.
|
||||
//
|
||||
// Fortran source:
|
||||
// module test
|
||||
// integer*4, managed :: manx(100)
|
||||
// contains
|
||||
// attributes(global) subroutine kernel()
|
||||
// end subroutine
|
||||
// end module
|
||||
|
||||
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f32, dense<32> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
|
||||
|
||||
fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
|
||||
%0 = fir.zero_bits !fir.array<100xi32>
|
||||
fir.has_value %0 : !fir.array<100xi32>
|
||||
}
|
||||
|
||||
gpu.module @cuda_device_mod {
|
||||
}
|
||||
}
|
||||
|
||||
// Pointer global should be created with section attribute.
|
||||
// CHECK: fir.global internal @_QMtestEmanx.managed.ptr {section = "__nv_managed_data__"} : !fir.llvm_ptr<i8>
|
||||
// CHECK: fir.zero_bits !fir.llvm_ptr<i8>
|
||||
|
||||
// Constructor should register with CUFRegisterManagedVariable then init module.
|
||||
// CHECK: llvm.func internal @__cudaFortranConstructor()
|
||||
// CHECK: fir.address_of(@_QMtestEmanx.managed.ptr) : !fir.ref<!fir.llvm_ptr<i8>>
|
||||
// CHECK: fir.call @_FortranACUFRegisterManagedVariable
|
||||
// CHECK: fir.call @_FortranACUFInitModule
|
||||
|
||||
@ -12,3 +12,42 @@ func.func @_QPxa(%arg0: !fir.ref<!fir.array<?xi32>> {cuf.data_attr = #cuf.cuda<d
|
||||
|
||||
// CHECK-LABEL: func.func @_QPxa
|
||||
// CHECK: fir.call @_FortranACUFGetDeviceAddress
|
||||
|
||||
// -----
|
||||
|
||||
// Non-allocatable managed global with companion pointer global:
|
||||
// cuf.device_address should load from the pointer global instead of
|
||||
// calling CUFGetDeviceAddress.
|
||||
//
|
||||
// Fortran source:
|
||||
// module test
|
||||
// integer*4, managed :: manx(100)
|
||||
// end module
|
||||
// subroutine user()
|
||||
// use test
|
||||
// manx(1) = 42
|
||||
// end subroutine
|
||||
|
||||
fir.global @_QMtestEmanx {data_attr = #cuf.cuda<managed>} : !fir.array<100xi32> {
|
||||
%0 = fir.zero_bits !fir.array<100xi32>
|
||||
fir.has_value %0 : !fir.array<100xi32>
|
||||
}
|
||||
|
||||
fir.global internal @_QMtestEmanx.managed.ptr {section = "__nv_managed_data__"} : !fir.llvm_ptr<i8> {
|
||||
%0 = fir.zero_bits !fir.llvm_ptr<i8>
|
||||
fir.has_value %0 : !fir.llvm_ptr<i8>
|
||||
}
|
||||
|
||||
func.func @_QPuser() {
|
||||
%c100 = arith.constant 100 : index
|
||||
%0 = cuf.device_address @_QMtestEmanx -> !fir.ref<!fir.array<100xi32>>
|
||||
%1 = fir.shape %c100 : (index) -> !fir.shape<1>
|
||||
%2 = fir.declare %0(%1) {uniq_name = "_QMtestEmanx"} : (!fir.ref<!fir.array<100xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<100xi32>>
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func.func @_QPuser
|
||||
// CHECK-NOT: fir.call @_FortranACUFGetDeviceAddress
|
||||
// CHECK: %[[PTR_REF:.*]] = fir.address_of(@_QMtestEmanx.managed.ptr) : !fir.ref<!fir.llvm_ptr<i8>>
|
||||
// CHECK: %[[RAW_PTR:.*]] = fir.load %[[PTR_REF]] : !fir.ref<!fir.llvm_ptr<i8>>
|
||||
// CHECK: %[[ADDR:.*]] = fir.convert %[[RAW_PTR]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.array<100xi32>>
|
||||
|
||||
@ -637,3 +637,39 @@ end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPsub34
|
||||
! CHECK: cuf.data_transfer %{{.*}} to %{{.*}} {hasManagedOrUnifedSymbols, transfer_kind = #cuf.cuda_transfer<host_device>} : f16, !fir.box<!fir.array<?xf16>>
|
||||
|
||||
module managed_mod
|
||||
integer, managed :: marray(10)
|
||||
end module
|
||||
|
||||
! Test that assigning a host array or a scalar constant to a non-allocatable
! module managed array does NOT generate cuf.data_transfer.
subroutine sub35()
|
||||
use managed_mod
|
||||
integer :: host_arr(10)
|
||||
marray = host_arr
|
||||
marray = 0
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPsub35()
|
||||
! CHECK-NOT: cuf.data_transfer
|
||||
|
||||
! Test that host_var = managed_module_var does NOT generate cuf.data_transfer
|
||||
! (managed memory is host-accessible, so direct assignment suffices).
|
||||
subroutine sub36()
|
||||
use managed_mod
|
||||
integer :: host_arr(10)
|
||||
host_arr = marray
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPsub36()
|
||||
! CHECK-NOT: cuf.data_transfer
|
||||
|
||||
! Test that device_var = managed_module_var DOES generate cuf.data_transfer
|
||||
! (device memory requires explicit cudaMemcpy).
|
||||
subroutine sub37()
|
||||
use managed_mod
|
||||
integer, device :: dev_arr(10)
|
||||
dev_arr = marray
|
||||
end subroutine
|
||||
|
||||
! CHECK-LABEL: func.func @_QPsub37()
|
||||
! CHECK: cuf.data_transfer
|
||||
|
||||
@ -96,6 +96,9 @@ def NVVM_Dialect : Dialect {
|
||||
/// nvvm.cluster_dim attributes.
|
||||
static StringRef getBlocksAreClustersAttrName() { return "nvvm.blocksareclusters"; }
|
||||
|
||||
/// Get the name of the attribute used to annotate managed global variables.
|
||||
static StringRef getManagedAttrName() { return "nvvm.managed"; }
|
||||
|
||||
/// Verify an attribute from this dialect on the argument at 'argIndex' for
|
||||
/// the region at 'regionIndex' on the given operation. Returns failure if
|
||||
/// the verification failed, success otherwise. This hook may optionally be
|
||||
|
||||
@ -705,11 +705,28 @@ public:
|
||||
return failure();
|
||||
}
|
||||
|
||||
/// Attaches module-level metadata for functions marked as kernels.
|
||||
/// Attaches module-level metadata for functions marked as kernels
|
||||
/// and managed annotations for global variables.
|
||||
LogicalResult
|
||||
amendOperation(Operation *op, ArrayRef<llvm::Instruction *> instructions,
|
||||
NamedAttribute attribute,
|
||||
LLVM::ModuleTranslation &moduleTranslation) const final {
|
||||
if (auto globalOp = dyn_cast<LLVM::GlobalOp>(op)) {
|
||||
if (attribute.getName() == NVVM::NVVMDialect::getManagedAttrName()) {
|
||||
auto *gv = cast<llvm::GlobalVariable>(
|
||||
moduleTranslation.lookupGlobal(globalOp));
|
||||
llvm::Module *m = gv->getParent();
|
||||
llvm::LLVMContext &ctx = m->getContext();
|
||||
llvm::NamedMDNode *md = m->getOrInsertNamedMetadata("nvvm.annotations");
|
||||
md->addOperand(llvm::MDNode::get(
|
||||
ctx, {llvm::ConstantAsMetadata::get(gv),
|
||||
llvm::MDString::get(ctx, "managed"),
|
||||
llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
|
||||
llvm::Type::getInt32Ty(ctx), 1))}));
|
||||
}
|
||||
return success();
|
||||
}
|
||||
|
||||
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
|
||||
if (!func)
|
||||
return failure();
|
||||
|
||||
@ -915,3 +915,10 @@ llvm.func @nanosleep(%duration: i32) {
|
||||
nvvm.nanosleep %duration
|
||||
llvm.return
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK: @managed_g = addrspace(1) global i32 0
|
||||
// CHECK: !nvvm.annotations = !{![[MANAGED:[0-9]+]]}
|
||||
// CHECK: ![[MANAGED]] = !{ptr addrspace(1) @managed_g, !"managed", i32 1}
|
||||
llvm.mlir.global external @managed_g(0 : i32) {addr_space = 1 : i32, nvvm.managed} : i32
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user