llvm-project/flang/lib/Optimizer/Transforms/CUDA/CUFAddConstructor.cpp
Zhen Wang c4e6cf0abf
[flang][cuda] Support non-allocatable module-level managed variables (#188526)
Add support for non-allocatable module-level CUDA managed variables
using pointer indirection through a companion global in
__nv_managed_data__. The CUDA runtime populates this pointer with the
unified memory address via __cudaRegisterManagedVar and
__cudaInitModule.

1. Create a .managed.ptr companion global in the __nv_managed_data__
section and register it with _FortranACUFRegisterManagedVariable
(CUFAddConstructor.cpp)
2. Call __cudaInitModule after registration to populate the managed
pointer (registration.cpp)
3. Annotate managed globals in gpu.module with nvvm.managed for PTX
.attribute(.managed) generation (cuda-code-gen.mlir)
4. Suppress cuf.data_transfer for assignments to/from non-allocatable
module managed variables, since cudaMemcpy would target the shadow
address rather than the actual unified memory (tools.h)
5. Preserve cuf.data_transfer for device_var = managed_var assignments
where explicit transfer is still required
2026-03-31 16:27:08 +00:00

233 lines
10 KiB
C++

//===-- CUFAddConstructor.cpp ---------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/CodeGen/Target.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/registration.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/SmallVector.h"
namespace fir {
#define GEN_PASS_DEF_CUFADDCONSTRUCTOR
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir
using namespace Fortran::runtime::cuda;
namespace {
static constexpr llvm::StringRef cudaFortranCtorName{
"__cudaFortranConstructor"};
static constexpr llvm::StringRef managedPtrSuffix{".managed.ptr"};
/// Create an 8-byte pointer global in the __nv_managed_data__ section.
/// The CUDA runtime populates this pointer with the unified memory address
/// when the module is initialized via __cudaInitModule.
static fir::GlobalOp createManagedPointerGlobal(fir::FirOpBuilder &builder,
mlir::ModuleOp mod,
fir::GlobalOp globalOp) {
mlir::MLIRContext *ctx = mod.getContext();
std::string ptrGlobalName = (globalOp.getSymName() + managedPtrSuffix).str();
auto ptrTy = fir::LLVMPointerType::get(ctx, mlir::IntegerType::get(ctx, 8));
mlir::OpBuilder::InsertionGuard guard(builder);
builder.setInsertionPointAfter(globalOp);
llvm::SmallVector<mlir::NamedAttribute> attrs;
attrs.push_back(
mlir::NamedAttribute(mlir::StringAttr::get(ctx, "section"),
mlir::StringAttr::get(ctx, "__nv_managed_data__")));
mlir::DenseElementsAttr initAttr = {};
auto ptrGlobal = fir::GlobalOp::create(
builder, globalOp.getLoc(), ptrGlobalName, /*isConstant=*/false,
/*isTarget=*/false, ptrTy, initAttr,
/*linkName=*/builder.createInternalLinkage(), attrs);
mlir::Region &region = ptrGlobal.getRegion();
mlir::Block *block = builder.createBlock(&region);
builder.setInsertionPointToStart(block);
mlir::Value zero = fir::ZeroOp::create(builder, globalOp.getLoc(), ptrTy);
fir::HasValueOp::create(builder, globalOp.getLoc(), zero);
return ptrGlobal;
}
struct CUFAddConstructor
: public fir::impl::CUFAddConstructorBase<CUFAddConstructor> {
void runOnOperation() override {
mlir::ModuleOp mod = getOperation();
mlir::SymbolTable symTab(mod);
mlir::OpBuilder opBuilder{mod.getBodyRegion()};
fir::FirOpBuilder builder(opBuilder, mod);
fir::KindMapping kindMap{fir::getKindMapping(mod)};
builder.setInsertionPointToEnd(mod.getBody());
mlir::Location loc = mod.getLoc();
auto *ctx = mod.getContext();
auto voidTy = mlir::LLVM::LLVMVoidType::get(ctx);
auto idxTy = builder.getIndexType();
auto funcTy =
mlir::LLVM::LLVMFunctionType::get(voidTy, {}, /*isVarArg=*/false);
std::optional<mlir::DataLayout> dl =
fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false);
if (!dl) {
mlir::emitError(mod.getLoc(),
"data layout attribute is required to perform " +
getName() + "pass");
}
// Symbol reference to CUFRegisterAllocator.
builder.setInsertionPointToEnd(mod.getBody());
auto registerFuncOp = mlir::LLVM::LLVMFuncOp::create(
builder, loc, RTNAME_STRING(CUFRegisterAllocator), funcTy);
registerFuncOp.setVisibility(mlir::SymbolTable::Visibility::Private);
auto cufRegisterAllocatorRef = mlir::SymbolRefAttr::get(
mod.getContext(), RTNAME_STRING(CUFRegisterAllocator));
builder.setInsertionPointToEnd(mod.getBody());
// Create the constructor function that call CUFRegisterAllocator.
auto func = mlir::LLVM::LLVMFuncOp::create(builder, loc,
cudaFortranCtorName, funcTy);
func.setLinkage(mlir::LLVM::Linkage::Internal);
builder.setInsertionPointToStart(func.addEntryBlock(builder));
mlir::LLVM::CallOp::create(builder, loc, funcTy, cufRegisterAllocatorRef);
auto gpuMod = symTab.lookup<mlir::gpu::GPUModuleOp>(cudaDeviceModuleName);
if (gpuMod) {
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(ctx);
auto registeredMod = cuf::RegisterModuleOp::create(
builder, loc, llvmPtrTy,
mlir::SymbolRefAttr::get(ctx, gpuMod.getName()));
fir::LLVMTypeConverter typeConverter(mod, /*applyTBAA=*/false,
/*forceUnifiedTBAATree=*/false, *dl);
// Register kernels
for (auto func : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
if (func.isKernel()) {
auto kernelName = mlir::SymbolRefAttr::get(
builder.getStringAttr(cudaDeviceModuleName),
{mlir::SymbolRefAttr::get(builder.getContext(), func.getName())});
cuf::RegisterKernelOp::create(builder, loc, kernelName,
registeredMod);
}
}
// Register variables
for (fir::GlobalOp globalOp : mod.getOps<fir::GlobalOp>()) {
auto attr = globalOp.getDataAttrAttr();
if (!attr)
continue;
bool isNonAllocManagedGlobal =
attr.getValue() == cuf::DataAttribute::Managed &&
!mlir::isa<fir::BaseBoxType>(globalOp.getType());
mlir::func::FuncOp func;
switch (attr.getValue()) {
case cuf::DataAttribute::Device:
case cuf::DataAttribute::Constant:
case cuf::DataAttribute::Managed: {
// Global variable name
std::string gblNameStr = globalOp.getSymbol().getValue().str();
gblNameStr += '\0';
mlir::Value gblName = fir::getBase(
fir::factory::createStringLiteral(builder, loc, gblNameStr));
// Global variable size
std::optional<uint64_t> size;
if (auto boxTy =
mlir::dyn_cast<fir::BaseBoxType>(globalOp.getType())) {
mlir::Type structTy = typeConverter.convertBoxTypeAsStruct(boxTy);
size = dl->getTypeSizeInBits(structTy) / 8;
}
if (!size) {
size = fir::getTypeSizeAndAlignmentOrCrash(loc, globalOp.getType(),
*dl, kindMap)
.first;
}
auto sizeVal = builder.createIntegerConstant(loc, idxTy, *size);
if (isNonAllocManagedGlobal) {
// Non-allocatable managed globals use pointer indirection:
// a companion pointer in __nv_managed_data__ holds the unified
// memory address, registered via __cudaRegisterManagedVar.
fir::GlobalOp ptrGlobal =
createManagedPointerGlobal(builder, mod, globalOp);
func = fir::runtime::getRuntimeFunc<mkRTKey(
CUFRegisterManagedVariable)>(loc, builder);
auto fTy = func.getFunctionType();
mlir::Value addr = fir::AddrOfOp::create(
builder, loc, ptrGlobal.resultType(), ptrGlobal.getSymbol());
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
fir::CallOp::create(builder, loc, func, args);
} else {
func = fir::runtime::getRuntimeFunc<mkRTKey(CUFRegisterVariable)>(
loc, builder);
auto fTy = func.getFunctionType();
mlir::Value addr = fir::AddrOfOp::create(
builder, loc, globalOp.resultType(), globalOp.getSymbol());
llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
builder, loc, fTy, registeredMod, addr, gblName, sizeVal)};
fir::CallOp::create(builder, loc, func, args);
}
} break;
default:
break;
}
}
// Initialize the module after all variables are registered so the
// runtime populates managed variable unified memory pointers.
mlir::func::FuncOp initFunc =
fir::runtime::getRuntimeFunc<mkRTKey(CUFInitModule)>(loc, builder);
auto initFTy = initFunc.getFunctionType();
llvm::SmallVector<mlir::Value> initArgs{
fir::runtime::createArguments(builder, loc, initFTy, registeredMod)};
fir::CallOp::create(builder, loc, initFunc, initArgs);
}
mlir::LLVM::ReturnOp::create(builder, loc, mlir::ValueRange{});
// Create the llvm.global_ctor with the function.
// TODO: We might want to have a utility that retrieve it if already
// created and adds new functions.
builder.setInsertionPointToEnd(mod.getBody());
llvm::SmallVector<mlir::Attribute> funcs;
funcs.push_back(
mlir::FlatSymbolRefAttr::get(mod.getContext(), func.getSymName()));
llvm::SmallVector<int> priorities;
llvm::SmallVector<mlir::Attribute> data;
priorities.push_back(0);
data.push_back(mlir::LLVM::ZeroAttr::get(mod.getContext()));
mlir::LLVM::GlobalCtorsOp::create(
builder, mod.getLoc(), builder.getArrayAttr(funcs),
builder.getI32ArrayAttr(priorities), builder.getArrayAttr(data));
}
};
} // end anonymous namespace