llvm-project/flang/lib/Optimizer/Transforms/CUDA/CUFDeviceFuncTransform.cpp
Valentin Clement (バレンタイン クレメン) 320c330989
[flang][cuda] Update visibility of declaration copied to in gpu.module (#179725)
https://github.com/llvm/llvm-project/pull/179362 changes which op is
checked for visibility during nested symbol resolution. This causes
issues in the CUDA Fortran pipeline and makes some lookups fail. Update
the visibility of declarations copied to the gpu.module to nested.
2026-02-04 18:49:40 +00:00

251 lines
9.6 KiB
C++

//===-- CUFDeviceFuncTransform.cpp ----------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Support/InternalNames.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Index/IR/IndexOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringSet.h"
// Define the GEN_PASS_DEF_ selector for this pass before including
// Passes.h.inc. The pass class below derives from
// fir::impl::CUFDeviceFuncTransformBase, which is presumably the definition
// this include provides.
namespace fir {
#define GEN_PASS_DEF_CUFDEVICEFUNCTRANSFORM
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir
using namespace mlir;
namespace {
/// Module pass that migrates CUDA Fortran device procedures into the GPU
/// module returned by cuf::getOrCreateGPUModule.
///
/// Functions carrying a cuf proc attribute of kind Device, Global, GridGlobal
/// or HostDevice are turned into gpu.func operations (or cloned as-is when
/// they are mere declarations). Functions they call, and functions called from
/// cuf.kernel operations, are cloned into the GPU module as well.
class CUFDeviceFuncTransform
    : public fir::impl::CUFDeviceFuncTransformBase<CUFDeviceFuncTransform> {
  using CUFDeviceFuncTransformBase<
      CUFDeviceFuncTransform>::CUFDeviceFuncTransformBase;

  /// Build a detached gpu.func with the same name, signature and body as
  /// \p funcOp.
  ///
  /// \param funcOp     source function; it must have a body.
  /// \param isGlobal   when true the result is tagged with the GPU dialect
  ///                   kernel attribute.
  /// \param computeCap compute capability; the cluster-size launch bound is
  ///                   only forwarded when it is >= 90.
  /// \return the new gpu.func. It is not inserted anywhere; the caller owns
  ///         its placement.
  static gpu::GPUFuncOp createGPUFuncOp(mlir::func::FuncOp funcOp,
                                        bool isGlobal, int computeCap) {
    mlir::MLIRContext *ctx = funcOp.getContext();
    mlir::OpBuilder builder(ctx);
    mlir::Location loc = funcOp.getLoc();

    // The device function has the same argument and result types as the
    // source function.
    mlir::FunctionType type = mlir::FunctionType::get(
        ctx, funcOp.getArgumentTypes(), funcOp.getResultTypes());
    auto deviceFuncOp =
        gpu::GPUFuncOp::create(builder, loc, funcOp.getName(), type,
                               mlir::TypeRange{}, mlir::TypeRange{});
    if (isGlobal)
      deviceFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                            builder.getUnitAttr());

    // Map every source argument onto the matching argument of the gpu.func
    // entry block before cloning so that the cloned body refers to them.
    mlir::Region &deviceFuncBody = deviceFuncOp.getBody();
    mlir::Block &entryBlock = deviceFuncBody.front();
    mlir::IRMapping map;
    for (auto [srcArg, dstArg] :
         llvm::zip(funcOp.getArguments(), entryBlock.getArguments()))
      map.map(srcArg, dstArg);
    funcOp.getBody().cloneInto(&deviceFuncBody, map);

    // func.return is replaced by gpu.return carrying the same operands.
    deviceFuncOp.walk([](func::ReturnOp op) {
      mlir::OpBuilder replacer(op);
      gpu::ReturnOp gpuReturnOp = gpu::ReturnOp::create(replacer, op.getLoc());
      gpuReturnOp->setOperands(op.getOperands());
      op.erase();
    });

    // Move the operations of the cloned entry block into the entry block
    // that gpu.func already had, then drop the now-empty clone.
    mlir::Block *clonedFuncOpEntry = map.lookup(&funcOp.front());
    entryBlock.getOperations().splice(entryBlock.getOperations().end(),
                                      clonedFuncOpEntry->getOperations());
    clonedFuncOpEntry->erase();

    // Translate the CUF launch bounds, when present, into NVVM attributes.
    if (auto launchBoundsAttr =
            funcOp.getOperation()->getAttrOfType<cuf::LaunchBoundsAttr>(
                cuf::getLaunchBoundsAttrName())) {
      auto maxTPB = launchBoundsAttr.getMaxTPB().getInt();
      auto maxntid =
          builder.getDenseI32ArrayAttr({static_cast<int32_t>(maxTPB), 1, 1});
      deviceFuncOp->setAttr(NVVM::NVVMDialect::getMaxntidAttrName(), maxntid);
      deviceFuncOp->setAttr(NVVM::NVVMDialect::getMinctasmAttrName(),
                            launchBoundsAttr.getMinBPM());
      if (computeCap >= 90 && launchBoundsAttr.getUpperBoundClusterSize())
        deviceFuncOp->setAttr(NVVM::NVVMDialect::getClusterMaxBlocksAttrName(),
                              launchBoundsAttr.getUpperBoundClusterSize());
    }
    return deviceFuncOp;
  }

  /// Replace \p funcOp in \p mod by a function with the same name, type,
  /// visibility and attributes whose body only returns.
  ///
  /// When the function has results, the stub returns one fir.undefined value
  /// per result type so that the func.return matches the function signature.
  ///
  /// \param funcOp function to replace; it is erased through \p symTab.
  /// \param symTab symbol table in which the stub is registered.
  /// \param mod    module at whose end the stub is created.
  static void createHostStub(mlir::func::FuncOp funcOp,
                             mlir::SymbolTable &symTab, mlir::ModuleOp mod) {
    mlir::Location loc = funcOp.getLoc();
    mlir::OpBuilder modBuilder(mod.getBodyRegion());
    modBuilder.setInsertionPointToEnd(mod.getBody());
    auto emptyStub = func::FuncOp::create(modBuilder, loc, funcOp.getName(),
                                          funcOp.getFunctionType());
    emptyStub.setVisibility(funcOp.getVisibility());
    emptyStub->setAttrs(funcOp->getAttrs());

    mlir::Block *entryBlock = emptyStub.addEntryBlock();
    modBuilder.setInsertionPointToEnd(entryBlock);
    // A bare func.return is only valid for a function without results, so
    // provide an undefined placeholder for every declared result.
    llvm::SmallVector<mlir::Value> stubResults;
    for (mlir::Type resTy : funcOp.getResultTypes())
      stubResults.push_back(
          fir::UndefOp::create(modBuilder, loc, resTy).getResult());
    func::ReturnOp::create(modBuilder, loc, stubResults);

    // Drop the original first so that the stub can take over its symbol name.
    symTab.erase(funcOp);
    symTab.insert(emptyStub);
  }

  /// Return true when \p funcOp carries a cuf proc attribute whose value is
  /// Device, Global, GridGlobal or HostDevice.
  static bool isDeviceFunc(mlir::func::FuncOp funcOp) {
    auto cudaProcAttr =
        funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
            cuf::getProcAttrName());
    if (!cudaProcAttr)
      return false;
    switch (cudaProcAttr.getValue()) {
    case cuf::ProcAttribute::Device:
    case cuf::ProcAttribute::Global:
    case cuf::ProcAttribute::GridGlobal:
    case cuf::ProcAttribute::HostDevice:
      return true;
    default:
      return false;
    }
  }

  void runOnOperation() override {
    // Working on Module operation because inserting/removing function from the
    // module is not thread-safe.
    ModuleOp mod = getOperation();
    mlir::SymbolTable symbolTable(mod);
    mlir::OpBuilder builder(mod.getContext());
    gpu::GPUModuleOp gpuMod = cuf::getOrCreateGPUModule(mod, symbolTable);
    mlir::SymbolTable gpuModSymTab(gpuMod);

    llvm::SetVector<mlir::func::FuncOp> funcsToClone;
    llvm::SetVector<mlir::func::FuncOp> deviceFuncs;
    llvm::SetVector<mlir::func::FuncOp> keepInModule;
    // Name -> first device function found with that name; used to resolve
    // binding-table references without rescanning deviceFuncs.
    llvm::StringMap<mlir::func::FuncOp> deviceFuncsByName;

    // Look for all functions to migrate to the GPU module.
    mod.walk([&](mlir::func::FuncOp op) {
      if (!isDeviceFunc(op))
        return;
      deviceFuncs.insert(op);
      deviceFuncsByName.try_emplace(op.getSymName(), op);
    });

    // Record the callee of a direct call for cloning unless it is itself a
    // device function (those are migrated below).
    auto processCallOp = [&](fir::CallOp op) {
      auto callee = op.getCallee();
      if (!callee)
        return;
      auto func =
          symbolTable.lookup<mlir::func::FuncOp>(callee->getLeafReference());
      // The lookup yields a null op when the symbol is not a func.func of
      // this module; there is nothing to clone in that case.
      if (!func)
        return;
      if (!deviceFuncs.contains(func))
        funcsToClone.insert(func);
    };

    // Gather all functions called by device functions.
    for (auto funcOp : deviceFuncs) {
      funcOp.walk([&](fir::CallOp op) { processCallOp(op); });
      funcOp.walk([&](fir::DispatchOp op) {
        TODO(op.getLoc(), "type-bound procedure call with dynamic dispatch "
                          "in device procedure");
      });
    }

    // Functions that are referenced in a derived-type binding table must be
    // kept in the host module to avoid LLVM dialect verification errors.
    for (auto globalOp : mod.getOps<fir::GlobalOp>()) {
      if (!globalOp.getName().contains(fir::kBindingTableSeparator))
        continue;
      globalOp.walk([&](fir::AddrOfOp addrOfOp) {
        auto it = deviceFuncsByName.find(
            addrOfOp.getSymbol().getLeafReference().getValue());
        if (it != deviceFuncsByName.end())
          keepInModule.insert(it->getValue());
      });
    }

    // Gather all functions called by CUF kernels.
    mod.walk([&](cuf::KernelOp kernelOp) {
      kernelOp.walk([&](fir::CallOp op) { processCallOp(op); });
      kernelOp.walk([&](fir::DispatchOp op) {
        TODO(op.getLoc(),
             "type-bound procedure call with dynamic dispatch in cuf kernel");
      });
    });

    for (auto funcOp : funcsToClone)
      gpuModSymTab.insert(funcOp->clone());

    for (auto funcOp : deviceFuncs) {
      auto cudaProcAttr =
          funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
              cuf::getProcAttrName());
      const cuf::ProcAttribute procKind = cudaProcAttr.getValue();
      const bool isGlobal = procKind == cuf::ProcAttribute::Global ||
                            procKind == cuf::ProcAttribute::GridGlobal;

      if (funcOp.isDeclaration()) {
        // Declarations have no body to convert: copy them as func.func.
        mlir::Operation *clonedOp = funcOp->clone();
        if (isGlobal) {
          clonedOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                            builder.getUnitAttr());
          clonedOp->removeAttr(cuf::getProcAttrName());
          // Kernel declarations copied to the gpu.module get nested
          // visibility.
          if (auto clonedFunc = mlir::dyn_cast<func::FuncOp>(clonedOp))
            clonedFunc.setNested();
        }
        gpuModSymTab.insert(clonedOp);
        continue;
      }

      gpu::GPUFuncOp deviceFuncOp =
          createGPUFuncOp(funcOp, isGlobal, computeCap);
      gpuModSymTab.insert(deviceFuncOp);

      // Host-device functions stay untouched in the host module.
      if (procKind == cuf::ProcAttribute::HostDevice)
        continue;

      // If the function is a global, we need to keep the host side
      // declaration for the kernel registration. Currently we just
      // erase its body but in the future, the body should be rewritten
      // to be able to launch CUDA Fortran kernel from C code.
      if (isGlobal || keepInModule.contains(funcOp))
        createHostStub(funcOp, symbolTable, mod);
      else
        funcOp.erase();
    }
  }
};
} // end anonymous namespace