https://github.com/llvm/llvm-project/pull/179362 changes which op is checked for visibility during nested symbol resolution. This causes issues in the CUDA Fortran pipeline and makes some lookups fail. Update the visibility of declarations copied to the gpu.module to nested.
251 lines
9.6 KiB
C++
251 lines
9.6 KiB
C++
//===-- CUFDeviceFuncTransform.cpp ----------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "flang/Optimizer/Builder/CUFCommon.h"
|
|
#include "flang/Optimizer/Builder/Todo.h"
|
|
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
|
|
#include "flang/Optimizer/Dialect/FIRAttr.h"
|
|
#include "flang/Optimizer/Dialect/FIRDialect.h"
|
|
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
|
|
#include "flang/Optimizer/Support/InternalNames.h"
|
|
#include "flang/Optimizer/Transforms/Passes.h"
|
|
#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
|
|
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
|
|
#include "mlir/Dialect/Index/IR/IndexDialect.h"
|
|
#include "mlir/Dialect/Index/IR/IndexOps.h"
|
|
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
|
|
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
|
|
#include "mlir/Dialect/SCF/IR/SCF.h"
|
|
#include "mlir/IR/IRMapping.h"
|
|
#include "mlir/Pass/Pass.h"
|
|
#include "mlir/Transforms/RegionUtils.h"
|
|
#include "llvm/ADT/SetVector.h"
|
|
#include "llvm/ADT/StringSet.h"
|
|
|
|
namespace fir {
|
|
#define GEN_PASS_DEF_CUFDEVICEFUNCTRANSFORM
|
|
#include "flang/Optimizer/Transforms/Passes.h.inc"
|
|
} // namespace fir
|
|
|
|
using namespace mlir;
|
|
|
|
namespace {
|
|
|
|
class CUFDeviceFuncTransform
|
|
: public fir::impl::CUFDeviceFuncTransformBase<CUFDeviceFuncTransform> {
|
|
using CUFDeviceFuncTransformBase<
|
|
CUFDeviceFuncTransform>::CUFDeviceFuncTransformBase;
|
|
|
|
/// Build a detached gpu.func that mirrors \p funcOp.
///
/// The new operation has the same name and function type as \p funcOp and a
/// clone of its body in which every func.return is replaced by a gpu.return.
/// When \p isGlobal is set, the result is tagged with the GPU dialect kernel
/// attribute. If \p funcOp carries a CUF launch-bounds attribute, it is
/// translated to the matching NVVM attributes; the cluster size is only
/// forwarded when \p computeCap is at least 90.
///
/// The builder used here has no insertion point, so the caller is responsible
/// for inserting the returned operation.
static gpu::GPUFuncOp createGPUFuncOp(mlir::func::FuncOp funcOp,
                                      bool isGlobal, int computeCap) {
  mlir::OpBuilder builder(funcOp.getContext());

  // The device function has exactly the signature of the source function and
  // no workgroup/private attributions.
  gpu::GPUFuncOp gpuFunc = gpu::GPUFuncOp::create(
      builder, funcOp.getLoc(), funcOp.getName(), funcOp.getFunctionType(),
      mlir::TypeRange{}, mlir::TypeRange{});
  if (isGlobal)
    gpuFunc->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
                     builder.getUnitAttr());

  mlir::Region &gpuBody = gpuFunc.getBody();
  mlir::Block &gpuEntry = gpuBody.front();

  // Redirect each argument of the source function to the argument at the
  // same position in the gpu.func entry block.
  mlir::IRMapping mapping;
  unsigned argIdx = 0;
  for (mlir::Value arg : funcOp.getArguments())
    mapping.map(arg, gpuEntry.getArgument(argIdx++));

  // Clone the blocks of the source body after the gpu.func entry block.
  funcOp.getBody().cloneInto(&gpuBody, mapping);

  // func.return is replaced by gpu.return with the same operands.
  gpuFunc.walk([](func::ReturnOp funcReturn) {
    mlir::OpBuilder rewriter(funcReturn);
    gpu::ReturnOp gpuReturn =
        gpu::ReturnOp::create(rewriter, funcReturn.getLoc());
    gpuReturn->setOperands(funcReturn.getOperands());
    funcReturn.erase();
  });

  // Move the operations of the cloned source entry block to the end of the
  // gpu.func entry block, then drop the emptied clone.
  mlir::Block *clonedEntry = mapping.lookup(&funcOp.front());
  gpuEntry.getOperations().splice(gpuEntry.getOperations().end(),
                                  clonedEntry->getOperations());
  clonedEntry->erase();

  // Translate CUF launch bounds into NVVM function attributes.
  if (auto launchBounds = funcOp->getAttrOfType<cuf::LaunchBoundsAttr>(
          cuf::getLaunchBoundsAttrName())) {
    auto maxThreads = launchBounds.getMaxTPB().getInt();
    gpuFunc->setAttr(
        NVVM::NVVMDialect::getMaxntidAttrName(),
        builder.getDenseI32ArrayAttr({static_cast<int32_t>(maxThreads), 1, 1}));
    gpuFunc->setAttr(NVVM::NVVMDialect::getMinctasmAttrName(),
                     launchBounds.getMinBPM());
    if (computeCap >= 90 && launchBounds.getUpperBoundClusterSize())
      gpuFunc->setAttr(NVVM::NVVMDialect::getClusterMaxBlocksAttrName(),
                       launchBounds.getUpperBoundClusterSize());
  }

  return gpuFunc;
}
|
|
|
|
/// Replace \p funcOp with a host-side stub that keeps its name, type,
/// visibility and attributes but has a body made of a single return.
///
/// The stub is created at the end of \p mod. \p funcOp is then erased through
/// \p symTab and the stub is registered in \p symTab.
///
/// NOTE(review): the return is created without operands whatever the result
/// types of \p funcOp are; presumably only procedures without results reach
/// this point - confirm against the callers.
static void createHostStub(mlir::func::FuncOp funcOp,
                           mlir::SymbolTable &symTab, mlir::ModuleOp mod) {
  const mlir::Location stubLoc = funcOp.getLoc();
  mlir::OpBuilder stubBuilder = mlir::OpBuilder::atBlockEnd(mod.getBody());

  func::FuncOp stub = func::FuncOp::create(
      stubBuilder, stubLoc, funcOp.getName(), funcOp.getFunctionType());
  stub.setVisibility(funcOp.getVisibility());
  // Carry over the full attribute dictionary of the original function.
  stub->setAttrs(funcOp->getAttrs());

  // The body of the stub is just a return.
  mlir::Block *stubEntry = stub.addEntryBlock();
  stubBuilder.setInsertionPointToEnd(stubEntry);
  func::ReturnOp::create(stubBuilder, stubLoc);

  // Remove the original first so the stub can take over its symbol name.
  symTab.erase(funcOp);
  symTab.insert(stub);
}
|
|
|
|
/// Return true when \p funcOp carries a CUF procedure attribute whose value
/// is Device, Global, GridGlobal or HostDevice. Functions without the
/// attribute, or with any other value, yield false.
static bool isDeviceFunc(mlir::func::FuncOp funcOp) {
  auto procAttr =
      funcOp->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
  if (!procAttr)
    return false;

  const auto kind = procAttr.getValue();
  return kind == cuf::ProcAttribute::Device ||
         kind == cuf::ProcAttribute::Global ||
         kind == cuf::ProcAttribute::GridGlobal ||
         kind == cuf::ProcAttribute::HostDevice;
}
|
|
|
|
void runOnOperation() override {
|
|
// Working on Module operation because inserting/removing function from the
|
|
// module is not thread-safe.
|
|
ModuleOp mod = getOperation();
|
|
mlir::SymbolTable symbolTable(getOperation());
|
|
|
|
auto *ctx = getOperation().getContext();
|
|
mlir::OpBuilder builder(ctx);
|
|
|
|
gpu::GPUModuleOp gpuMod = cuf::getOrCreateGPUModule(mod, symbolTable);
|
|
mlir::SymbolTable gpuModSymTab(gpuMod);
|
|
|
|
llvm::SetVector<mlir::func::FuncOp> funcsToClone;
|
|
llvm::SetVector<mlir::func::FuncOp> deviceFuncs;
|
|
llvm::SetVector<mlir::func::FuncOp> keepInModule;
|
|
llvm::StringSet<> deviceFuncNames;
|
|
|
|
// Look for all function to migrate to the GPU module.
|
|
mod.walk([&](mlir::func::FuncOp op) {
|
|
if (isDeviceFunc(op)) {
|
|
deviceFuncs.insert(op);
|
|
deviceFuncNames.insert(op.getSymName());
|
|
}
|
|
});
|
|
|
|
auto processCallOp = [&](fir::CallOp op) {
|
|
if (op.getCallee()) {
|
|
auto func = symbolTable.lookup<mlir::func::FuncOp>(
|
|
op.getCallee()->getLeafReference());
|
|
if (deviceFuncs.count(func) == 0)
|
|
funcsToClone.insert(func);
|
|
}
|
|
};
|
|
|
|
// Gather all function called by device functions.
|
|
for (auto funcOp : deviceFuncs) {
|
|
funcOp.walk([&](fir::CallOp op) { processCallOp(op); });
|
|
funcOp.walk([&](fir::DispatchOp op) {
|
|
TODO(op.getLoc(), "type-bound procedure call with dynamic dispatch "
|
|
"in device procedure");
|
|
});
|
|
}
|
|
|
|
// Functions that are referenced in a derived-type binding table must be
|
|
// kept in the host module to avoid LLVM dialect verification errors.
|
|
for (auto globalOp : mod.getOps<fir::GlobalOp>()) {
|
|
if (globalOp.getName().contains(fir::kBindingTableSeparator)) {
|
|
globalOp.walk([&](fir::AddrOfOp addrOfOp) {
|
|
if (deviceFuncNames.contains(addrOfOp.getSymbol().getLeafReference()))
|
|
keepInModule.insert(
|
|
*llvm::find_if(deviceFuncs, [&](mlir::func::FuncOp f) {
|
|
return f.getSymName() ==
|
|
addrOfOp.getSymbol().getLeafReference();
|
|
}));
|
|
});
|
|
}
|
|
}
|
|
|
|
// Gather all functions called by CUF kernels.
|
|
mod.walk([&](cuf::KernelOp kernelOp) {
|
|
kernelOp.walk([&](fir::CallOp op) { processCallOp(op); });
|
|
kernelOp.walk([&](fir::DispatchOp op) {
|
|
TODO(op.getLoc(),
|
|
"type-bound procedure call with dynamic dispatch in cuf kernel");
|
|
});
|
|
});
|
|
|
|
for (auto funcOp : funcsToClone)
|
|
gpuModSymTab.insert(funcOp->clone());
|
|
|
|
for (auto funcOp : deviceFuncs) {
|
|
auto cudaProcAttr =
|
|
funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
|
|
cuf::getProcAttrName());
|
|
auto isGlobal = cudaProcAttr.getValue() == cuf::ProcAttribute::Global ||
|
|
cudaProcAttr.getValue() == cuf::ProcAttribute::GridGlobal;
|
|
if (funcOp.isDeclaration()) {
|
|
mlir::Operation *clonedFuncOp = funcOp->clone();
|
|
if (isGlobal) {
|
|
clonedFuncOp->setAttr(gpu::GPUDialect::getKernelFuncAttrName(),
|
|
builder.getUnitAttr());
|
|
clonedFuncOp->removeAttr(cuf::getProcAttrName());
|
|
if (auto funcOp = mlir::dyn_cast<func::FuncOp>(clonedFuncOp))
|
|
funcOp.setNested();
|
|
}
|
|
gpuModSymTab.insert(clonedFuncOp);
|
|
} else {
|
|
gpu::GPUFuncOp deviceFuncOp =
|
|
createGPUFuncOp(funcOp, isGlobal, computeCap);
|
|
gpuModSymTab.insert(deviceFuncOp);
|
|
|
|
if (cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice) {
|
|
// If the function is a global, we need to keep the host side
|
|
// declaration for the kernel registration. Currently we just
|
|
// erase its body but in the future, the body should be rewritten
|
|
// to be able to launch CUDA Fortran kernel from C code.
|
|
if (isGlobal || keepInModule.contains(funcOp))
|
|
createHostStub(funcOp, symbolTable, mod);
|
|
else
|
|
funcOp.erase();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
} // end anonymous namespace
|