llvm-project/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp

345 lines
14 KiB
C++

//========- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----=========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//
#include "CIRGenCUDARuntime.h"
#include "CIRGenFunction.h"
#include "CIRGenModule.h"
#include "mlir/IR/Operation.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"
#include "clang/AST/GlobalDecl.h"
#include "clang/Basic/AddressSpaces.h"
#include "clang/Basic/Cuda.h"
#include "clang/CIR/Dialect/IR/CIRDialect.h"
#include "clang/CIR/Dialect/IR/CIRTypes.h"
#include "llvm/Support/Casting.h"
using namespace clang;
using namespace clang::CIRGen;
namespace {
class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
protected:
StringRef prefix;
// Map a device stub function to a symbol for identifying kernel in host
// code. For CUDA, the symbol for identifying the kernel is the same as the
// device stub function. For HIP, they are different.
llvm::StringMap<mlir::Operation *> kernelHandles;
// Map a kernel handle to the kernel stub.
llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs;
// Mangle context for device.
std::unique_ptr<MangleContext> deviceMC;
private:
void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
FunctionArgList &args);
mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc,
FunctionArgList &args);
mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override;
mlir::Operation *getKernelStub(mlir::Operation *handle) override {
auto it = kernelStubs.find(handle);
assert(it != kernelStubs.end());
return it->second;
}
std::string addPrefixToName(StringRef funcName) const;
std::string addUnderscoredPrefixToName(StringRef funcName) const;
public:
CIRGenNVCUDARuntime(CIRGenModule &cgm);
~CIRGenNVCUDARuntime();
void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
FunctionArgList &args) override;
};
} // namespace
std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const {
return (prefix + funcName).str();
}
std::string
CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const {
return ("__" + prefix + funcName).str();
}
CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
: CIRGenCUDARuntime(cgm),
deviceMC(cgm.getASTContext().cudaNVInitDeviceMC()) {
if (cgm.getLangOpts().OffloadViaLLVM)
cgm.errorNYI("CIRGenNVCUDARuntime: Offload via LLVM");
else if (cgm.getLangOpts().HIP)
prefix = "hip";
else
prefix = "cuda";
}
mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf,
mlir::Location loc,
FunctionArgList &args) {
CIRGenBuilderTy &builder = cgm.getBuilder();
// Build void *args[] and populate with the addresses of kernel arguments.
auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size());
mlir::Value kernelArgs = builder.createAlloca(
loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy, "kernel_args",
CharUnits::fromQuantity(16));
mlir::Value kernelArgsDecayed =
builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
cir::PointerType::get(cgm.voidPtrTy));
for (const auto &[i, arg] : llvm::enumerate(args)) {
mlir::Value index =
builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
mlir::Value storePos =
builder.createPtrStride(loc, kernelArgsDecayed, index);
mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer();
mlir::Value argAsVoid = builder.createBitcast(argAddr, cgm.voidPtrTy);
builder.CIRBaseBuilderTy::createStore(loc, argAsVoid, storePos);
}
return kernelArgsDecayed;
}
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
cir::FuncOp fn,
FunctionArgList &args) {
// This requires arguments to be sent to kernels in a different way.
if (cgm.getLangOpts().OffloadViaLLVM)
cgm.errorNYI("CIRGenNVCUDARuntime: Offload via LLVM");
CIRGenBuilderTy &builder = cgm.getBuilder();
mlir::Location loc = fn.getLoc();
// For [cuda|hip]LaunchKernel, we must add another layer of indirection
// to arguments. For example, for function `add(int a, float b)`,
// we need to pass it as `void *args[2] = { &a, &b }`.
mlir::Value kernelArgs = prepareKernelArgs(cgf, loc, args);
// Lookup cudaLaunchKernel/hipLaunchKernel function.
// HIP kernel launching API name depends on -fgpu-default-stream option. For
// the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
// it is hipLaunchKernel_spt.
// cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
// void **args, size_t sharedMem,
// cudaStream_t stream);
// hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
// dim3 blockDim, void **args,
// size_t sharedMem, hipStream_t stream);
TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl();
DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl);
// The default stream is usually stream 0 (the legacy default stream).
// For per-thread default stream, we need a different LaunchKernel function.
StringRef kernelLaunchAPI = "LaunchKernel";
if (cgm.getLangOpts().GPUDefaultStream ==
LangOptions::GPUDefaultStreamKind::PerThread)
cgm.errorNYI("CUDA/HIP Stream per thread");
std::string launchKernelName = addPrefixToName(kernelLaunchAPI);
const IdentifierInfo &launchII =
cgm.getASTContext().Idents.get(launchKernelName);
FunctionDecl *cudaLaunchKernelFD = nullptr;
for (NamedDecl *result : dc->lookup(&launchII)) {
if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result))
cudaLaunchKernelFD = fd;
}
if (cudaLaunchKernelFD == nullptr) {
cgm.error(cgf.curFuncDecl->getLocation(),
"Can't find declaration for " + launchKernelName);
return;
}
// Use this function to retrieve arguments for cudaLaunchKernel:
// int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
// *sharedMem, cudaStream_t *stream)
//
// Here [cuda|hip]Stream_t, while also being the 6th argument of
// [cuda|hip]LaunchKernel, is a pointer to some opaque struct.
mlir::Type dim3Ty = cgf.getTypes().convertType(
cudaLaunchKernelFD->getParamDecl(1)->getType());
mlir::Type streamTy = cgf.getTypes().convertType(
cudaLaunchKernelFD->getParamDecl(5)->getType());
mlir::Value gridDim =
builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
"grid_dim", CharUnits::fromQuantity(8));
mlir::Value blockDim =
builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
"block_dim", CharUnits::fromQuantity(8));
mlir::Value sharedMem =
builder.createAlloca(loc, cir::PointerType::get(cgm.sizeTy), cgm.sizeTy,
"shared_mem", cgm.getSizeAlign());
mlir::Value stream =
builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy,
"stream", cgm.getPointerAlign());
cir::FuncOp popConfig = cgm.createRuntimeFunction(
cir::FuncType::get({gridDim.getType(), blockDim.getType(),
sharedMem.getType(), stream.getType()},
cgm.sInt32Ty),
addUnderscoredPrefixToName("PopCallConfiguration"));
cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});
// Now emit the call to cudaLaunchKernel
// [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
// dim3 blockDim,
// void **args, size_t sharedMem,
// [cuda|hip]Stream_t stream);
// We now either pick the function or the stub global for cuda, hip
// respectively.
mlir::Value kernel = [&]() -> mlir::Value {
if (cir::GlobalOp globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
kernelHandles[fn.getSymName()])) {
cir::PointerType kernelTy = cir::PointerType::get(globalOp.getSymType());
mlir::Value kernelVal = cir::GetGlobalOp::create(builder, loc, kernelTy,
globalOp.getSymName());
mlir::Value func = builder.createBitcast(kernelVal, cgm.voidPtrTy);
return func;
}
if (cir::FuncOp funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
kernelHandles[fn.getSymName()])) {
cir::PointerType kernelTy =
cir::PointerType::get(funcOp.getFunctionType());
mlir::Value kernelVal =
cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName());
mlir::Value func = builder.createBitcast(kernelVal, cgm.voidPtrTy);
return func;
}
llvm_unreachable("Expected stub handle to be cir::GlobalOp or FuncOp");
}();
CallArgList launchArgs;
launchArgs.add(RValue::get(kernel),
cudaLaunchKernelFD->getParamDecl(0)->getType());
launchArgs.add(
RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))),
cudaLaunchKernelFD->getParamDecl(1)->getType());
launchArgs.add(
RValue::getAggregate(Address(blockDim, CharUnits::fromQuantity(8))),
cudaLaunchKernelFD->getParamDecl(2)->getType());
launchArgs.add(RValue::get(kernelArgs),
cudaLaunchKernelFD->getParamDecl(3)->getType());
launchArgs.add(
RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)),
cudaLaunchKernelFD->getParamDecl(4)->getType());
launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)),
cudaLaunchKernelFD->getParamDecl(5)->getType());
mlir::Type launchTy =
cgm.getTypes().convertType(cudaLaunchKernelFD->getType());
mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction(
cast<cir::FuncType>(launchTy), launchKernelName);
const CIRGenFunctionInfo &callInfo =
cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn),
ReturnValueSlot(), launchArgs);
if (cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
!cgf.getLangOpts().HIP)
cgm.errorNYI("MSVC CUDA stub handling");
}
void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
FunctionArgList &args) {
if (auto globalOp =
llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) {
CIRGenBuilderTy &builder = cgm.getBuilder();
mlir::Type fnPtrTy = globalOp.getSymType();
auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym);
globalOp->setAttr("initial_value", gv);
globalOp->removeAttr("sym_visibility");
globalOp->setAttr("alignment", builder.getI64IntegerAttr(
cgm.getPointerAlign().getQuantity()));
}
// CUDA 9.0 changed the way to launch kernels.
if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(),
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
(cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
cgm.getLangOpts().OffloadViaLLVM)
emitDeviceStubBodyNew(cgf, fn, args);
else
cgm.errorNYI("Emit Stub Body Legacy");
}
CIRGenCUDARuntime *clang::CIRGen::createNVCUDARuntime(CIRGenModule &cgm) {
return new CIRGenNVCUDARuntime(cgm);
}
CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {}
mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
GlobalDecl gd) {
// Check if we already have a kernel handle for this function
auto it = kernelHandles.find(fn.getSymName());
if (it != kernelHandles.end()) {
mlir::Operation *oldHandle = it->second;
// Here we know that the fn did not change. Return it
if (kernelStubs[oldHandle] == fn)
return oldHandle;
// We've found the function name, but F itself has changed, so we need to
// update the references.
if (cgm.getLangOpts().HIP) {
// For HIP compilation the handle itself does not change, so we only need
// to update the Stub value.
kernelStubs[oldHandle] = fn;
return oldHandle;
}
// For non-HIP compilation, erase the old Stub and fall-through to creating
// new entries.
kernelStubs.erase(oldHandle);
}
// If not targeting HIP, store the function itself
if (!cgm.getLangOpts().HIP) {
kernelHandles[fn.getSymName()] = fn;
kernelStubs[fn] = fn;
return fn;
}
// Create a new CIR global variable to represent the kernel handle
CIRGenBuilderTy &builder = cgm.getBuilder();
StringRef globalName = cgm.getMangledName(
gd.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
cir::GlobalOp globalOp = CIRGenModule::createGlobalOp(
cgm, fn.getLoc(), globalName, fn.getFunctionType(),
/*isConstant=*/true);
globalOp->setAttr("alignment", builder.getI64IntegerAttr(
cgm.getPointerAlign().getQuantity()));
// Store references
kernelHandles[fn.getSymName()] = globalOp;
kernelStubs[globalOp] = fn;
return globalOp;
}