llvm-project/clang/lib/CIR/CodeGen/CIRGenCUDANV.cpp

//========- CIRGenCUDANV.cpp - Interface to NVIDIA CUDA Runtime -----=========//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CIRGenCUDARuntime.h"
#include "CIRGenFunction.h"
#include "CIRGenModule.h"
#include "mlir/IR/Operation.h"
#include "clang/AST/ASTContext.h"
#include "clang/AST/Decl.h"
#include "clang/AST/GlobalDecl.h"
#include "clang/Basic/AddressSpaces.h"
#include "clang/Basic/Cuda.h"
#include "clang/CIR/Dialect/IR/CIRDialect.h"
#include "clang/CIR/Dialect/IR/CIRTypes.h"
#include "llvm/Support/Casting.h"

using namespace clang;
using namespace clang::CIRGen;

namespace {

class CIRGenNVCUDARuntime : public CIRGenCUDARuntime {
protected:
  StringRef prefix;

  // Map a device stub function to a symbol for identifying kernel in host
  // code. For CUDA, the symbol for identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
  llvm::StringMap<mlir::Operation *> kernelHandles;

  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<mlir::Operation *, mlir::Operation *> kernelStubs;
  // Mangle context for device.
  std::unique_ptr<MangleContext> deviceMC;

private:
  void emitDeviceStubBodyNew(CIRGenFunction &cgf, cir::FuncOp fn,
                             FunctionArgList &args);
  mlir::Value prepareKernelArgs(CIRGenFunction &cgf, mlir::Location loc,
                                FunctionArgList &args);
  mlir::Operation *getKernelHandle(cir::FuncOp fn, GlobalDecl gd) override;

  mlir::Operation *getKernelStub(mlir::Operation *handle) override {
    auto it = kernelStubs.find(handle);
    assert(it != kernelStubs.end());
    return it->second;
  }
  std::string addPrefixToName(StringRef funcName) const;
  std::string addUnderscoredPrefixToName(StringRef funcName) const;

public:
  CIRGenNVCUDARuntime(CIRGenModule &cgm);
  ~CIRGenNVCUDARuntime();

  void emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
                      FunctionArgList &args) override;
};

} // namespace

std::string CIRGenNVCUDARuntime::addPrefixToName(StringRef funcName) const {
  return (prefix + funcName).str();
}

std::string
CIRGenNVCUDARuntime::addUnderscoredPrefixToName(StringRef funcName) const {
  return ("__" + prefix + funcName).str();
}

CIRGenNVCUDARuntime::CIRGenNVCUDARuntime(CIRGenModule &cgm)
    : CIRGenCUDARuntime(cgm),
      deviceMC(cgm.getASTContext().cudaNVInitDeviceMC()) {
  if (cgm.getLangOpts().OffloadViaLLVM)
    cgm.errorNYI("CIRGenNVCUDARuntime: Offload via LLVM");
  else if (cgm.getLangOpts().HIP)
    prefix = "hip";
  else
    prefix = "cuda";
}

mlir::Value CIRGenNVCUDARuntime::prepareKernelArgs(CIRGenFunction &cgf,
                                                   mlir::Location loc,
                                                   FunctionArgList &args) {
  CIRGenBuilderTy &builder = cgm.getBuilder();

  // Build void *args[] and populate with the addresses of kernel arguments.
  auto voidPtrArrayTy = cir::ArrayType::get(cgm.voidPtrTy, args.size());
  mlir::Value kernelArgs = builder.createAlloca(
      loc, cir::PointerType::get(voidPtrArrayTy), voidPtrArrayTy, "kernel_args",
      CharUnits::fromQuantity(16));

  mlir::Value kernelArgsDecayed =
      builder.createCast(cir::CastKind::array_to_ptrdecay, kernelArgs,
                         cir::PointerType::get(cgm.voidPtrTy));

  for (const auto &[i, arg] : llvm::enumerate(args)) {
    mlir::Value index =
        builder.getConstInt(loc, llvm::APInt(/*numBits=*/32, i));
    mlir::Value storePos =
        builder.createPtrStride(loc, kernelArgsDecayed, index);
    mlir::Value argAddr = cgf.getAddrOfLocalVar(arg).getPointer();
    mlir::Value argAsVoid = builder.createBitcast(argAddr, cgm.voidPtrTy);

    builder.CIRBaseBuilderTy::createStore(loc, argAsVoid, storePos);
  }

  return kernelArgsDecayed;
}

// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CIRGenNVCUDARuntime::emitDeviceStubBodyNew(CIRGenFunction &cgf,
                                                cir::FuncOp fn,
                                                FunctionArgList &args) {

  // This requires arguments to be sent to kernels in a different way.
  if (cgm.getLangOpts().OffloadViaLLVM)
    cgm.errorNYI("CIRGenNVCUDARuntime: Offload via LLVM");

  CIRGenBuilderTy &builder = cgm.getBuilder();
  mlir::Location loc = fn.getLoc();

  // For [cuda|hip]LaunchKernel, we must add another layer of indirection
  // to arguments. For example, for function `add(int a, float b)`,
  // we need to pass it as `void *args[2] = { &a, &b }`.
  mlir::Value kernelArgs = prepareKernelArgs(cgf, loc, args);

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // HIP kernel launching API name depends on -fgpu-default-stream option. For
  // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
  // it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *tuDecl = cgm.getASTContext().getTranslationUnitDecl();
  DeclContext *dc = TranslationUnitDecl::castToDeclContext(tuDecl);

  // The default stream is usually stream 0 (the legacy default stream).
  // For per-thread default stream, we need a different LaunchKernel function.
  StringRef kernelLaunchAPI = "LaunchKernel";
  if (cgm.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread)
    cgm.errorNYI("CUDA/HIP Stream per thread");

  std::string launchKernelName = addPrefixToName(kernelLaunchAPI);
  const IdentifierInfo &launchII =
      cgm.getASTContext().Idents.get(launchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (NamedDecl *result : dc->lookup(&launchII)) {
    if (FunctionDecl *fd = dyn_cast<FunctionDecl>(result))
      cudaLaunchKernelFD = fd;
  }

  if (cudaLaunchKernelFD == nullptr) {
    cgm.error(cgf.curFuncDecl->getLocation(),
              "Can't find declaration for " + launchKernelName);
    return;
  }

  // Use this function to retrieve arguments for cudaLaunchKernel:
  // int __[cuda|hip]PopCallConfiguration(dim3 *gridDim, dim3 *blockDim, size_t
  //                                *sharedMem, cudaStream_t *stream)
  //
  // Here [cuda|hip]Stream_t, while also being the 6th argument of
  // [cuda|hip]LaunchKernel, is a pointer to some opaque struct.

  mlir::Type dim3Ty = cgf.getTypes().convertType(
      cudaLaunchKernelFD->getParamDecl(1)->getType());
  mlir::Type streamTy = cgf.getTypes().convertType(
      cudaLaunchKernelFD->getParamDecl(5)->getType());

  mlir::Value gridDim =
      builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
                           "grid_dim", CharUnits::fromQuantity(8));
  mlir::Value blockDim =
      builder.createAlloca(loc, cir::PointerType::get(dim3Ty), dim3Ty,
                           "block_dim", CharUnits::fromQuantity(8));
  mlir::Value sharedMem =
      builder.createAlloca(loc, cir::PointerType::get(cgm.sizeTy), cgm.sizeTy,
                           "shared_mem", cgm.getSizeAlign());
  mlir::Value stream =
      builder.createAlloca(loc, cir::PointerType::get(streamTy), streamTy,
                           "stream", cgm.getPointerAlign());

  cir::FuncOp popConfig = cgm.createRuntimeFunction(
      cir::FuncType::get({gridDim.getType(), blockDim.getType(),
                          sharedMem.getType(), stream.getType()},
                         cgm.sInt32Ty),
      addUnderscoredPrefixToName("PopCallConfiguration"));
  cgf.emitRuntimeCall(loc, popConfig, {gridDim, blockDim, sharedMem, stream});

  // Now emit the call to cudaLaunchKernel
  // [cuda|hip]Error_t [cuda|hip]LaunchKernel(const void *func, dim3 gridDim,
  // dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              [cuda|hip]Stream_t stream);

  // We now either pick the function or the stub global for cuda, hip
  // respectively.
  mlir::Value kernel = [&]() -> mlir::Value {
    if (cir::GlobalOp globalOp = llvm::dyn_cast_or_null<cir::GlobalOp>(
            kernelHandles[fn.getSymName()])) {
      cir::PointerType kernelTy = cir::PointerType::get(globalOp.getSymType());
      mlir::Value kernelVal = cir::GetGlobalOp::create(builder, loc, kernelTy,
                                                       globalOp.getSymName());
      mlir::Value func = builder.createBitcast(kernelVal, cgm.voidPtrTy);
      return func;
    }
    if (cir::FuncOp funcOp = llvm::dyn_cast_or_null<cir::FuncOp>(
            kernelHandles[fn.getSymName()])) {
      cir::PointerType kernelTy =
          cir::PointerType::get(funcOp.getFunctionType());
      mlir::Value kernelVal =
          cir::GetGlobalOp::create(builder, loc, kernelTy, funcOp.getSymName());
      mlir::Value func = builder.createBitcast(kernelVal, cgm.voidPtrTy);
      return func;
    }
    llvm_unreachable("Expected stub handle to be cir::GlobalOp or FuncOp");
  }();

  CallArgList launchArgs;
  launchArgs.add(RValue::get(kernel),
                 cudaLaunchKernelFD->getParamDecl(0)->getType());
  launchArgs.add(
      RValue::getAggregate(Address(gridDim, CharUnits::fromQuantity(8))),
      cudaLaunchKernelFD->getParamDecl(1)->getType());
  launchArgs.add(
      RValue::getAggregate(Address(blockDim, CharUnits::fromQuantity(8))),
      cudaLaunchKernelFD->getParamDecl(2)->getType());
  launchArgs.add(RValue::get(kernelArgs),
                 cudaLaunchKernelFD->getParamDecl(3)->getType());
  launchArgs.add(
      RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, sharedMem)),
      cudaLaunchKernelFD->getParamDecl(4)->getType());
  launchArgs.add(RValue::get(builder.CIRBaseBuilderTy::createLoad(loc, stream)),
                 cudaLaunchKernelFD->getParamDecl(5)->getType());

  mlir::Type launchTy =
      cgm.getTypes().convertType(cudaLaunchKernelFD->getType());
  mlir::Operation *cudaKernelLauncherFn = cgm.createRuntimeFunction(
      cast<cir::FuncType>(launchTy), launchKernelName);
  const CIRGenFunctionInfo &callInfo =
      cgm.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  cgf.emitCall(callInfo, CIRGenCallee::forDirect(cudaKernelLauncherFn),
               ReturnValueSlot(), launchArgs);

  if (cgm.getASTContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !cgf.getLangOpts().HIP)
    cgm.errorNYI("MSVC CUDA stub handling");
}

void CIRGenNVCUDARuntime::emitDeviceStub(CIRGenFunction &cgf, cir::FuncOp fn,
                                         FunctionArgList &args) {

  if (auto globalOp =
          llvm::dyn_cast<cir::GlobalOp>(kernelHandles[fn.getSymName()])) {
    CIRGenBuilderTy &builder = cgm.getBuilder();
    mlir::Type fnPtrTy = globalOp.getSymType();
    auto sym = mlir::FlatSymbolRefAttr::get(fn.getSymNameAttr());
    auto gv = cir::GlobalViewAttr::get(fnPtrTy, sym);

    globalOp->setAttr("initial_value", gv);
    globalOp->removeAttr("sym_visibility");
    globalOp->setAttr("alignment", builder.getI64IntegerAttr(
                                       cgm.getPointerAlign().getQuantity()));
  }

  // CUDA 9.0 changed the way to launch kernels.
  if (CudaFeatureEnabled(cgm.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (cgm.getLangOpts().HIP && cgm.getLangOpts().HIPUseNewLaunchAPI) ||
      cgm.getLangOpts().OffloadViaLLVM)
    emitDeviceStubBodyNew(cgf, fn, args);
  else
    cgm.errorNYI("Emit Stub Body Legacy");
}

CIRGenCUDARuntime *clang::CIRGen::createNVCUDARuntime(CIRGenModule &cgm) {
  return new CIRGenNVCUDARuntime(cgm);
}

CIRGenNVCUDARuntime::~CIRGenNVCUDARuntime() {}

mlir::Operation *CIRGenNVCUDARuntime::getKernelHandle(cir::FuncOp fn,
                                                      GlobalDecl gd) {

  // Check if we already have a kernel handle for this function
  auto it = kernelHandles.find(fn.getSymName());
  if (it != kernelHandles.end()) {
    mlir::Operation *oldHandle = it->second;
    // Here we know that the fn did not change. Return it
    if (kernelStubs[oldHandle] == fn)
      return oldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (cgm.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      kernelStubs[oldHandle] = fn;
      return oldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall-through to creating
    // new entries.
    kernelStubs.erase(oldHandle);
  }

  // If not targeting HIP, store the function itself
  if (!cgm.getLangOpts().HIP) {
    kernelHandles[fn.getSymName()] = fn;
    kernelStubs[fn] = fn;
    return fn;
  }

  // Create a new CIR global variable to represent the kernel handle
  CIRGenBuilderTy &builder = cgm.getBuilder();
  StringRef globalName = cgm.getMangledName(
      gd.getWithKernelReferenceKind(KernelReferenceKind::Kernel));
  cir::GlobalOp globalOp = CIRGenModule::createGlobalOp(
      cgm, fn.getLoc(), globalName, fn.getFunctionType(),
      /*isConstant=*/true);

  globalOp->setAttr("alignment", builder.getI64IntegerAttr(
                                     cgm.getPointerAlign().getQuantity()));

  // Store references
  kernelHandles[fn.getSymName()] = globalOp;
  kernelStubs[globalOp] = fn;

  return globalOp;
}