1. When converting from the GPU dialect to the ROCDL dialect, if the
function that contains a gpu.thread_id or gpu.block_id op is annotated
with gpu.known_{block,grid}_size, use that size to set a "range"
attribute on the corresponding rocdl intrinsic so that the LLVM
frontend can optimize based on that range information.
1b. When translating from the rocdl dialect to LLVM IR, use the
"range" attribute, if present, to set !range metadata on the relevant
function call.
2. Deprecate the old rocdl.max_flat_work_group_size attribute, which
was used in a tensorflow backend. Instead, use
rocdl.flat_work_group_size going forward to allow kernel generators to
specify the minimum and maximum work group sizes a kernel may be
launched with in one attribute, thus more closely matching the backend.
3. When translating from gpu.func to llvm.func within gpu-to-rocdl,
copy the known_block_size attribute as rocdl.reqd_work_group_size to
enable further translations to set the corresponding metadata on the
LLVM IR function. Also, set the rocdl.flat_work_group_size attribute
to ensure that the reqd_work_group_size metadata and the
amdgpu-flat-work-group-size metadata are consistent.
3b. Extend the ROCDL to LLVM IR translation to set the
!reqd_work_group_size metadata on LLVM functions.
Also update tests and add functions to the ROCDL dialect to ensure
attribute names are used consistently.
Depends on D139865
Reviewed By: antiagainst
Differential Revision: https://reviews.llvm.org/D139866
177 lines
7.1 KiB
C++
177 lines
7.1 KiB
C++
//===- ROCDLToLLVMIRTranslation.cpp - Translate ROCDL to LLVM IR ----------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements a translation between the MLIR ROCDL dialect and
|
|
// LLVM IR.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
|
|
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
|
|
#include "mlir/IR/BuiltinAttributes.h"
|
|
#include "mlir/IR/Operation.h"
|
|
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
|
|
|
|
#include "llvm/IR/IRBuilder.h"
|
|
#include "llvm/IR/IntrinsicsAMDGPU.h"
|
|
#include "llvm/IR/MDBuilder.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
|
|
using namespace mlir;
|
|
using namespace mlir::LLVM;
|
|
using mlir::LLVM::detail::createIntrinsicCall;
|
|
|
|
/// Emits a call to `intrinsic` (with no overload types and no arguments)
/// through `builder` and returns the resulting call instruction.
///
/// When `maybeRange` is non-null and holds at least two elements, the first
/// two are converted to 32-bit integers and attached to the call as `!range`
/// metadata (lower bound first, upper bound second). A null attribute, or one
/// with fewer than two elements, leaves the call without range metadata.
static llvm::Value *createIntrinsicCallWithRange(llvm::IRBuilderBase &builder,
                                                 llvm::Intrinsic::ID intrinsic,
                                                 DenseI32ArrayAttr maybeRange) {
  auto *inst = llvm::cast<llvm::CallInst>(
      createIntrinsicCall(builder, intrinsic, {}, {}));
  if (!maybeRange)
    return inst;

  ArrayRef<int32_t> bounds = maybeRange.asArrayRef();
  // A range needs both a lower and an upper bound. Indexing a shorter array
  // would read past its end, so such a (malformed) attribute is ignored
  // instead of being dereferenced.
  if (bounds.size() < 2)
    return inst;

  llvm::MDBuilder mdBuilder(builder.getContext());
  llvm::MDNode *range = mdBuilder.createRange(llvm::APInt(32, bounds[0]),
                                              llvm::APInt(32, bounds[1]));
  inst->setMetadata(llvm::LLVMContext::MD_range, range);
  return inst;
}
|
|
|
|
// Create a call to ROCm-Device-Library function
|
|
// Currently this routine will work only for calling ROCDL functions that
|
|
// take a single int32 argument. It is likely that the interface of this
|
|
// function will change to make it more generic.
|
|
static llvm::Value *createDeviceFunctionCall(llvm::IRBuilderBase &builder,
|
|
StringRef fnName, int parameter) {
|
|
llvm::Module *module = builder.GetInsertBlock()->getModule();
|
|
llvm::FunctionType *functionType = llvm::FunctionType::get(
|
|
llvm::Type::getInt64Ty(module->getContext()), // return type.
|
|
llvm::Type::getInt32Ty(module->getContext()), // parameter type.
|
|
false); // no variadic arguments.
|
|
llvm::Function *fn = dyn_cast<llvm::Function>(
|
|
module->getOrInsertFunction(fnName, functionType).getCallee());
|
|
llvm::Value *fnOp0 = llvm::ConstantInt::get(
|
|
llvm::Type::getInt32Ty(module->getContext()), parameter);
|
|
return builder.CreateCall(fn, ArrayRef<llvm::Value *>(fnOp0));
|
|
}
|
|
|
|
namespace {
|
|
/// Dialect interface used when exporting to LLVM IR: it converts ROCDL
/// operations and applies ROCDL function attributes to the LLVM IR functions
/// they were translated to.
class ROCDLDialectLLVMIRTranslationInterface
    : public LLVMTranslationDialectInterface {
public:
  using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface;

  /// Converts `op` to LLVM IR through `builder`, keeping translation state in
  /// `moduleTranslation`. The conversion code itself is pulled in from
  /// ROCDLConversions.inc; failure is returned when control falls through it.
  LogicalResult
  convertOperation(Operation *op, llvm::IRBuilderBase &builder,
                   LLVM::ModuleTranslation &moduleTranslation) const final {
    // NOTE(review): `opInst` is not referenced by any code visible here;
    // presumably the included conversions use this exact name - keep it.
    Operation &opInst = *op;
#include "mlir/Dialect/LLVMIR/ROCDLConversions.inc"

    return failure();
  }

  /// Applies one ROCDL attribute found on `op` to the LLVM IR function that
  /// `moduleTranslation` has recorded for it.
  ///
  /// Four attributes are recognized: the kernel marker, the legacy
  /// "rocdl.max_flat_work_group_size", the flat work group size and the
  /// required work group size. Each is checked independently; any other
  /// attribute is accepted without effect. Failure is returned when a
  /// recognized attribute is attached to something other than an llvm.func,
  /// or when its value does not have the attribute type that case expects.
  LogicalResult
  amendOperation(Operation *op, NamedAttribute attribute,
                 LLVM::ModuleTranslation &moduleTranslation) const final {
    if (attribute.getName() == ROCDL::ROCDLDialect::getKernelFuncAttrName() &&
        failed(markAsKernel(op, moduleTranslation)))
      return failure();

    // Override flat-work-group-size
    // TODO: update clients to rocdl.flat_work_group_size instead,
    // then remove this legacy case.
    if ("rocdl.max_flat_work_group_size" == attribute.getName() &&
        failed(applyMaxFlatWorkGroupSize(op, attribute, moduleTranslation)))
      return failure();

    if (ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName() ==
            attribute.getName() &&
        failed(applyFlatWorkGroupSize(op, attribute, moduleTranslation)))
      return failure();

    if (ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName() ==
            attribute.getName() &&
        failed(applyReqdWorkGroupSize(op, attribute, moduleTranslation)))
      return failure();

    return success();
  }

private:
  /// Sets up the LLVM IR function translated from `op` as a GPU kernel:
  ///  1. AMDGPU_KERNEL calling convention;
  ///  2. amdgpu-flat-work-group-size of "1,256" unless a value is already
  ///     present on the function - 256 is the default in clang;
  ///  3. amdgpu-implicitarg-num-bytes=56 (which must be set on OpenCL and HIP
  ///     kernels per Clang).
  /// Fails when `op` is not an llvm.func.
  static LogicalResult
  markAsKernel(Operation *op, LLVM::ModuleTranslation &moduleTranslation) {
    auto kernelOp = dyn_cast<LLVM::LLVMFuncOp>(op);
    if (!kernelOp)
      return failure();

    llvm::Function *kernel =
        moduleTranslation.lookupFunction(kernelOp.getName());
    kernel->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
    if (!kernel->hasFnAttribute("amdgpu-flat-work-group-size"))
      kernel->addFnAttr("amdgpu-flat-work-group-size", "1,256");
    kernel->addFnAttr("amdgpu-implicitarg-num-bytes", "56");
    return success();
  }

  /// Legacy case: turns the integer value N of `attribute` into the function
  /// attribute amdgpu-flat-work-group-size="1,N". Fails when `op` is not an
  /// llvm.func or the value is not an IntegerAttr.
  static LogicalResult
  applyMaxFlatWorkGroupSize(Operation *op, NamedAttribute attribute,
                            LLVM::ModuleTranslation &moduleTranslation) {
    auto funcOp = dyn_cast<LLVM::LLVMFuncOp>(op);
    if (!funcOp)
      return failure();
    auto maxSize = attribute.getValue().dyn_cast<IntegerAttr>();
    if (!maxSize)
      return failure();

    llvm::Function *target = moduleTranslation.lookupFunction(funcOp.getName());
    llvm::SmallString<8> sizeRange;
    llvm::raw_svector_ostream sizeRangeStream(sizeRange);
    sizeRangeStream << "1," << maxSize.getInt();
    target->addFnAttr("amdgpu-flat-work-group-size", sizeRange);
    return success();
  }

  /// Copies the string value of `attribute` verbatim into the function
  /// attribute amdgpu-flat-work-group-size. Fails when `op` is not an
  /// llvm.func or the value is not a StringAttr.
  static LogicalResult
  applyFlatWorkGroupSize(Operation *op, NamedAttribute attribute,
                         LLVM::ModuleTranslation &moduleTranslation) {
    auto funcOp = dyn_cast<LLVM::LLVMFuncOp>(op);
    if (!funcOp)
      return failure();
    auto sizeRangeAttr = attribute.getValue().dyn_cast<StringAttr>();
    if (!sizeRangeAttr)
      return failure();

    llvm::Function *target = moduleTranslation.lookupFunction(funcOp.getName());
    llvm::SmallString<8> sizeRange;
    sizeRange.append(sizeRangeAttr.getValue());
    target->addFnAttr("amdgpu-flat-work-group-size", sizeRange);
    return success();
  }

  /// Sets the "reqd_work_group_size" metadata of the translated function to a
  /// node holding one i32 constant per element of the attribute's array, in
  /// order. Fails when `op` is not an llvm.func or the value is not a
  /// DenseI32ArrayAttr.
  static LogicalResult
  applyReqdWorkGroupSize(Operation *op, NamedAttribute attribute,
                         LLVM::ModuleTranslation &moduleTranslation) {
    auto funcOp = dyn_cast<LLVM::LLVMFuncOp>(op);
    if (!funcOp)
      return failure();
    auto sizes = attribute.getValue().dyn_cast<DenseI32ArrayAttr>();
    if (!sizes)
      return failure();

    llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
    llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);
    SmallVector<llvm::Metadata *, 3> operands;
    for (int32_t size : sizes.asArrayRef())
      operands.push_back(
          llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(i32, size)));

    llvm::Function *target = moduleTranslation.lookupFunction(funcOp.getName());
    target->setMetadata("reqd_work_group_size",
                        llvm::MDNode::get(llvmContext, operands));
    return success();
  }
};
|
|
} // namespace
|
|
|
|
void mlir::registerROCDLDialectTranslation(DialectRegistry ®istry) {
|
|
registry.insert<ROCDL::ROCDLDialect>();
|
|
registry.addExtension(+[](MLIRContext *ctx, ROCDL::ROCDLDialect *dialect) {
|
|
dialect->addInterfaces<ROCDLDialectLLVMIRTranslationInterface>();
|
|
});
|
|
}
|
|
|
|
/// Convenience overload: fills a temporary registry through the
/// DialectRegistry overload and appends it to `context`.
void mlir::registerROCDLDialectTranslation(MLIRContext &context) {
  DialectRegistry rocdlRegistry;
  registerROCDLDialectTranslation(rocdlRegistry);
  context.appendDialectRegistry(rocdlRegistry);
}
|