This commit moves FuncOp out of the builtin dialect, and into the Func dialect. This move has been planned in some capacity from the moment we made FuncOp an operation (years ago). This commit handles the functional aspects of the move, but various aspects are left untouched to ease migration: func::FuncOp is re-exported into mlir to reduce the actual API churn, the assembly format still accepts the unqualified `func`. These temporary measures will remain for a little while to simplify migration before being removed. Differential Revision: https://reviews.llvm.org/D121266
360 lines
16 KiB
C++
360 lines
16 KiB
C++
//===- LowerGpuOpsToNVVMOps.cpp - MLIR GPU to NVVM lowering passes --------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements a pass to generate NVVMIR operations for higher-level
|
|
// GPU operations.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
|
|
|
|
#include "mlir/Conversion/ArithmeticToLLVM/ArithmeticToLLVM.h"
|
|
#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
|
|
#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
|
|
#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
|
|
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
|
|
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
|
|
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
|
|
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
|
|
#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
|
|
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
|
#include "mlir/Dialect/GPU/GPUDialect.h"
|
|
#include "mlir/Dialect/GPU/Passes.h"
|
|
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
|
|
#include "mlir/Dialect/Math/IR/Math.h"
|
|
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
|
#include "mlir/IR/BlockAndValueMapping.h"
|
|
#include "mlir/Transforms/DialectConversion.h"
|
|
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
|
#include "llvm/Support/FormatVariadic.h"
|
|
|
|
#include "../GPUCommon/GPUOpsLowering.h"
|
|
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
|
|
#include "../GPUCommon/OpToFuncCallLowering.h"
|
|
#include "../PassDetail.h"
|
|
|
|
using namespace mlir;
|
|
|
|
namespace {
|
|
|
|
/// NVVM memory space identifiers.
|
|
enum NVVMMemorySpace {
|
|
/// Global memory space identifier.
|
|
kGlobalMemorySpace = 1,
|
|
/// Shared memory space identifier.
|
|
kSharedMemorySpace = 3
|
|
};
|
|
|
|
/// Convert gpu dialect shfl mode enum to the equivalent nvvm one.
|
|
static NVVM::ShflKind convertShflKind(gpu::ShuffleMode mode) {
|
|
switch (mode) {
|
|
case gpu::ShuffleMode::XOR:
|
|
return NVVM::ShflKind::bfly;
|
|
case gpu::ShuffleMode::UP:
|
|
return NVVM::ShflKind::up;
|
|
case gpu::ShuffleMode::DOWN:
|
|
return NVVM::ShflKind::down;
|
|
case gpu::ShuffleMode::IDX:
|
|
return NVVM::ShflKind::idx;
|
|
}
|
|
llvm_unreachable("unknown shuffle mode");
|
|
}
|
|
|
|
struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
|
|
using ConvertOpToLLVMPattern<gpu::ShuffleOp>::ConvertOpToLLVMPattern;
|
|
|
|
/// Lowers a shuffle to the corresponding NVVM op.
|
|
///
|
|
/// Convert the `width` argument into an activeMask (a bitmask which specifies
|
|
/// which threads participate in the shuffle) and a maskAndClamp (specifying
|
|
/// the highest lane which participates in the shuffle).
|
|
///
|
|
/// %one = llvm.constant(1 : i32) : i32
|
|
/// %minus_one = llvm.constant(-1 : i32) : i32
|
|
/// %thirty_two = llvm.constant(32 : i32) : i32
|
|
/// %num_lanes = llvm.sub %thirty_two, %width : i32
|
|
/// %active_mask = llvm.lshr %minus_one, %num_lanes : i32
|
|
/// %mask_and_clamp = llvm.sub %width, %one : i32
|
|
/// %shfl = nvvm.shfl.sync.bfly %active_mask, %value, %offset,
|
|
/// %mask_and_clamp : !llvm<"{ float, i1 }">
|
|
/// %shfl_value = llvm.extractvalue %shfl[0 : index] :
|
|
/// !llvm<"{ float, i1 }">
|
|
/// %shfl_pred = llvm.extractvalue %shfl[1 : index] :
|
|
/// !llvm<"{ float, i1 }">
|
|
LogicalResult
|
|
matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
Location loc = op->getLoc();
|
|
|
|
auto valueTy = adaptor.value().getType();
|
|
auto int32Type = IntegerType::get(rewriter.getContext(), 32);
|
|
auto predTy = IntegerType::get(rewriter.getContext(), 1);
|
|
auto resultTy = LLVM::LLVMStructType::getLiteral(rewriter.getContext(),
|
|
{valueTy, predTy});
|
|
|
|
Value one = rewriter.create<LLVM::ConstantOp>(
|
|
loc, int32Type, rewriter.getI32IntegerAttr(1));
|
|
Value minusOne = rewriter.create<LLVM::ConstantOp>(
|
|
loc, int32Type, rewriter.getI32IntegerAttr(-1));
|
|
Value thirtyTwo = rewriter.create<LLVM::ConstantOp>(
|
|
loc, int32Type, rewriter.getI32IntegerAttr(32));
|
|
Value numLeadInactiveLane = rewriter.create<LLVM::SubOp>(
|
|
loc, int32Type, thirtyTwo, adaptor.width());
|
|
// Bit mask of active lanes: `(-1) >> (32 - activeWidth)`.
|
|
Value activeMask = rewriter.create<LLVM::LShrOp>(loc, int32Type, minusOne,
|
|
numLeadInactiveLane);
|
|
Value maskAndClamp;
|
|
if (op.mode() == gpu::ShuffleMode::UP) {
|
|
// Clamp lane: `32 - activeWidth`
|
|
maskAndClamp = numLeadInactiveLane;
|
|
} else {
|
|
// Clamp lane: `activeWidth - 1`
|
|
maskAndClamp =
|
|
rewriter.create<LLVM::SubOp>(loc, int32Type, adaptor.width(), one);
|
|
}
|
|
|
|
auto returnValueAndIsValidAttr = rewriter.getUnitAttr();
|
|
Value shfl = rewriter.create<NVVM::ShflOp>(
|
|
loc, resultTy, activeMask, adaptor.value(), adaptor.offset(),
|
|
maskAndClamp, convertShflKind(op.mode()), returnValueAndIsValidAttr);
|
|
Value shflValue = rewriter.create<LLVM::ExtractValueOp>(
|
|
loc, valueTy, shfl, rewriter.getIndexArrayAttr(0));
|
|
Value isActiveSrcLane = rewriter.create<LLVM::ExtractValueOp>(
|
|
loc, predTy, shfl, rewriter.getIndexArrayAttr(1));
|
|
|
|
rewriter.replaceOp(op, {shflValue, isActiveSrcLane});
|
|
return success();
|
|
}
|
|
};
|
|
|
|
struct GPUAsyncCopyLowering
|
|
: public ConvertOpToLLVMPattern<gpu::DeviceAsyncCopyOp> {
|
|
using ConvertOpToLLVMPattern<gpu::DeviceAsyncCopyOp>::ConvertOpToLLVMPattern;
|
|
|
|
LogicalResult
|
|
matchAndRewrite(gpu::DeviceAsyncCopyOp op, OpAdaptor adaptor,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
Location loc = op->getLoc();
|
|
auto dstMemrefType = op.dst().getType().cast<MemRefType>();
|
|
Value dstPtr = getStridedElementPtr(loc, dstMemrefType, adaptor.dst(),
|
|
adaptor.dstIndices(), rewriter);
|
|
auto i8Ty = IntegerType::get(op.getContext(), 8);
|
|
auto dstPointerType =
|
|
LLVM::LLVMPointerType::get(i8Ty, dstMemrefType.getMemorySpaceAsInt());
|
|
dstPtr = rewriter.create<LLVM::BitcastOp>(loc, dstPointerType, dstPtr);
|
|
|
|
auto srcMemrefType = op.src().getType().cast<MemRefType>();
|
|
|
|
Value scrPtr = getStridedElementPtr(loc, srcMemrefType, adaptor.src(),
|
|
adaptor.srcIndices(), rewriter);
|
|
auto srcPointerType =
|
|
LLVM::LLVMPointerType::get(i8Ty, srcMemrefType.getMemorySpaceAsInt());
|
|
scrPtr = rewriter.create<LLVM::BitcastOp>(loc, srcPointerType, scrPtr);
|
|
// Intrinsics takes a global pointer so we need an address space cast.
|
|
auto srcPointerGlobalType =
|
|
LLVM::LLVMPointerType::get(i8Ty, NVVMMemorySpace::kGlobalMemorySpace);
|
|
scrPtr = rewriter.create<LLVM::AddrSpaceCastOp>(loc, srcPointerGlobalType,
|
|
scrPtr);
|
|
int64_t numElements = adaptor.numElements().getZExtValue();
|
|
int64_t sizeInBytes =
|
|
(dstMemrefType.getElementTypeBitWidth() / 8) * numElements;
|
|
rewriter.create<NVVM::CpAsyncOp>(loc, dstPtr, scrPtr,
|
|
rewriter.getI32IntegerAttr(sizeInBytes));
|
|
|
|
// Drop the result token.
|
|
Value zero = rewriter.create<LLVM::ConstantOp>(
|
|
op->getLoc(), IntegerType::get(op.getContext(), 32),
|
|
rewriter.getI32IntegerAttr(0));
|
|
rewriter.replaceOp(op, zero);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
struct GPUAsyncCreateGroupLowering
|
|
: public ConvertOpToLLVMPattern<gpu::DeviceAsyncCreateGroupOp> {
|
|
using ConvertOpToLLVMPattern<
|
|
gpu::DeviceAsyncCreateGroupOp>::ConvertOpToLLVMPattern;
|
|
|
|
LogicalResult
|
|
matchAndRewrite(gpu::DeviceAsyncCreateGroupOp op, OpAdaptor adaptor,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
rewriter.create<NVVM::CpAsyncCommitGroupOp>(op.getLoc());
|
|
// Drop the result token.
|
|
Value zero = rewriter.create<LLVM::ConstantOp>(
|
|
op->getLoc(), IntegerType::get(op.getContext(), 32),
|
|
rewriter.getI32IntegerAttr(0));
|
|
rewriter.replaceOp(op, zero);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
struct GPUAsyncWaitLowering
|
|
: public ConvertOpToLLVMPattern<gpu::DeviceAsyncWaitOp> {
|
|
using ConvertOpToLLVMPattern<gpu::DeviceAsyncWaitOp>::ConvertOpToLLVMPattern;
|
|
|
|
LogicalResult
|
|
matchAndRewrite(gpu::DeviceAsyncWaitOp op, OpAdaptor adaptor,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
// If numGroup is not present pick 0 as a conservative correct value.
|
|
int32_t numGroups = adaptor.numGroups() ? *adaptor.numGroups() : 0;
|
|
rewriter.create<NVVM::CpAsyncWaitGroupOp>(op.getLoc(), numGroups);
|
|
rewriter.eraseOp(op);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Import the GPU Ops to NVVM Patterns.
|
|
#include "GPUToNVVM.cpp.inc"
|
|
|
|
/// A pass that replaces all occurrences of GPU device operations with their
|
|
/// corresponding NVVM equivalent.
|
|
///
|
|
/// This pass only handles device code and is not meant to be run on GPU host
|
|
/// code.
|
|
struct LowerGpuOpsToNVVMOpsPass
|
|
: public ConvertGpuOpsToNVVMOpsBase<LowerGpuOpsToNVVMOpsPass> {
|
|
LowerGpuOpsToNVVMOpsPass() = default;
|
|
LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
|
|
this->indexBitwidth = indexBitwidth;
|
|
}
|
|
|
|
void runOnOperation() override {
|
|
gpu::GPUModuleOp m = getOperation();
|
|
|
|
/// Customize the bitwidth used for the device side index computations.
|
|
LowerToLLVMOptions options(
|
|
m.getContext(),
|
|
DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
|
|
options.emitCWrappers = true;
|
|
if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
|
|
options.overrideIndexBitwidth(indexBitwidth);
|
|
|
|
/// MemRef conversion for GPU to NVVM lowering. The GPU dialect uses memory
|
|
/// space 5 for private memory attributions, but NVVM represents private
|
|
/// memory allocations as local `alloca`s in the default address space. This
|
|
/// converter drops the private memory space to support the use case above.
|
|
LLVMTypeConverter converter(m.getContext(), options);
|
|
converter.addConversion([&](MemRefType type) -> Optional<Type> {
|
|
if (type.getMemorySpaceAsInt() !=
|
|
gpu::GPUDialect::getPrivateAddressSpace())
|
|
return llvm::None;
|
|
return converter.convertType(MemRefType::Builder(type).setMemorySpace(0));
|
|
});
|
|
/// device-side async tokens cannot be materialized in nvvm. We just convert
|
|
/// them to a dummy i32 type in order to easily drop them during conversion.
|
|
converter.addConversion([&](gpu::DeviceAsyncTokenType type) -> Type {
|
|
return converter.convertType(IntegerType::get(type.getContext(), 32));
|
|
});
|
|
// Lowering for MMAMatrixType.
|
|
converter.addConversion([&](gpu::MMAMatrixType type) -> Type {
|
|
return convertMMAToLLVMType(type);
|
|
});
|
|
RewritePatternSet patterns(m.getContext());
|
|
RewritePatternSet llvmPatterns(m.getContext());
|
|
|
|
// Apply in-dialect lowering first. In-dialect lowering will replace ops
|
|
// which need to be lowered further, which is not supported by a single
|
|
// conversion pass.
|
|
populateGpuRewritePatterns(patterns);
|
|
(void)applyPatternsAndFoldGreedily(m, std::move(patterns));
|
|
|
|
arith::populateArithmeticToLLVMConversionPatterns(converter, llvmPatterns);
|
|
cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
|
|
populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
|
|
populateMemRefToLLVMConversionPatterns(converter, llvmPatterns);
|
|
populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
|
|
populateGpuWMMAToNVVMConversionPatterns(converter, llvmPatterns);
|
|
LLVMConversionTarget target(getContext());
|
|
configureGpuToNVVMConversionLegality(target);
|
|
if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
|
|
signalPassFailure();
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
|
|
target.addIllegalOp<FuncOp>();
|
|
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
|
|
target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
|
|
target.addIllegalDialect<gpu::GPUDialect>();
|
|
target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::Exp2Op, LLVM::FAbsOp,
|
|
LLVM::FCeilOp, LLVM::FFloorOp, LLVM::LogOp, LLVM::Log10Op,
|
|
LLVM::Log2Op, LLVM::PowOp, LLVM::SinOp, LLVM::SqrtOp>();
|
|
|
|
// TODO: Remove once we support replacing non-root ops.
|
|
target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
|
|
}
|
|
|
|
void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
|
|
RewritePatternSet &patterns) {
|
|
populateWithGenerated(patterns);
|
|
patterns
|
|
.add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
|
|
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
|
|
GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
|
|
NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
|
|
GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
|
|
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
|
|
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
|
|
NVVM::GridDimYOp, NVVM::GridDimZOp>,
|
|
GPUShuffleOpLowering, GPUReturnOpLowering>(converter);
|
|
|
|
// Explicitly drop memory space when lowering private memory
|
|
// attributions since NVVM models it as `alloca`s in the default
|
|
// memory space and does not support `alloca`s with addrspace(5).
|
|
patterns.add<GPUFuncOpLowering>(
|
|
converter, /*allocaAddrSpace=*/0,
|
|
StringAttr::get(&converter.getContext(),
|
|
NVVM::NVVMDialect::getKernelFuncAttrName()));
|
|
|
|
patterns.add<OpToFuncCallLowering<math::AbsOp>>(converter, "__nv_fabsf",
|
|
"__nv_fabs");
|
|
patterns.add<OpToFuncCallLowering<math::AtanOp>>(converter, "__nv_atanf",
|
|
"__nv_atan");
|
|
patterns.add<OpToFuncCallLowering<math::Atan2Op>>(converter, "__nv_atan2f",
|
|
"__nv_atan2");
|
|
patterns.add<OpToFuncCallLowering<math::CeilOp>>(converter, "__nv_ceilf",
|
|
"__nv_ceil");
|
|
patterns.add<OpToFuncCallLowering<math::CosOp>>(converter, "__nv_cosf",
|
|
"__nv_cos");
|
|
patterns.add<OpToFuncCallLowering<math::ExpOp>>(converter, "__nv_expf",
|
|
"__nv_exp");
|
|
patterns.add<OpToFuncCallLowering<math::Exp2Op>>(converter, "__nv_exp2f",
|
|
"__nv_exp2");
|
|
patterns.add<OpToFuncCallLowering<math::ExpM1Op>>(converter, "__nv_expm1f",
|
|
"__nv_expm1");
|
|
patterns.add<OpToFuncCallLowering<math::FloorOp>>(converter, "__nv_floorf",
|
|
"__nv_floor");
|
|
patterns.add<OpToFuncCallLowering<math::LogOp>>(converter, "__nv_logf",
|
|
"__nv_log");
|
|
patterns.add<OpToFuncCallLowering<math::Log1pOp>>(converter, "__nv_log1pf",
|
|
"__nv_log1p");
|
|
patterns.add<OpToFuncCallLowering<math::Log10Op>>(converter, "__nv_log10f",
|
|
"__nv_log10");
|
|
patterns.add<OpToFuncCallLowering<math::Log2Op>>(converter, "__nv_log2f",
|
|
"__nv_log2");
|
|
patterns.add<OpToFuncCallLowering<math::PowFOp>>(converter, "__nv_powf",
|
|
"__nv_pow");
|
|
patterns.add<OpToFuncCallLowering<math::RsqrtOp>>(converter, "__nv_rsqrtf",
|
|
"__nv_rsqrt");
|
|
patterns.add<OpToFuncCallLowering<math::SinOp>>(converter, "__nv_sinf",
|
|
"__nv_sin");
|
|
patterns.add<OpToFuncCallLowering<math::SqrtOp>>(converter, "__nv_sqrtf",
|
|
"__nv_sqrt");
|
|
patterns.add<OpToFuncCallLowering<math::TanhOp>>(converter, "__nv_tanhf",
|
|
"__nv_tanh");
|
|
patterns.add<GPUAsyncCopyLowering, GPUAsyncCreateGroupLowering,
|
|
GPUAsyncWaitLowering>(converter);
|
|
}
|
|
|
|
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
|
|
mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth) {
|
|
return std::make_unique<LowerGpuOpsToNVVMOpsPass>(indexBitwidth);
|
|
}
|