//===- XeGPUPeepHoleOptimizer.cpp - XeGPU optimize block loads -*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
|
|
|
|
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/Dialect/XeGPU/uArch/uArchBase.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Types.h"
#include "mlir/IR/Value.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Debug.h"
#include <optional>
|
|
|
|
namespace mlir {
|
|
namespace xegpu {
|
|
#define GEN_PASS_DEF_XEGPUPEEPHOLEOPTIMIZER
|
|
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
|
|
} // namespace xegpu
|
|
} // namespace mlir
|
|
|
|
#define DEBUG_TYPE "xegpu-optimize-peephole"
|
|
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
|
|
|
|
using namespace mlir;
|
|
|
|
namespace {
|
|
|
|
/// Get the 2D lane data from a tensor desc type if it exists.
|
|
static std::optional<SmallVector<int64_t>>
|
|
getMaybeLaneData(xegpu::TensorDescType tdescType) {
|
|
auto layout = tdescType.getLayoutAttr();
|
|
if (!layout)
|
|
return std::nullopt;
|
|
auto laneData = layout.getEffectiveLaneDataAsInt();
|
|
if (laneData.size() != 2)
|
|
return std::nullopt;
|
|
return laneData;
|
|
}
|
|
|
|
/// Get the 2D lane layout from a tensor desc type if it exists.
|
|
static std::optional<SmallVector<int64_t>>
|
|
getMaybeLaneLayout(xegpu::TensorDescType tdescType) {
|
|
auto layout = tdescType.getLayoutAttr();
|
|
if (!layout)
|
|
return std::nullopt;
|
|
auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
|
|
if (laneLayout.size() != 2)
|
|
return std::nullopt;
|
|
return laneLayout;
|
|
}
|
|
|
|
/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 &&
|
|
/// lane[1] == 1), but inner lane data is not equal to [1, 1].
|
|
/// Example:
|
|
/// !xegpu.tensor_desc<16x16xf16,
|
|
/// #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>>
|
|
/// In this case, lane layout is transposed (from the usual [1, SG_SIZE] form)
|
|
/// indicating that this is a load that requires transpose effect. However,
|
|
/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements from
|
|
/// the inner dimension. We convert this to a optimized form by converting the
|
|
/// tensor_desc to i32 type such that lane data becomes [1, 1]. This makes the
|
|
/// later lowering easily use the load with transpose instruction.
|
|
static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout,
|
|
ArrayRef<int64_t> laneData) {
|
|
if (laneLayout.size() != 2 || laneData.size() != 2)
|
|
return false;
|
|
if (laneLayout[0] == 1 || laneLayout[1] != 1)
|
|
return false;
|
|
if (laneData[0] != 1 || laneData[1] == 1)
|
|
return false;
|
|
return true;
|
|
}
|
|
|
|
/// A tensor desc type can be optimized if its element type is less than 32 bits
|
|
/// and its layout can be optimized.
|
|
static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) {
|
|
// If the dtype is greater or equal to 32 bits, layout must be valid.
|
|
int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
|
|
if (elementTyBitwidth >= 32)
|
|
return false;
|
|
auto maybeLaneLayout = getMaybeLaneLayout(tdescType);
|
|
auto maybeLaneData = getMaybeLaneData(tdescType);
|
|
if (!maybeLaneData || !maybeLaneLayout)
|
|
return false;
|
|
return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData);
|
|
}
|
|
|
|
/// Check if a tensor desc type can be optimized for transpose, if so return the
|
|
/// new optimized tensor desc type with a valid transpose layout.
|
|
static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType,
|
|
const uArch *targetuArch) {
|
|
if (!canBeOptimizedForTranspose(tdescType))
|
|
return tdescType;
|
|
auto laneData = getMaybeLaneData(tdescType)
|
|
.value(); // Lane data must exist if we reach here.
|
|
int64_t innerLaneData = laneData[1];
|
|
int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth();
|
|
// Required shape is total shape of the vector result that this tensor desc
|
|
// must eventually load after adjusting for the new bitwidth and array
|
|
// length.
|
|
SmallVector<int64_t> requiredShape(tdescType.getShape());
|
|
requiredShape.back() =
|
|
requiredShape.back() * tdescType.getArrayLength() / innerLaneData;
|
|
int newBitWidth = elementTyBitwidth * innerLaneData;
|
|
Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth);
|
|
// Supported shape is the max transpose shape that can be supported by
|
|
// hardware that is less than or equal to required shape.
|
|
auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>(
|
|
targetuArch->getInstruction(InstructionKind::Subgroup2DBlockLoad));
|
|
auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount(
|
|
newElemTy, /** has transform */ false, /** has transpose */ true);
|
|
// If no HW params found, return the original type.
|
|
if (!maybeHWParams)
|
|
return tdescType;
|
|
auto [widths, heights, counts] = maybeHWParams.value();
|
|
// TODO: Currently we expect array length to be 1 for transpose case.
|
|
if (counts.size() != 1 || counts[0] != 1)
|
|
return tdescType;
|
|
int arrayLen = counts[0];
|
|
int supportedHeight =
|
|
xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights);
|
|
int supportedWidth =
|
|
xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths);
|
|
// If no supported height or width found, return the original type.
|
|
if (supportedHeight == -1 || supportedWidth == -1)
|
|
return tdescType;
|
|
|
|
SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth};
|
|
xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get(
|
|
tdescType.getContext(), tdescType.getLayoutAttr().getLaneLayout(),
|
|
DenseI32ArrayAttr::get(tdescType.getContext(), {1, 1}),
|
|
tdescType.getLayoutAttr().getOrder());
|
|
// Array length can not be larger than 1 for transpose case.
|
|
return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen,
|
|
tdescType.getBoundaryCheck(),
|
|
tdescType.getMemorySpace(), newLayout);
|
|
}
|
|
|
|
/// Helper to convert an OpFoldResult to Value.
|
|
static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc,
|
|
OpFoldResult ofr) {
|
|
std::optional<int64_t> mayBeInt = getConstantIntValue(ofr);
|
|
if (mayBeInt)
|
|
return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult();
|
|
return llvm::cast<Value>(ofr);
|
|
}
|
|
|
|
/// Helper to divide a Value by a constant integer.
|
|
static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc,
|
|
Value val, int64_t constant) {
|
|
// If the constant is a power of 2, use right shift for division.
|
|
if (llvm::isPowerOf2_64(constant)) {
|
|
int64_t shiftAmount = llvm::Log2_64(constant);
|
|
return arith::ShRUIOp::create(
|
|
rewriter, loc, val,
|
|
arith::ConstantIndexOp::create(rewriter, loc, shiftAmount)
|
|
.getResult())
|
|
.getResult();
|
|
}
|
|
auto constantOp =
|
|
arith::ConstantIndexOp::create(rewriter, loc, constant).getResult();
|
|
return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult();
|
|
}
|
|
|
|
/// This function takes a larger register block `data` and generates multiple
|
|
/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block
|
|
/// starting from `offsets`.
|
|
static Value generateLoads(ConversionPatternRewriter &rewriter,
|
|
TypedValue<VectorType> data,
|
|
SmallVector<OpFoldResult> offsets,
|
|
TypedValue<xegpu::TensorDescType> newTensorDesc,
|
|
xegpu::LoadNdOp origLoadOp) {
|
|
Location loc = data.getLoc();
|
|
assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp");
|
|
Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]);
|
|
Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]);
|
|
SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape());
|
|
// Compute the ratio between original shape and supported shape. We need to
|
|
// generate loads in this ratio arrangement.
|
|
auto shapeRatio = computeShapeRatio(data.getType().getShape(),
|
|
supportedShape)
|
|
.value(); // `ratio` must be defined if we reach here.
|
|
for (int64_t h = 0; h < shapeRatio[0]; ++h) {
|
|
for (int64_t w = 0; w < shapeRatio[1]; ++w) {
|
|
int64_t localOffsetDim0 = h * supportedShape[0];
|
|
int64_t localOffsetDim1 = w * supportedShape[1];
|
|
Value loadOffsetX = arith::AddIOp::create(
|
|
rewriter, loc, offsetDim0,
|
|
arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0)
|
|
.getResult());
|
|
Value loadOffsetY = arith::AddIOp::create(
|
|
rewriter, loc, offsetDim1,
|
|
arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1)
|
|
.getResult());
|
|
auto loadOp = xegpu::LoadNdOp::create(
|
|
rewriter, loc,
|
|
VectorType::get(supportedShape, data.getType().getElementType()),
|
|
newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY},
|
|
origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(),
|
|
origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(),
|
|
origLoadOp.getL3HintAttr(), origLoadOp.getLayoutAttr());
|
|
// Set the layout for the loadOp.
|
|
auto layoutAttr = newTensorDesc.getType().getLayoutAttr();
|
|
loadOp.setAnchorLayout(layoutAttr);
|
|
// Insert the loaded block into the right position in data.
|
|
auto insertOp = vector::InsertStridedSliceOp::create(
|
|
rewriter, loc, loadOp.getResult(), data,
|
|
ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1},
|
|
ArrayRef<int64_t>{1, 1});
|
|
// InsertOp must have the same layout as newTensorDesc.
|
|
xegpu::setTemporaryLayout(insertOp->getOpResult(0), layoutAttr);
|
|
data = insertOp.getResult();
|
|
}
|
|
}
|
|
return data;
|
|
}
|
|
|
|
/// Checks if a CreateNdDescOp can be optimized for transpose, if so creates a
|
|
/// new CreateNdDescOp with optimized tensor desc type. This involves extracting
|
|
/// the base pointer from the original memory source and adjusting the shape and
|
|
/// strides of the tensor desc to fit with the new optimized transpose layout.
|
|
class XeGPUCreateNdDescOpPattern final
|
|
: public OpConversionPattern<xegpu::CreateNdDescOp> {
|
|
public:
|
|
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
|
|
LogicalResult
|
|
matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
auto tdescTy = createNdOp.getType();
|
|
// Get the target uArch info.
|
|
auto chipStr = xegpu::getChipStr(createNdOp);
|
|
// Check if the chip is supported.
|
|
assert(
|
|
chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") &&
|
|
"Expecting target chip to be pvc or bmg for transpose optimization.");
|
|
const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value());
|
|
|
|
auto convertType = tryOptimize(tdescTy, targetuArch);
|
|
if (convertType == tdescTy)
|
|
return failure();
|
|
auto strides = createNdOp.getMixedStrides();
|
|
auto maybeConstInnerStride = getConstantIntValue(strides.back());
|
|
// Only row-major memrefs are expected for now.
|
|
if (!maybeConstInnerStride || *maybeConstInnerStride != 1)
|
|
return rewriter.notifyMatchFailure(
|
|
createNdOp, "Expecting row-major memref for transpose optimization.");
|
|
Value source = createNdOp.getSource();
|
|
auto optionalLaneData = getMaybeLaneData(tdescTy);
|
|
assert(optionalLaneData && "Expected 2D lane data");
|
|
auto laneData = optionalLaneData.value();
|
|
int64_t innerLaneData = laneData[1];
|
|
auto memrefType = dyn_cast<MemRefType>(source.getType());
|
|
// Inner dimension of the shape must be adjusted based on innerLaneData.
|
|
SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes());
|
|
modifiedShape.back() = divideByConstant(
|
|
rewriter, createNdOp.getLoc(),
|
|
convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()),
|
|
innerLaneData);
|
|
// Similarly, second to last stride must be adjusted.
|
|
assert(strides.size() >= 2 &&
|
|
"Expected at least 2 strides for CreateNdDescOp");
|
|
SmallVector<OpFoldResult> modifiedStrides(strides);
|
|
modifiedStrides[modifiedStrides.size() - 2] = divideByConstant(
|
|
rewriter, createNdOp.getLoc(),
|
|
convertToValue(rewriter, createNdOp.getLoc(),
|
|
modifiedStrides[modifiedStrides.size() - 2]),
|
|
innerLaneData);
|
|
|
|
// If the source is a static memref, we need to extract the pointer to
|
|
// base address.
|
|
if (memrefType && memrefType.hasStaticShape()) {
|
|
auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create(
|
|
rewriter, createNdOp.getLoc(), source);
|
|
source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(),
|
|
rewriter.getI64Type(),
|
|
extractOp.getResult())
|
|
.getResult();
|
|
}
|
|
// Create a new CreateNdDescOp with the modified shape and converted type.
|
|
auto newCreateNdDescOp = xegpu::CreateNdDescOp::create(
|
|
rewriter, createNdOp.getLoc(), convertType, source, modifiedShape,
|
|
modifiedStrides);
|
|
rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult());
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for
|
|
/// tranpose optimization. If so, rewrites the LoadNdOp to to align with the
|
|
/// adjusted tensor desc type. This can result in multiple LoadNdOps being
|
|
/// generated to fill in the original load shape.
|
|
class XeGPULoadNdDescOpPattern final
|
|
: public OpConversionPattern<xegpu::LoadNdOp> {
|
|
public:
|
|
using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern;
|
|
LogicalResult
|
|
matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
auto origTensorDescType = loadNdOp.getTensorDescType();
|
|
auto adaptorType =
|
|
cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType());
|
|
if (adaptorType == origTensorDescType)
|
|
return failure();
|
|
// Offsets must be adjusted based on innerLaneData.
|
|
auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value();
|
|
int64_t innerLaneData = laneData[1];
|
|
auto offsets = loadNdOp.getMixedOffsets();
|
|
if (offsets.empty())
|
|
return rewriter.notifyMatchFailure(loadNdOp,
|
|
"Expecting offsets in LoadNd");
|
|
SmallVector<OpFoldResult> modifiedOffsets(offsets);
|
|
modifiedOffsets.back() = divideByConstant(
|
|
rewriter, loadNdOp.getLoc(),
|
|
convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()),
|
|
innerLaneData);
|
|
// Get the 2D data shape of this loadNdOp in its original type including
|
|
// array length.
|
|
SmallVector<int64_t> origDataShape(origTensorDescType.getShape());
|
|
// Adjust the data shape based on innerLaneData.
|
|
origDataShape.back() /= innerLaneData;
|
|
// HW supported shape is the new tensor desc shape after conversion.
|
|
SmallVector<int64_t> hwSupportedShape(adaptorType.getShape());
|
|
VectorType origVectorType =
|
|
VectorType::get(origDataShape, adaptorType.getElementType());
|
|
Value data;
|
|
// Orig data shape is 3D for the array length case.
|
|
if (origTensorDescType.getArrayLength() > 1) {
|
|
SmallVector<Value> arraySlices;
|
|
for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) {
|
|
Value slice = arith::ConstantOp::create(
|
|
rewriter, loadNdOp->getLoc(), origVectorType,
|
|
rewriter.getZeroAttr(origVectorType));
|
|
// Increase the Y offset for each array slice.
|
|
Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(),
|
|
modifiedOffsets.back());
|
|
modifiedOffsets.back() =
|
|
arith::AddIOp::create(
|
|
rewriter, loadNdOp->getLoc(), offsetY,
|
|
arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(),
|
|
i * origDataShape[1])
|
|
.getResult())
|
|
.getResult();
|
|
slice = generateLoads(
|
|
rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets,
|
|
cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
|
|
loadNdOp);
|
|
// BitCast back to original load shape without array length.
|
|
auto bitcastType = VectorType::get(origTensorDescType.getShape(),
|
|
origTensorDescType.getElementType());
|
|
auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
|
|
bitcastType, slice);
|
|
// BitCastOp must have the same layout as the original loadNdOp.
|
|
xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
|
|
origTensorDescType.getLayoutAttr());
|
|
arraySlices.push_back(bitCastOp.getResult());
|
|
}
|
|
rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices});
|
|
return success();
|
|
}
|
|
data = arith::ConstantOp::create(
|
|
rewriter, loadNdOp->getLoc(),
|
|
VectorType::get(origDataShape, adaptorType.getElementType()),
|
|
rewriter.getZeroAttr(origVectorType));
|
|
data = generateLoads(
|
|
rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets,
|
|
cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()),
|
|
loadNdOp);
|
|
auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(),
|
|
loadNdOp.getType(), data);
|
|
// BitCastOp must have the same layout as the original loadNdOp.
|
|
xegpu::setTemporaryLayout(bitCastOp->getOpResult(0),
|
|
origTensorDescType.getLayoutAttr());
|
|
rewriter.replaceOp(loadNdOp, bitCastOp);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Vector ExtractOp must be processed if the original tensor desc type has
|
|
/// array length greater than 1. In this case, the LoadNdOp is replaced with
|
|
/// multiple LoadNdOps for each array slice making the extraction unnecessary.
|
|
/// In this case, we simply remove the ExtractOp.
|
|
class VectorExtractOpPattern final
|
|
: public OpConversionPattern<vector::ExtractOp> {
|
|
public:
|
|
using OpConversionPattern<vector::ExtractOp>::OpConversionPattern;
|
|
LogicalResult
|
|
matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
// Check if the source of the extraction is split to multiple values.
|
|
if (adaptor.getSource().size() == 1)
|
|
return failure();
|
|
auto mixedPos = extractOp.getMixedPosition();
|
|
if (mixedPos.size() != 1)
|
|
return failure();
|
|
auto mayBeInt = getConstantIntValue(mixedPos[0]);
|
|
if (!mayBeInt)
|
|
return failure();
|
|
rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Performs a reduction over 2 dimensions by decomposing it into two 1D
|
|
/// reductions ordered based on layout to minimize cross-lane communication.
|
|
class MultiRed2dOpPattern
|
|
: public OpConversionPattern<vector::MultiDimReductionOp> {
|
|
using OpConversionPattern::OpConversionPattern;
|
|
LogicalResult
|
|
matchAndRewrite(vector::MultiDimReductionOp reductionOp, OpAdaptor adaptor,
|
|
ConversionPatternRewriter &rewriter) const override {
|
|
auto sourceVecType = reductionOp.getSourceVectorType();
|
|
if (reductionOp.getReductionDims().size() != 2)
|
|
return rewriter.notifyMatchFailure(reductionOp, "Expected 2D reduction");
|
|
auto resLayout = xegpu::getDistributeLayoutAttr(reductionOp.getResult());
|
|
// Retrieve and order dims for 1D decomposition (prefer intra-lane first).
|
|
auto dims = llvm::to_vector(reductionOp.getReductionDims());
|
|
auto [intraLaneDim, crossLaneDim] = getReductionDimOrder(dims, resLayout);
|
|
// Order does not matter
|
|
if (intraLaneDim == -1 || crossLaneDim == -1) {
|
|
intraLaneDim = dims[0];
|
|
crossLaneDim = dims[1];
|
|
}
|
|
auto loc = reductionOp.getLoc();
|
|
auto acc = reductionOp.getAcc();
|
|
|
|
// If the result is scalar after reduction, look for consumer
|
|
// convert_layout op and remove it. The layout propagation pass will
|
|
// re-install it properly after the decomposition.
|
|
Type resultType = reductionOp.getResult().getType();
|
|
if (resultType.isIntOrFloat()) {
|
|
for (auto &use : reductionOp.getResult().getUses()) {
|
|
if (auto convertLayoutOp =
|
|
llvm::dyn_cast<xegpu::ConvertLayoutOp>(use.getOwner())) {
|
|
rewriter.replaceOp(convertLayoutOp, reductionOp.getResult());
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
SmallVector<int64_t> accShape(sourceVecType.getShape());
|
|
accShape.erase(accShape.begin() + intraLaneDim);
|
|
Type eTy = sourceVecType.getElementType();
|
|
Value constNeutralVal = xegpu::createReductionNeutralValue(
|
|
rewriter, loc, VectorType::get(accShape, eTy), reductionOp.getKind());
|
|
|
|
Value intraLaneReduced = vector::MultiDimReductionOp::create(
|
|
rewriter, loc, reductionOp.getKind(), reductionOp.getSource(),
|
|
constNeutralVal, ArrayRef<int64_t>(intraLaneDim));
|
|
|
|
// Adjust crossLaneDim after the first reduction.
|
|
if (crossLaneDim > intraLaneDim)
|
|
crossLaneDim -= 1;
|
|
Value crossLaneReduced = vector::MultiDimReductionOp::create(
|
|
rewriter, loc, reductionOp.getKind(), intraLaneReduced, acc,
|
|
ArrayRef<int64_t>(crossLaneDim));
|
|
assert(crossLaneReduced.getType() == reductionOp.getResult().getType() &&
|
|
"Type mismatch");
|
|
rewriter.replaceOp(reductionOp, crossLaneReduced);
|
|
return success();
|
|
}
|
|
|
|
private:
|
|
std::pair<int64_t, int64_t>
|
|
getReductionDimOrder(ArrayRef<int64_t> reductionDims,
|
|
xegpu::DistributeLayoutAttr layout) const {
|
|
assert(layout.isForSubgroup() && "Must know the lane layout");
|
|
assert(reductionDims.size() == 2 && "Expected 2D reduction");
|
|
int64_t intra, cross = -1;
|
|
xegpu::LayoutAttr layoutAttr = dyn_cast<xegpu::LayoutAttr>(layout);
|
|
if (auto layoutSliceAttr = dyn_cast<xegpu::SliceAttr>(layout))
|
|
layoutAttr =
|
|
dyn_cast<xegpu::LayoutAttr>(layoutSliceAttr.flatten().getParent());
|
|
assert(layoutAttr);
|
|
SmallVector<int64_t> laneLayout = layoutAttr.getEffectiveLaneLayoutAsInt();
|
|
|
|
assert(laneLayout.size() && "Expected a non-empty layout");
|
|
// try to pick a dim that does not communicate
|
|
for (auto dim : reductionDims) {
|
|
if (laneLayout[dim] == 1)
|
|
intra = dim;
|
|
else
|
|
cross = dim;
|
|
}
|
|
return {intra, cross};
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
/// Registers the peephole optimizer conversion patterns defined in this file
/// (create_nd_desc, load_nd, vector.extract and 2D multi-reduction rewrites)
/// with `patterns`.
void xegpu::populateXeGPUPeepHoleOptimizerPatterns(
    RewritePatternSet &patterns) {
  MLIRContext *ctx = patterns.getContext();
  patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern>(ctx);
  patterns.add<VectorExtractOpPattern, MultiRed2dOpPattern>(ctx);
}
|
|
|
|
namespace {
|
|
|
|
struct XeGPUPeepHoleOptimizerPass final
|
|
: public xegpu::impl::XeGPUPeepHoleOptimizerBase<
|
|
XeGPUPeepHoleOptimizerPass> {
|
|
void runOnOperation() override {
|
|
MLIRContext &context = getContext();
|
|
TypeConverter converter;
|
|
RewritePatternSet patterns(&context);
|
|
ConversionTarget target(context);
|
|
|
|
// This pass is only meant for PVC and BMG targets. If unsupported target
|
|
// is found, exit early.
|
|
bool isTargetSupported = false;
|
|
getOperation()->walk([&](gpu::GPUFuncOp funcOp) {
|
|
auto chipStr = xegpu::getChipStr(funcOp);
|
|
if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg"))
|
|
isTargetSupported = true;
|
|
});
|
|
|
|
if (!isTargetSupported) {
|
|
DBGS() << "XeGPUPeepHoleOptimizerPass only supports PVC and BMG targets."
|
|
<< "\n";
|
|
return;
|
|
}
|
|
|
|
// CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be
|
|
// converted.
|
|
target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>(
|
|
[&](xegpu::CreateNdDescOp createNdOp) {
|
|
return !canBeOptimizedForTranspose(createNdOp.getType());
|
|
});
|
|
target.addDynamicallyLegalOp<xegpu::LoadNdOp>(
|
|
[&](xegpu::LoadNdOp loadNdOp) {
|
|
return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType());
|
|
});
|
|
// Vector ExtractOps can have optimizable layouts if they extract from
|
|
// LoadNdOps with array length greater than 1. These ExtractOps must be
|
|
// converted.
|
|
target.addDynamicallyLegalOp<vector::ExtractOp>(
|
|
[&](vector::ExtractOp extractOp) {
|
|
auto layout = xegpu::getTemporaryLayout(
|
|
dyn_cast<OpResult>(extractOp.getResult()));
|
|
if (!layout)
|
|
return true;
|
|
auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
|
|
auto laneData = layout.getEffectiveLaneDataAsInt();
|
|
return !canBeOptimizedForTranspose(laneLayout, laneData);
|
|
});
|
|
|
|
target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
|
|
[=](Operation *op) -> bool {
|
|
auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
|
|
if (!layout || !layout.isForSubgroup())
|
|
return true;
|
|
if (auto reductionOp = dyn_cast<vector::MultiDimReductionOp>(op))
|
|
return reductionOp.getReductionDims().size() != 2;
|
|
return true;
|
|
});
|
|
|
|
converter.addConversion([](Type type) { return type; });
|
|
|
|
target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect,
|
|
vector::VectorDialect>();
|
|
scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
|
|
target);
|
|
xegpu::populateXeGPUPeepHoleOptimizerPatterns(patterns);
|
|
if (failed(applyPartialConversion(getOperation(), target,
|
|
std::move(patterns)))) {
|
|
DBGS() << "Optimize block loads pass failed.\n";
|
|
return signalPassFailure();
|
|
}
|
|
|
|
// Apply folding for cleaning up IR.
|
|
MLIRContext *ctx = &getContext();
|
|
RewritePatternSet emptyPatterns(ctx);
|
|
(void)applyPatternsGreedily(getOperation(), std::move(emptyPatterns));
|
|
|
|
// Remove the temporary layout after all patterns are applied.
|
|
getOperation()->walk([](Operation *op) {
|
|
SmallVector<StringAttr> attrsToRemove;
|
|
for (auto namedAttr : op->getDiscardableAttrs()) {
|
|
if (isa<xegpu::DistributeLayoutAttr>(namedAttr.getValue()))
|
|
attrsToRemove.push_back(namedAttr.getName());
|
|
}
|
|
for (auto attrName : attrsToRemove)
|
|
op->removeDiscardableAttr(attrName);
|
|
});
|
|
}
|
|
};
|
|
|
|
} // namespace
|