This PR refactors layout propagation into two distinct components: result/anchor layout setup and source layout inference from the result. For operations that require a specific result layout due to semantic or hardware constraints, the propagation logic explicitly sets up the result or anchor layout. Otherwise, it infers the source layout from the backward-propagated consumer layout. The result or anchor layout may differ from the backward-propagated consumer layout; any such discrepancies are resolved via the existing layout-conflict mechanism. **This PR introduces the following utility functions:** Source layout inference: > inferBroadcastSourceLayout() > inferMultiReductionSourceLayout() > inferBitCastSourceLayout() > inferShapeCastSourceLayout() > inferInsertStridedSliceSourceLayout() Result / anchor layout setup: > setupMultiReductionResultLayout() > setupBitCastResultLayout() > setupInsertStridedSliceResultLayout() > setupLoadMatrixAnchorLayout() > setupStoreMatrixAnchorLayout() > setupLoadGatherAnchorLayout() > setupStoreScatterAnchorLayout() Some of the subgroup-distribution-related code changes have been split out into a separate PR: https://github.com/llvm/llvm-project/pull/179018/changes.
444 lines
17 KiB
C++
444 lines
17 KiB
C++
//===---- XeGPUBlocking.cpp ---- XeGPU Blocking Pass ----------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
|
|
|
|
#include "mlir/Dialect/Index/IR/IndexDialect.h"
|
|
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
|
|
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
|
|
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
|
|
#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
|
|
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
|
|
#include "mlir/Interfaces/LoopLikeInterface.h"
|
|
#include "mlir/Pass/PassManager.h"
|
|
#include "mlir/Transforms/DialectConversion.h"
|
|
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/Support/DebugLog.h"
|
|
|
|
namespace mlir {
|
|
namespace xegpu {
|
|
#define GEN_PASS_DEF_XEGPUBLOCKING
|
|
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
|
|
} // namespace xegpu
|
|
} // namespace mlir
|
|
|
|
#define DEBUG_TYPE "xegpu-blocking"
|
|
|
|
using namespace mlir;
|
|
|
|
namespace {
|
|
|
|
// reslove the unrealized conversion cast ops generated when doing SCF
|
|
// Structural Type Conversion. It will have two formats, N:1 vector
|
|
// cast and 1:N vector cast. vector::insert_strided_slice ops will be
|
|
// used for the first case, and vector::extract_strided_slice ops will be
|
|
// used for the second case.
|
|
static void
|
|
resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
|
|
ValueRange inputs = castOp.getInputs();
|
|
ValueRange outputs = castOp.getOutputs();
|
|
|
|
auto hasIdenticalVectorTypes = [](ValueRange values) {
|
|
auto types = values.getTypes();
|
|
return llvm::all_of(types, [&](Type type) {
|
|
return isa<VectorType>(type) && type == types.front();
|
|
});
|
|
};
|
|
|
|
// We only interest in the case where all inputs and outputs have the
|
|
// identical VectorTypes
|
|
if (!hasIdenticalVectorTypes(inputs) || !hasIdenticalVectorTypes(outputs)) {
|
|
LDBG() << "skip unrealized conversion cast op not emulating pack/unpack.";
|
|
return;
|
|
}
|
|
|
|
VectorType outputTy = dyn_cast<VectorType>(outputs[0].getType());
|
|
OpBuilder builder(castOp);
|
|
if (inputs.size() > 1 && outputs.size() == 1) {
|
|
// the castOp is emulating an unpack op
|
|
ArrayRef<int64_t> shape = outputTy.getShape();
|
|
Value result = xegpu::createVectorWithShapeFromValues(
|
|
builder, castOp.getLoc(), inputs, shape);
|
|
castOp->replaceAllUsesWith(ValueRange(result));
|
|
castOp->erase();
|
|
} else if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) {
|
|
// the castOp is emulating a pack op
|
|
ArrayRef<int64_t> tileShape = outputTy.getShape();
|
|
SmallVector<Value> results = xegpu::extractVectorsWithShapeFromValue(
|
|
builder, castOp.getLoc(), inputs[0], tileShape);
|
|
castOp->replaceAllUsesWith(results);
|
|
castOp->erase();
|
|
}
|
|
}
|
|
|
|
// This pattern lowers ConvertLayoutOp by removing the inst_data field from the
|
|
// layout attributes. Since both producer and consumer operations handle data
|
|
// partitioning based on their own inst_data, while maintaining original input
|
|
// and output shape, ConvertLayoutOp does not need to manage inst_data.
|
|
struct ConvertLayoutOpPattern
|
|
: public OpRewritePattern<xegpu::ConvertLayoutOp> {
|
|
using OpRewritePattern::OpRewritePattern;
|
|
LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
|
|
PatternRewriter &rewriter) const override {
|
|
xegpu::DistributeLayoutAttr inputLayout = op.getInputLayoutAttr();
|
|
xegpu::DistributeLayoutAttr targetLayout = op.getTargetLayoutAttr();
|
|
if (inputLayout.getEffectiveInstDataAsInt().empty() ||
|
|
targetLayout.getEffectiveInstDataAsInt().empty())
|
|
return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp.");
|
|
|
|
inputLayout = inputLayout.dropInstData();
|
|
targetLayout = targetLayout.dropInstData();
|
|
auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(
|
|
op.getLoc(), op.getType(), op.getSource(), inputLayout, targetLayout);
|
|
rewriter.replaceOp(op, newOp);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
//===------------------------------------------------------------------------===//
|
|
// The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops
|
|
// to partition operations that process large shapes into multiple operations on
|
|
// smaller shapes, as specified by the inst_data in the layout attribute. This
|
|
// enables each resulting operation to be efficiently mapped to a hardware
|
|
// instruction.
|
|
//===------------------------------------------------------------------------===//
|
|
|
|
class XeGPUBlockingPass final
|
|
: public xegpu::impl::XeGPUBlockingBase<XeGPUBlockingPass> {
|
|
public:
|
|
void runOnOperation() override;
|
|
|
|
private:
|
|
// Get the tile shape for a given OpOperand or OpResult by examining the
|
|
// corresponding layout attribute. If layout is not present or is not a
|
|
// subgroup level layout, it returns std::nullopt.
|
|
template <typename T,
|
|
typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
|
|
std::is_same_v<T, OpResult>>>
|
|
std::optional<SmallVector<int64_t>>
|
|
getTileShape(const T &operandOrResult) const;
|
|
|
|
// Get the tile shape for a given operation.
|
|
std::optional<SmallVector<int64_t>> getTileShape(Operation *op) const;
|
|
|
|
// Determine if the operation requires unrolling. Return false if all operands
|
|
// and results have tile shapes identical to their original types. Otherwise,
|
|
// return true.
|
|
bool needsUnroll(Operation *op) const;
|
|
};
|
|
} // namespace
|
|
|
|
template <typename T, typename>
|
|
std::optional<SmallVector<int64_t>>
|
|
XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
|
|
Value value;
|
|
Operation *ownerOp;
|
|
if constexpr (std::is_same_v<T, OpOperand>) {
|
|
value = operandOrResult.get();
|
|
ownerOp = operandOrResult.getOwner();
|
|
} else {
|
|
value = (Value)operandOrResult;
|
|
ownerOp = value.getDefiningOp();
|
|
}
|
|
|
|
xegpu::DistributeLayoutAttr layout =
|
|
xegpu::getDistributeLayoutAttr(operandOrResult);
|
|
if (layout && layout.isForSubgroup()) {
|
|
if (!layout.getEffectiveInstDataAsInt().empty()) {
|
|
SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
|
|
// Remove leading unit dimensions from inst_data for non-rank-sensitive
|
|
// ops. For example, if the inst_data is [1, 1, 32] it will pass [32] as
|
|
// the unroll/blocking size.
|
|
// Skip it for rank-sensitive ops, whose semantics depend on the tensor
|
|
// rank (and consequently its shape), and therefore must not alter the
|
|
// input tile rank or shape, such as by dropping leading dimensions.
|
|
bool skipLeadingUnitDimRemoval =
|
|
ownerOp &&
|
|
(isa<xegpu::CreateNdDescOp, xegpu::DpasOp, xegpu::ConvertLayoutOp,
|
|
xegpu::LoadMatrixOp, xegpu::StoreMatrixOp, xegpu::AtomicRMWOp,
|
|
xegpu::LoadNdOp, xegpu::StoreNdOp, xegpu::PrefetchNdOp,
|
|
vector::TransposeOp, vector::ShapeCastOp,
|
|
vector::MultiDimReductionOp, vector::BroadcastOp>(ownerOp));
|
|
if (!skipLeadingUnitDimRemoval) {
|
|
auto it = llvm::find_if(instData, [](auto val) { return val != 1; });
|
|
instData.erase(instData.begin(), it);
|
|
}
|
|
return instData;
|
|
}
|
|
|
|
if (auto type = dyn_cast<ShapedType>(value.getType()))
|
|
return llvm::to_vector(type.getShape());
|
|
}
|
|
LDBG() << "failed to getTileShape for: " << value;
|
|
return std::nullopt;
|
|
}
|
|
|
|
// Computes the tile shape for a whole operation by picking the operand or
// result whose layout drives the blocking of that kind of op. Ops that are
// not handled here yield std::nullopt.
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
  // Ops whose tile is taken from their first result.
  if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
          xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
    return getTileShape(op->getOpResult(0));

  // Ops whose tile is taken from their first operand.
  if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
          xegpu::StoreMatrixOp>(op))
    return getTileShape(op->getOpOperand(0));

  // StoreNdOp takes its tile from its second operand.
  if (isa<xegpu::StoreNdOp>(op))
    return getTileShape(op->getOpOperand(1));

  // LoadGatherOp: with offsets the result is used, without offsets the first
  // operand is used.
  if (auto gather = dyn_cast<xegpu::LoadGatherOp>(op)) {
    if (gather.getOffsets())
      return getTileShape(gather->getOpResult(0));
    return getTileShape(gather->getOpOperand(0));
  }

  // StoreScatterOp: with offsets the first operand is used, without offsets
  // the second operand is used.
  if (auto scatter = dyn_cast<xegpu::StoreScatterOp>(op)) {
    if (scatter.getOffsets())
      return getTileShape(scatter->getOpOperand(0));
    return getTileShape(scatter->getOpOperand(1));
  }

  if (isa<xegpu::DpasOp>(op)) {
    std::optional<SmallVector<int64_t>> lhsTile =
        getTileShape(op->getOpOperand(0));
    std::optional<SmallVector<int64_t>> rhsTile =
        getTileShape(op->getOpOperand(1));

    // Both A and B need a 2D tile.
    const bool bothRankTwo = lhsTile && lhsTile->size() == 2 && rhsTile &&
                             rhsTile->size() == 2;
    if (!bothRankTwo)
      return std::nullopt;

    const int64_t m = (*lhsTile)[0];
    const int64_t k = (*lhsTile)[1];
    const int64_t n = (*rhsTile)[1];

    // Semantic check for A and B: the shared dimension has to agree.
    if (k != (*rhsTile)[0])
      return std::nullopt;

    // Semantic check for C: when a third operand exists, its tile has to be
    // {m, n}.
    if (op->getNumOperands() == 3) {
      std::optional<SmallVector<int64_t>> accTile =
          getTileShape(op->getOpOperand(2));
      int64_t expectedAccTile[2] = {m, n};
      if (!accTile || !llvm::equal(*accTile, expectedAccTile))
        return std::nullopt;
    }

    return SmallVector<int64_t>({m, k, n});
  }

  // Single-result elementwise-mappable ops follow their result.
  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)
    return getTileShape(op->getOpResult(0));

  // A multi-dim reduction is blocked based on its first operand.
  if (isa<vector::MultiDimReductionOp>(op))
    return getTileShape(op->getOpOperand(0));

  // Remaining vector ops that are blocked based on their result.
  if (isa<vector::TransposeOp, vector::BroadcastOp, vector::StepOp,
          vector::ConstantMaskOp, vector::CreateMaskOp>(op))
    return getTileShape(op->getOpResult(0));

  return std::nullopt;
}
|
|
|
|
// Decides whether `op` has to be unrolled.
//
// Returns false when any operand or result carries a workgroup level layout.
// Otherwise returns true if at least one operand or result has a tile shape
// and is unrollable with respect to it (see `exceedsTile` below).
bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
  // Works for both OpOperand and OpResult.
  auto carriesWgLayout = [](const auto &operandOrResult) {
    xegpu::DistributeLayoutAttr layout =
        xegpu::getDistributeLayoutAttr(operandOrResult);
    return layout && layout.isForWorkgroup();
  };

  // Skip the op if any of its operands or results has a workgroup level
  // layout.
  const bool wgOnOperands = llvm::any_of(op->getOpOperands(), carriesWgLayout);
  const bool wgOnResults = llvm::any_of(op->getOpResults(), carriesWgLayout);
  if (wgOnOperands || wgOnResults) {
    LDBG() << "skip unrolling for op with workgroup level layout: " << *op;
    return false;
  }

  // A tensor descriptor is unrollable when its layout still has inst_data
  // (the tile shape is not consulted for it); any other shaped value is
  // unrollable when its shape differs from the tile shape.
  auto exceedsTile = [](Value value, ArrayRef<int64_t> tileShape) {
    Type ty = value.getType();
    if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(ty)) {
      xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr();
      return layout && !layout.getEffectiveInstDataAsInt().empty();
    }
    auto shapedTy = dyn_cast<ShapedType>(ty);
    return shapedTy && !llvm::equal(tileShape, shapedTy.getShape());
  };

  // Both scans are evaluated before they are combined.
  const bool anyOperandUnrollable =
      llvm::any_of(op->getOpOperands(), [&](OpOperand &operand) {
        std::optional<SmallVector<int64_t>> tile = getTileShape(operand);
        return tile.has_value() && exceedsTile(operand.get(), *tile);
      });
  const bool anyResultUnrollable =
      llvm::any_of(op->getOpResults(), [&](OpResult res) {
        std::optional<SmallVector<int64_t>> tile = getTileShape(res);
        return tile.has_value() && exceedsTile(res, *tile);
      });
  return anyOperandUnrollable || anyResultUnrollable;
}
|
|
|
|
void XeGPUBlockingPass::runOnOperation() {
|
|
MLIRContext *ctx = &getContext();
|
|
Operation *op = getOperation();
|
|
|
|
// TODO-LayoutRefactor: unify the local propagation for layout preprocessing
|
|
// replace the function with recoverTemporaryLayouts
|
|
// if (!xegpu::recoverTemporaryLayouts(op)) {
|
|
// signalPassFailure();
|
|
// return;
|
|
// }
|
|
xegpu::recoverTemporaryLayoutsDeprecated(op);
|
|
|
|
auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
|
|
xegpu::LayoutAttr layout) {
|
|
int count = 1;
|
|
SmallVector<int64_t> tileShape(shape);
|
|
if (layout && layout.getInstData()) {
|
|
DenseI32ArrayAttr instData = layout.getInstData();
|
|
tileShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
|
|
count = computeProduct(shape) / computeProduct(tileShape);
|
|
}
|
|
return std::make_pair(tileShape, count);
|
|
};
|
|
|
|
// Perform type conversion for SCF control folow ops
|
|
TypeConverter converter;
|
|
converter.addConversion([](Type type) -> Type { return type; });
|
|
converter.addConversion(
|
|
[&](RankedTensorType type,
|
|
SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
|
|
Type elemTy = type.getElementType();
|
|
ArrayRef<int64_t> shape = type.getShape();
|
|
|
|
auto layout =
|
|
llvm::dyn_cast_if_present<xegpu::LayoutAttr>(type.getEncoding());
|
|
if (layout && layout.isForWorkgroup())
|
|
return failure();
|
|
|
|
int count;
|
|
SmallVector<int64_t> subShape;
|
|
std::tie(subShape, count) = getTileShapeAndCount(shape, layout);
|
|
auto newTy = VectorType::get(subShape, elemTy);
|
|
result.append(count, newTy);
|
|
return success();
|
|
});
|
|
converter.addConversion(
|
|
[&](xegpu::TensorDescType type,
|
|
SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
|
|
Type elemTy = type.getElementType();
|
|
ArrayRef<int64_t> shape = type.getShape();
|
|
|
|
xegpu::LayoutAttr layout = type.getLayoutAttr();
|
|
if (layout && layout.isForWorkgroup())
|
|
return failure();
|
|
|
|
int count;
|
|
SmallVector<int64_t> subShape;
|
|
std::tie(subShape, count) = getTileShapeAndCount(shape, layout);
|
|
|
|
if (layout)
|
|
layout = layout.dropInstData();
|
|
|
|
auto newTy = xegpu::TensorDescType::get(
|
|
type.getContext(), subShape, elemTy, type.getEncoding(), layout);
|
|
result.append(count, newTy);
|
|
return success();
|
|
});
|
|
|
|
xegpu::doSCFStructuralTypeConversionWithTensorType(op, converter);
|
|
|
|
// Remove leading unit dimensions from vector ops and then
|
|
// do the unrolling.
|
|
{
|
|
RewritePatternSet patterns(ctx);
|
|
vector::populateCastAwayVectorLeadingOneDimPatterns(patterns);
|
|
(void)applyPatternsGreedily(op, std::move(patterns));
|
|
}
|
|
xegpu::UnrollOptions options;
|
|
options.setFilterConstraint(
|
|
[&](Operation *op) -> LogicalResult { return success(needsUnroll(op)); });
|
|
|
|
options.setNativeShapeFn([&](Operation *op) { return getTileShape(op); });
|
|
|
|
options.setUnrolledTypesFn([&](ShapedType type, ArrayRef<int64_t> tileShape,
|
|
bool returnSingleType = false) {
|
|
Type elemTy = type.getElementType();
|
|
Type newTy;
|
|
|
|
if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type)) {
|
|
|
|
Attribute encoding = tdescTy.getEncoding();
|
|
// If the encoding is a ScatterTensorDescAttr, we need to
|
|
// potentially adjust the chunk size based on the inst_data.
|
|
if (tdescTy.isScattered()) {
|
|
int64_t chunkSize = tdescTy.getChunkSizeAsInt();
|
|
|
|
if (chunkSize > 1) {
|
|
int64_t blockedChunkSize = chunkSize;
|
|
auto instData = tdescTy.getLayoutAttr().getInstData();
|
|
if (!instData.empty())
|
|
blockedChunkSize = instData.asArrayRef().back();
|
|
|
|
// To create a new attribute with a different chunk_size:
|
|
auto newEncoding = xegpu::ScatterTensorDescAttr::get(
|
|
ctx, tdescTy.getMemorySpace(), blockedChunkSize);
|
|
encoding = newEncoding;
|
|
}
|
|
}
|
|
|
|
newTy =
|
|
xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
|
|
tdescTy.getLayoutAttr().dropInstData());
|
|
} else {
|
|
newTy = VectorType::get(tileShape, elemTy);
|
|
}
|
|
|
|
if (returnSingleType)
|
|
return SmallVector<Type>{newTy};
|
|
std::optional<SmallVector<int64_t>> ratio =
|
|
computeShapeRatio(type.getShape(), tileShape);
|
|
assert(ratio && "The shape of the type must be a multiple of tileShape.");
|
|
return SmallVector<Type>(computeProduct(*ratio), newTy);
|
|
});
|
|
|
|
RewritePatternSet patterns(ctx);
|
|
patterns.add<ConvertLayoutOpPattern>(ctx);
|
|
|
|
vector::UnrollVectorOptions vectorOptions;
|
|
vectorOptions.setNativeShapeFn(options.nativeShape);
|
|
|
|
populateXeGPUUnrollPatterns(patterns, options);
|
|
vector::populateVectorUnrollPatterns(patterns, vectorOptions);
|
|
|
|
(void)applyPatternsGreedily(op, std::move(patterns));
|
|
|
|
op->walk([](Operation *op) {
|
|
// Remove the layout attributes cached per operands.
|
|
for (OpOperand &opr : op->getOpOperands()) {
|
|
std::string name = xegpu::getTemporaryLayoutName(opr);
|
|
if (op->hasAttrOfType<xegpu::DistributeLayoutAttr>(name))
|
|
op->removeAttr(name);
|
|
}
|
|
|
|
// Update the layout attributes per result.
|
|
for (OpResult result : op->getOpResults()) {
|
|
std::string name = xegpu::getTemporaryLayoutName(result);
|
|
if (auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
|
|
op->removeAttr(name);
|
|
if (!isa<LoopLikeOpInterface>(op))
|
|
xegpu::setDistributeLayoutAttr(result, layout.dropInstData());
|
|
}
|
|
}
|
|
|
|
// Resolve unrealized conversion cast ops emulating pack/unpack
|
|
if (auto castOp = dyn_cast<UnrealizedConversionCastOp>(op))
|
|
resolveUnrealizedConversionCastOp(castOp);
|
|
});
|
|
}
|