
The greedy rewriter is used in many different flows and offers a lot of convenience (worklist management, debug actions, tracing, etc.). But it combines two kinds of greedy behavior: 1) how ops are matched, and 2) folding wherever it can. These are independent forms of greediness, and coupling them leads to inefficiency. For example, when a lowering needs distinct phases and patterns must be applied in a specific order split across different passes, using this driver means needlessly retrying folding and running multiple rounds of folding attempts where one final run would have sufficed. Of course, one can avoid this behavior locally by building a custom driver, but separating the two is also a commonly requested feature that folks keep working around locally in suboptimal ways. For downstream users, there should be no behavioral change. Updating from the deprecated entry point should just be a find-and-replace (e.g., of the `find ./ -type f -exec sed -i 's|applyPatternsAndFoldGreedily|applyPatternsGreedily|g' {} \;` variety), as the API arguments haven't changed between the two.
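To illustrate how mechanical the update is, here is a minimal sketch of a pass switching to the new entry point; the pass name and the pattern-population comment are placeholders and not part of this change, and the `VectorToXeGPU.cpp` file below already calls `applyPatternsGreedily` in exactly this way:

```cpp
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

using namespace mlir;

namespace {
// Hypothetical pass, used purely for illustration.
struct MyLoweringPass : public PassWrapper<MyLoweringPass, OperationPass<>> {
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MyLoweringPass)

  void runOnOperation() override {
    RewritePatternSet patterns(&getContext());
    // Populate patterns here, e.g. patterns.add<...>(&getContext());

    // Before (deprecated):
    //   (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
    // After: identical arguments, only the name changes.
    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
      signalPassFailure();
  }
};
} // namespace
```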
//===- VectorToXeGPU.cpp - Convert vector to XeGPU dialect ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements lowering of vector operations to XeGPU dialect ops.
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"
#include "llvm/ADT/TypeSwitch.h"

#include <algorithm>
#include <optional>

namespace mlir {
#define GEN_PASS_DEF_CONVERTVECTORTOXEGPU
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

namespace {

// Return true if value represents a zero constant.
static bool isZeroConstant(Value val) {
  auto constant = val.getDefiningOp<arith::ConstantOp>();
  if (!constant)
    return false;

  return TypeSwitch<Attribute, bool>(constant.getValue())
      .Case<FloatAttr>(
          [](auto floatAttr) { return floatAttr.getValue().isZero(); })
      .Case<IntegerAttr>(
          [](auto intAttr) { return intAttr.getValue().isZero(); })
      .Default([](auto) { return false; });
}

static LogicalResult storeLoadPreconditions(PatternRewriter &rewriter,
                                            Operation *op, VectorType vecTy) {
  // Validate only vector as the basic vector store and load ops guarantee
  // XeGPU-compatible memref source.
  unsigned vecRank = vecTy.getRank();
  if (!(vecRank == 1 || vecRank == 2))
    return rewriter.notifyMatchFailure(op, "Expects 1D or 2D vector");

  return success();
}

static LogicalResult transferPreconditions(PatternRewriter &rewriter,
                                           VectorTransferOpInterface xferOp) {
  if (xferOp.getMask())
    return rewriter.notifyMatchFailure(xferOp,
                                       "Masked transfer is not supported");

  auto srcTy = dyn_cast<MemRefType>(xferOp.getShapedType());
  if (!srcTy)
    return rewriter.notifyMatchFailure(xferOp, "Expects memref source");

  // Perform common data transfer checks.
  VectorType vecTy = xferOp.getVectorType();
  if (failed(storeLoadPreconditions(rewriter, xferOp, vecTy)))
    return failure();

  // Validate further transfer op semantics.
  SmallVector<int64_t> strides;
  int64_t offset;
  if (failed(getStridesAndOffset(srcTy, strides, offset)) ||
      strides.back() != 1)
    return rewriter.notifyMatchFailure(
        xferOp, "Buffer must be contiguous in the innermost dimension");

  unsigned vecRank = vecTy.getRank();
  if (xferOp.hasOutOfBoundsDim() && vecRank < 2)
    return rewriter.notifyMatchFailure(
        xferOp, "Boundary check is available only for block instructions.");

  AffineMap map = xferOp.getPermutationMap();
  if (!map.isProjectedPermutation(/*allowZeroInResults=*/false))
    return rewriter.notifyMatchFailure(xferOp, "Unsupported permutation map");
  unsigned numInputDims = map.getNumInputs();
  for (AffineExpr expr : map.getResults().take_back(vecRank)) {
    auto dim = dyn_cast<AffineDimExpr>(expr);
    if (dim.getPosition() < (numInputDims - vecRank))
      return rewriter.notifyMatchFailure(
          xferOp, "Only the innermost dimensions can be accessed");
  }

  return success();
}

static xegpu::CreateNdDescOp
createNdDescriptor(PatternRewriter &rewriter, Location loc,
                   xegpu::TensorDescType descType, TypedValue<MemRefType> src,
                   Operation::operand_range offsets) {
  MemRefType srcTy = src.getType();
  auto [strides, offset] = getStridesAndOffset(srcTy);

  xegpu::CreateNdDescOp ndDesc;
  if (srcTy.hasStaticShape()) {
    ndDesc = rewriter.create<xegpu::CreateNdDescOp>(loc, descType, src,
                                                    getAsOpFoldResult(offsets));
  } else {
    // In case of any dynamic shapes, source's shape and strides have to be
    // explicitly provided.
    SmallVector<Value> sourceDims;
    unsigned srcRank = srcTy.getRank();
    for (unsigned i = 0; i < srcRank; ++i)
      sourceDims.push_back(rewriter.create<memref::DimOp>(loc, src, i));

    SmallVector<int64_t> constOffsets;
    SmallVector<Value> dynOffsets;
    for (Value offset : offsets) {
      std::optional<int64_t> staticVal = getConstantIntValue(offset);
      if (!staticVal)
        dynOffsets.push_back(offset);
      constOffsets.push_back(staticVal.value_or(ShapedType::kDynamic));
    }

    SmallVector<Value> dynShapes;
    for (auto [idx, shape] : llvm::enumerate(srcTy.getShape())) {
      if (shape == ShapedType::kDynamic)
        dynShapes.push_back(sourceDims[idx]);
    }

    // Compute strides in reverse order.
    SmallVector<Value> dynStrides;
    Value accStride = rewriter.create<arith::ConstantIndexOp>(loc, 1);
    // Last stride is guaranteed to be static and unit.
    for (int i = static_cast<int>(strides.size()) - 2; i >= 0; --i) {
      accStride =
          rewriter.create<arith::MulIOp>(loc, accStride, sourceDims[i + 1]);
      if (strides[i] == ShapedType::kDynamic)
        dynStrides.push_back(accStride);
    }
    std::reverse(dynStrides.begin(), dynStrides.end());

    ndDesc = rewriter.create<xegpu::CreateNdDescOp>(
        loc, descType, src, dynOffsets, dynShapes, dynStrides,
        DenseI64ArrayAttr::get(rewriter.getContext(), constOffsets),
        DenseI64ArrayAttr::get(rewriter.getContext(), srcTy.getShape()),
        DenseI64ArrayAttr::get(rewriter.getContext(), strides));
  }

  return ndDesc;
}

struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
  using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
                                PatternRewriter &rewriter) const override {
    Location loc = readOp.getLoc();

    if (failed(transferPreconditions(rewriter, readOp)))
      return failure();

    bool isOutOfBounds = readOp.hasOutOfBoundsDim();
    if (isOutOfBounds && !isZeroConstant(readOp.getPadding()))
      return rewriter.notifyMatchFailure(
          readOp, "Unsupported non-zero padded out-of-bounds read");

    AffineMap readMap = readOp.getPermutationMap();
    bool isTransposeLoad = !readMap.isMinorIdentity();

    VectorType vecTy = readOp.getVectorType();
    Type elementType = vecTy.getElementType();
    unsigned minTransposeBitWidth = 32;
    if (isTransposeLoad &&
        elementType.getIntOrFloatBitWidth() < minTransposeBitWidth)
      return rewriter.notifyMatchFailure(
          readOp, "Unsupported data type for transposition");

    // If load is transposed, get the base shape for the tensor descriptor.
    SmallVector<int64_t> descShape{vecTy.getShape()};
    if (isTransposeLoad)
      std::reverse(descShape.begin(), descShape.end());
    auto descType = xegpu::TensorDescType::get(
        descShape, elementType, /*array_length=*/1,
        /*boundary_check=*/isOutOfBounds, xegpu::MemorySpace::Global);

    xegpu::CreateNdDescOp ndDesc =
        createNdDescriptor(rewriter, loc, descType,
                           dyn_cast<TypedValue<MemRefType>>(readOp.getSource()),
                           readOp.getIndices());

    DenseI64ArrayAttr transposeAttr =
        !isTransposeLoad ? nullptr
                         : DenseI64ArrayAttr::get(rewriter.getContext(),
                                                  ArrayRef<int64_t>{1, 0});
    // By default, no specific caching policy is assigned.
    xegpu::CachePolicyAttr hint = nullptr;
    auto loadOp = rewriter.create<xegpu::LoadNdOp>(
        loc, vecTy, ndDesc, /*packed=*/nullptr, transposeAttr,
        /*l1_hint=*/hint,
        /*l2_hint=*/hint, /*l3_hint=*/hint);
    rewriter.replaceOp(readOp, loadOp);

    return success();
  }
};

struct TransferWriteLowering
    : public OpRewritePattern<vector::TransferWriteOp> {
  using OpRewritePattern<vector::TransferWriteOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp,
                                PatternRewriter &rewriter) const override {
    Location loc = writeOp.getLoc();

    if (failed(transferPreconditions(rewriter, writeOp)))
      return failure();

    AffineMap map = writeOp.getPermutationMap();
    if (!map.isMinorIdentity())
      return rewriter.notifyMatchFailure(writeOp, "Expects identity map");

    VectorType vecTy = writeOp.getVectorType();
    auto descType = xegpu::TensorDescType::get(
        vecTy.getShape(), vecTy.getElementType(),
        /*array_length=*/1, /*boundary_check=*/writeOp.hasOutOfBoundsDim(),
        xegpu::MemorySpace::Global);
    xegpu::CreateNdDescOp ndDesc = createNdDescriptor(
        rewriter, loc, descType,
        dyn_cast<TypedValue<MemRefType>>(writeOp.getSource()),
        writeOp.getIndices());

    // By default, no specific caching policy is assigned.
    xegpu::CachePolicyAttr hint = nullptr;
    auto storeOp =
        rewriter.create<xegpu::StoreNdOp>(loc, writeOp.getVector(), ndDesc,
                                          /*l1_hint=*/hint,
                                          /*l2_hint=*/hint, /*l3_hint=*/hint);
    rewriter.replaceOp(writeOp, storeOp);

    return success();
  }
};

struct LoadLowering : public OpRewritePattern<vector::LoadOp> {
  using OpRewritePattern<vector::LoadOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(vector::LoadOp loadOp,
                                PatternRewriter &rewriter) const override {
    Location loc = loadOp.getLoc();

    VectorType vecTy = loadOp.getResult().getType();
    if (failed(storeLoadPreconditions(rewriter, loadOp, vecTy)))
      return failure();

    // Boundary check is available only for block instructions.
    bool boundaryCheck = vecTy.getRank() > 1;

    auto descType = xegpu::TensorDescType::get(
        vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1,
        boundaryCheck, xegpu::MemorySpace::Global);
    xegpu::CreateNdDescOp ndDesc = createNdDescriptor(
        rewriter, loc, descType, loadOp.getBase(), loadOp.getIndices());

    // By default, no specific caching policy is assigned.
    xegpu::CachePolicyAttr hint = nullptr;
    auto loadNdOp = rewriter.create<xegpu::LoadNdOp>(
        loc, vecTy, ndDesc, /*packed=*/nullptr, /*transpose=*/nullptr,
        /*l1_hint=*/hint,
        /*l2_hint=*/hint, /*l3_hint=*/hint);
    rewriter.replaceOp(loadOp, loadNdOp);

    return success();
  }
};

struct StoreLowering : public OpRewritePattern<vector::StoreOp> {
  using OpRewritePattern<vector::StoreOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(vector::StoreOp storeOp,
                                PatternRewriter &rewriter) const override {
    Location loc = storeOp.getLoc();

    TypedValue<VectorType> vector = storeOp.getValueToStore();
    VectorType vecTy = vector.getType();
    if (failed(storeLoadPreconditions(rewriter, storeOp, vecTy)))
      return failure();

    // Boundary check is available only for block instructions.
    bool boundaryCheck = vecTy.getRank() > 1;

    auto descType = xegpu::TensorDescType::get(
        vecTy.getShape(), vecTy.getElementType(),
        /*array_length=*/1, boundaryCheck, xegpu::MemorySpace::Global);
    xegpu::CreateNdDescOp ndDesc = createNdDescriptor(
        rewriter, loc, descType, storeOp.getBase(), storeOp.getIndices());

    // By default, no specific caching policy is assigned.
    xegpu::CachePolicyAttr hint = nullptr;
    auto storeNdOp =
        rewriter.create<xegpu::StoreNdOp>(loc, vector, ndDesc,
                                          /*l1_hint=*/hint,
                                          /*l2_hint=*/hint, /*l3_hint=*/hint);
    rewriter.replaceOp(storeOp, storeNdOp);

    return success();
  }
};

struct ConvertVectorToXeGPUPass
    : public impl::ConvertVectorToXeGPUBase<ConvertVectorToXeGPUPass> {
  void runOnOperation() override {
    RewritePatternSet patterns(&getContext());
    populateVectorToXeGPUConversionPatterns(patterns);
    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
      return signalPassFailure();
  }
};

} // namespace

void mlir::populateVectorToXeGPUConversionPatterns(
    RewritePatternSet &patterns) {
  patterns.add<TransferReadLowering, TransferWriteLowering, LoadLowering,
               StoreLowering>(patterns.getContext());
}

std::unique_ptr<Pass> mlir::createConvertVectorToXeGPUPass() {
  return std::make_unique<ConvertVectorToXeGPUPass>();
}