llvm-project/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
Jianhui Li 6072e4006f
[MLIR][XeGPU] Preserve leading unit dimension during blocking (#180884)
This PR preserves the leading unit dimension during blocking. This ensures
the blocking process avoids generating unnecessary
insert/extract_strided_slice ops, which under certain conditions become
difficult to cancel, and create an extra burden in lane layout
propagation and subgroup distribution.
This PR also extends subgroup distribution so that load and store can
support payload/mask/offsets with a leading unit dimension. The
distributed load/store works on 1D vectors only, but a shape_cast is inserted
to remove and add the leading dimension for the input/output vectors.
Compared to the insert/extract ops inserted at subgroup level, the
shape_cast inserted at lane level to handle the leading unit dimension is
essentially a no-op and can be processed cheaply.
2026-02-12 11:41:37 -08:00

2242 lines
103 KiB
C++

//===- XeGPUSubgroupDistribute.cpp - XeGPU Subgroup Distribute Pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeRange.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/InliningUtils.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallVectorExtras.h"
namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir
#define DEBUG_TYPE "xegpu-subgroup-distribute"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
using namespace mlir;
// Attribute name for identifying the UnrealizedConversionCastOp added to
// resolve SIMT type mismatches.
static const char *const resolveSIMTTypeMismatch =
    "resolve_simt_type_mismatch";
namespace {
//===----------------------------------------------------------------------===//
// SIMT Distribution Patterns
//===----------------------------------------------------------------------===//
/// In certain cases, we may need to favor XeGPU specific distribution patterns
/// over generic vector distribution patterns. In such cases, we can assign
/// priorities to patterns.
/// NOTE(review): presumably these values are used as pattern benefits, so an
/// `AboveRegular` pattern is preferred over a `Regular` one -- confirm at the
/// pattern registration site.
enum PatternHierarchy : unsigned { Regular = 1, AboveRegular = 2 };
/// Helper function to resolve types if the distributed type out of
/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
/// Example 1:
/// distributed type: vector<8x1xf32>
/// expected type: vector<8xf32>
/// resolved using,
/// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
/// Example 2:
/// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
/// expected type: xegpu.tensor_desc<8x16xf32>
/// resolved using,
/// %0 = unrealized_conversion_cast %1 :
/// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
/// xegpu.tensor_desc<8x16xf32>
template <typename T>
static Value resolveDistributedTy(Value orig, T expected,
                                  PatternRewriter &rewriter) {
  Type origTy = orig.getType();
  // Nothing to reconcile when the value already has the expected type.
  if (origTy == expected)
    return orig;
  // Vector values are reconciled with a vector.shape_cast.
  if (isa<VectorType>(origTy))
    return vector::ShapeCastOp::create(rewriter, orig.getLoc(), expected, orig)
        .getResult();
  // Tensor descriptors are reconciled with an unrealized conversion cast that
  // is tagged so it can be identified later.
  if (isa<xegpu::TensorDescType>(origTy)) {
    auto tdescCast = UnrealizedConversionCastOp::create(
        rewriter, orig.getLoc(), expected, orig);
    tdescCast->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
    return tdescCast.getResult(0);
  }
  llvm_unreachable("Unsupported type for reconciliation");
  return orig;
}
/// Given a vector type and its distributed vector type, return the list of
/// dimensions that are distributed.
static SmallVector<int64_t> getDistributedDims(VectorType originalType,
                                               VectorType distributedType) {
  assert(originalType.getRank() == distributedType.getRank() &&
         "sequential and distributed vector types must have the same rank");
  // A dimension counts as distributed when its size differs between the two
  // types.
  SmallVector<int64_t> result;
  for (int64_t dim = 0, rank = originalType.getRank(); dim < rank; ++dim)
    if (originalType.getDimSize(dim) != distributedType.getDimSize(dim))
      result.push_back(dim);
  return result;
}
/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
/// of the original GPUFuncOp to the new GPUFuncOp such that entire body is
/// contained within a WarpExecuteOnLane0Op.
/// Example:
///
/// ```
/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
/// ...
/// ...
/// gpu.return %result: vector<8x16xf32>
/// }
/// ```
/// To
/// ```
/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
/// %laneid = gpu.lane_id : index
/// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
/// ...
/// ...
/// gpu.yield %result: vector<8x16xf32>
/// }
/// gpu.return %0 : vector<8x16xf32>
/// }
/// ```
struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
  using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                PatternRewriter &rewriter) const override {
    // The warp size is taken from the target description, so a known chip is
    // required.
    auto targetArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
    if (!targetArch)
      return rewriter.notifyMatchFailure(
          gpuFuncOp, "Subgroup distribution requires target attribute attached "
                     "to set the warp size");

    // Skip functions whose body consists only of operand-less returns.
    auto isVoidReturn = [](Operation &op) {
      return isa<gpu::ReturnOp>(op) && op.getNumOperands() == 0;
    };
    if (llvm::all_of(gpuFuncOp.getBody().getOps(), isVoidReturn))
      return failure();

    // Skip functions that already contain a warp_execute_on_lane0 at the top
    // level of the body.
    auto isWarpOp = [](Operation &op) {
      return isa<gpu::WarpExecuteOnLane0Op>(op);
    };
    if (llvm::any_of(gpuFuncOp.getBody().getOps(), isWarpOp))
      return failure();

    // Clone the signature (including attributions) and attributes into a new
    // function.
    auto typeOfArg = [](BlockArgument arg) { return arg.getType(); };
    SmallVector<Type> workgroupTypes =
        llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(), typeOfArg);
    SmallVector<Type> privateTypes =
        llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(), typeOfArg);
    auto newGpuFunc = gpu::GPUFuncOp::create(
        rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
        gpuFuncOp.getFunctionType(), workgroupTypes, privateTypes);
    newGpuFunc->setAttrs(gpuFuncOp->getAttrs());

    // In the new function's entry block, materialize the lane id and a warp
    // op that takes the function arguments and produces the function results.
    rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
    auto laneId = gpu::LaneIdOp::create(
        rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
        /** upperBound = **/ mlir::IntegerAttr());
    ArrayRef<Type> resultTypes = gpuFuncOp.getFunctionType().getResults();
    auto warpOp = gpu::WarpExecuteOnLane0Op::create(
        rewriter, laneId.getLoc(), resultTypes, laneId,
        targetArch->getSubgroupSize(), newGpuFunc.getArguments(),
        newGpuFunc.getArgumentTypes());
    // Remember the block created by the builder; it is dropped once the
    // original body has been moved in.
    Block &placeholderBlock = warpOp.getBodyRegion().front();

    // Turn the terminator of the original function into a gpu.yield.
    auto oldReturn =
        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
    rewriter.setInsertionPointAfter(oldReturn);
    gpu::YieldOp::create(rewriter, oldReturn.getLoc(),
                         oldReturn.getOperands());
    rewriter.eraseOp(oldReturn);

    // Transfer the original body into the warp op and discard the placeholder.
    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
                                warpOp.getBodyRegion().begin());
    rewriter.eraseBlock(&placeholderBlock);

    // The new function returns whatever the warp op produces.
    rewriter.setInsertionPointAfter(warpOp);
    gpu::ReturnOp::create(rewriter, newGpuFunc.getLoc(), warpOp.getResults());
    rewriter.replaceOp(gpuFuncOp, newGpuFunc);
    return success();
  }
};
/// Distribute a create_nd_tdesc feeding into gpu.yield op of the enclosing
/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
/// still contain the original op that will not be used by the yield op (and
/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
/// arguments. Tensor descriptor shape is not distributed because it is a
/// uniform value across all work items within the subgroup. However, the
/// layout information is dropped in the new tensor descriptor type.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
/// ...
/// %td = xegpu.create_nd_tdesc %arg0
/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
/// gpu.yield %td
/// }
/// ```
/// To
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
/// ...
/// %dead = xegpu.create_nd_tdesc %arg0
/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
/// gpu.yield %arg0, %dead
/// }
/// %td = xegpu.create_nd_tdesc %r#0: memref<4x8xf32>
/// -> !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;

  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Look for a yielded value that is produced by a create_nd_tdesc.
    OpOperand *yieldOperand =
        getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
    if (!yieldOperand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a xegpu::CreateNdDesc op");

    auto descOp = yieldOperand->get().getDefiningOp<xegpu::CreateNdDescOp>();
    unsigned resultIdx = yieldOperand->getOperandNumber();

    // Only descriptors that carry a layout are handled.
    if (!descOp.getType().getLayoutAttr())
      return rewriter.notifyMatchFailure(
          descOp, "the tensor descriptor lacks layout attribute");

    // Offsets on the create op itself are not supported.
    if (!descOp.getMixedOffsets().empty())
      return rewriter.notifyMatchFailure(
          descOp, "xegpu::CreateNdDescOp must not have offsets");

    // Rebuild the warp op so it additionally yields the operands of the
    // descriptor op.
    SmallVector<size_t> newRetIndices;
    rewriter.setInsertionPoint(warpOp);
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, /* new yielded values = */ descOp->getOperands(),
        /* new yielded types = */ descOp.getOperandTypes(), newRetIndices);

    SmallVector<Value> forwardedOperands;
    forwardedOperands.reserve(newRetIndices.size());
    for (size_t retIdx : newRetIndices)
      forwardedOperands.push_back(newWarpOp.getResult(retIdx));

    // Re-create the descriptor after the warp op; the distributed descriptor
    // type carries no layout.
    rewriter.setInsertionPointAfter(newWarpOp);
    xegpu::TensorDescType layoutFreeTy = descOp.getType().dropLayouts();
    Value replacement = xegpu::CreateNdDescOp::create(
        rewriter, newWarpOp.getLoc(), layoutFreeTy, forwardedOperands,
        descOp->getAttrs());

    // Reconcile with the type the warp op result currently has, then redirect
    // all of its users.
    Value oldResult = newWarpOp.getResult(resultIdx);
    replacement =
        resolveDistributedTy(replacement, oldResult.getType(), rewriter);
    rewriter.replaceAllUsesWith(oldResult, replacement);
    return success();
  }
};
/// Distribute a store_nd op at the end of enclosing
/// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed
/// through the warp op interface they would be propagated as returned values.
/// Source vector is distributed based on lane layout. Appropriate cast ops are
/// inserted if the distributed types does not match expected xegpu SIMT types.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// xegpu.store_nd %arg0, %arg1 [%x, %y]: vector<4x8xf32>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// ```
/// To
/// ```
/// %r:4 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
/// ...
/// gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index
/// }
/// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
/// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
/// #layout0>
/// -> !xegpu.tensor_desc<4x8xf32>
/// xegpu.store_nd %0, %1 [%r#2, %r#3]: vector<4xf32>,
/// !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;

  /// Matches a store_nd that is the last op before the warp op terminator and
  /// re-creates it after the warp op on the distributed value.
  ///
  /// All preconditions are verified before any IR is created or modified, so
  /// a failed match leaves the IR untouched.
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
    if (!storeOp)
      return failure();

    SmallVector<OpFoldResult> offsets = storeOp.getMixedOffsets();
    // Expecting offsets to be present.
    if (offsets.empty())
      return rewriter.notifyMatchFailure(storeOp,
                                         "the store op must have offsets");

    xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          storeOp, "the source tensor descriptor lacks layout attribute");

    // Type the warp op will use when yielding the stored value.
    FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
    if (failed(distributedTypeByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(storeOp,
                                         "Failed to distribute the type");
    VectorType distributedTypeByWarpOp =
        distributedTypeByWarpOpOrFailure.value();

    // (xegpu-specific) distributed value type expected by the new store op.
    // Computed up front so that this failure exit happens before the warp op
    // is rewritten.
    FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
        xegpu::getDistributedVectorType(tensorDescTy);
    if (failed(storeNdDistributedValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          storeOp, "Failed to get distributed vector type for the store op");

    // No failure exits below this point; IR is modified from here on.
    // getAsValues is given the rewriter and may create ops for attribute
    // offsets, which is why it is only called after all checks.
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, storeOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });

    // Yield the stored value, the tensor descriptor and the offsets.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> newYieldedValues = {storeOp.getValue(),
                                           storeOp.getTensorDesc()};
    SmallVector<Type> newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy};
    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);

    // Create a new store op outside the warp op with the distributed vector
    // type. Tensor descriptor is not distributed.
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newStoreOperands;
    // For the value operand, there can be a mismatch between the vector type
    // distributed by the warp op and the (xegpu-specific) distributed type
    // supported by the store op. The mismatch is resolved with a cast op.
    newStoreOperands.push_back(resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]),
        storeNdDistributedValueTyOrFailure.value(), rewriter));
    // For the tensor descriptor operand, the layout attribute is dropped after
    // distribution. Types need to be resolved in this case also.
    xegpu::TensorDescType distributedTensorDescTy = tensorDescTy.dropLayouts();
    newStoreOperands.push_back(
        resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
                             distributedTensorDescTy, rewriter));
    // Collect offsets (yielded after the value and the descriptor).
    for (size_t i = 2; i < newRetIndices.size(); ++i)
      newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i]));

    auto newStoreOp =
        xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
                                 newStoreOperands, storeOp->getAttrs());
    xegpu::removeLayoutAttrs(newStoreOp);
    rewriter.eraseOp(storeOp);
    return success();
  }
};
/// Distribute a load_nd op feeding into gpu.yield op for the enclosing
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
/// The warp op will still contain the original op that will not be used by
/// the yield op (and should be cleaned up later). The yield op will
/// bypass the load's arguments. Only the loaded vector is distributed
/// according to lane layout and, tensor descriptor types is not
/// distributed. Appropriate cast ops are inserted if the distributed types does
/// not match expected xegpu SIMT types.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (vector<4x1xf32>) {
/// ...
/// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32, #layout0>
/// ->
/// vector<4x8xf32>
/// gpu.yield %ld
/// }
/// ```
/// To
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
/// ...
/// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0> ->
/// vector<4x8xf32>
/// gpu.yield %dead, %arg0
/// }
/// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
/// %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
/// %2 = vector.shape_cast %1: vector<4xf32> to vector<4x1xf32>
///
/// ```
struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;

  /// Matches a yielded load_nd that is the last op before the warp op
  /// terminator and re-creates it after the warp op with the distributed
  /// result type.
  ///
  /// All preconditions are verified before any IR is created or modified, so
  /// a failed match leaves the IR untouched.
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
      if (!isa<xegpu::LoadNdOp>(op))
        return false;
      // Make sure the same load op is the last operation in the warp op body.
      // This ensures that the load op is not sunk earlier, violating any
      // barrier synchronizations.
      gpu::YieldOp yield = warpOp.getTerminator();
      return yield->getPrevNode() == op;
    });
    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a xegpu::LoadNd op");

    auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
    // Chip information is required to decide if the layout requires transpose
    // effect.
    auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
    if (!uArch)
      return rewriter.notifyMatchFailure(
          loadOp, "xegpu::LoadNdOp require target attribute attached to "
                  "determine transpose "
                  "requirement");

    // Expecting offsets to be present.
    SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(loadOp,
                                         "the load op must have offsets");

    xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          loadOp, "the source tensor descriptor lacks layout attribute");

    // (xegpu-specific) distributed result type of the new load op. Computed up
    // front so that this failure exit happens before the warp op is rewritten.
    FailureOr<VectorType> loadNdDistValueTyOrFailure =
        xegpu::getDistributedVectorType(tensorDescTy);
    if (failed(loadNdDistValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          loadOp, "Failed to get distributed vector type for the load op");

    // Result type chosen by the warp op; read it before the warp op is
    // replaced.
    unsigned operandIdx = operand->getOperandNumber();
    VectorType distributedTypeByWarpOp =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());

    // No failure exits below this point; IR is modified from here on.
    // getAsValues is given the rewriter and may create ops for attribute
    // offsets, which is why it is only called after all checks.
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, loadOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });

    // Yield the tensor descriptor and the offsets.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> newYieldedValues = {loadOp.getTensorDesc()};
    SmallVector<Type> newYieldedTypes = {tensorDescTy};
    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);

    // Create a new load op outside the warp op with the distributed vector
    // type.
    rewriter.setInsertionPointAfter(newWarpOp);
    // Distributed tensor descriptor type does not contain layout info.
    xegpu::TensorDescType distributedTensorDescTy = tensorDescTy.dropLayouts();
    SmallVector<Value> newLoadOperands{
        resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]),
                             distributedTensorDescTy, rewriter)};
    // Collect offsets (yielded after the descriptor).
    for (size_t i = 1; i < newRetIndices.size(); ++i)
      newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i]));

    auto newLoadOp = xegpu::LoadNdOp::create(
        rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
        newLoadOperands, loadOp->getAttrs());
    xegpu::removeLayoutAttrs(newLoadOp);
    // Set the packed attribute if the layout requires it.
    newLoadOp.setPacked(xegpu::requirePacked(layout));
    // Set the transpose attribute if the layout requires it.
    if (xegpu::requireTranspose(layout, uArch))
      newLoadOp.setTranspose(
          DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));

    Value distributedVal = newWarpOp.getResult(operandIdx);
    // There can be a conflict between the vector type distributed by the
    // warp op and (xegpu-specific) distributed type supported by the load
    // op. Resolve these mismatches by inserting a cast.
    Value tyResolvedVal = resolveDistributedTy(
        newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
    rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
    return success();
  }
};
/// Distribute a dpas op feeding into gpu.yield op for the enclosing
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
/// The warp op will still contain the original op that will not be used by
/// the yield op (and should be cleaned up later). The yield op will
/// bypass the dpas's arguments. Appropriate cast ops are inserted if the
/// distributed types does not match expected xegpu SIMT types.
/// Example:
/// ```
/// #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
/// #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
/// #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
/// (vector<8x1xf32>) {
/// ...
/// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
/// vector<8x16xf32>
/// gpu.yield %dpas
/// }
/// ```
/// To
/// ```
/// %r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
/// vector<8x1xf16>, vector<16x1xf16>) {
/// ...
/// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
/// -> vector<8x16xf32>
/// gpu.yield %dead, %arg0, %arg1
/// }
/// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
/// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
/// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
/// vector<8xf32>
/// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
/// ```
struct DpasDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;

  /// Matches a yielded dpas op and re-creates it after the warp op on the
  /// distributed operands (lhs, rhs and the optional accumulator).
  ///
  /// All preconditions are verified before any IR is created or modified, so
  /// a failed match leaves the IR untouched.
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(warpOp,
                                         "warp result is not a xegpu::Dpas op");

    auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
    unsigned operandIdx = operand->getOperandNumber();

    // The null-tolerant cast is used because the results are checked for null
    // right below; a plain dyn_cast must not be given a null attribute.
    xegpu::LayoutAttr layoutA =
        dyn_cast_or_null<xegpu::LayoutAttr>(dpasOp.getLayoutAAttr());
    xegpu::LayoutAttr layoutB =
        dyn_cast_or_null<xegpu::LayoutAttr>(dpasOp.getLayoutBAttr());
    xegpu::LayoutAttr layoutOut =
        dyn_cast_or_null<xegpu::LayoutAttr>(dpasOp.getLayoutCdAttr());
    if (!layoutA || !layoutB || !layoutOut)
      return rewriter.notifyMatchFailure(
          dpasOp,
          "the xegpu::Dpas op lacks layout attribute for A, B or output");

    // Types the warp op will use when yielding A, B and the result.
    FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
    FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
    FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layoutOut,
                                               dpasOp.getResultType());
    if (failed(distLhsTypeByWarpOpOrFailure) ||
        failed(distRhsTypeByWarpOpOrFailure) ||
        failed(distResultTypeByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          dpasOp,
          "Failed to distribute the A, B or output types in xegpu::Dpas op");

    // (xegpu-specific) distributed types expected by the new dpas op. Computed
    // up front so that this failure exit happens before the warp op is
    // rewritten.
    FailureOr<VectorType> expectedDistLhsTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
    FailureOr<VectorType> expectedDistRhsTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
    FailureOr<VectorType> expectedDistResultTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
    if (failed(expectedDistLhsTyOrFailure) ||
        failed(expectedDistRhsTyOrFailure) ||
        failed(expectedDistResultTyOrFailure))
      return rewriter.notifyMatchFailure(
          dpasOp,
          "Failed to get distributed vector type for the dpas operands.");

    // No failure exits below this point; IR is modified from here on.
    llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
                                               dpasOp.getRhs()};
    llvm::SmallVector<Type, 3> newYieldTypes{
        distLhsTypeByWarpOpOrFailure.value(),
        distRhsTypeByWarpOpOrFailure.value()};
    // Dpas acc operand is optional.
    if (dpasOp.getAcc()) {
      newYieldValues.push_back(dpasOp.getAcc());
      newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
    }
    // Create a new warp op that additionally yields the dpas operands.
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);

    // Create a new dpas op outside the warp op.
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newDpasOperands;
    SmallVector<VectorType> newDpasOperandExpectedTypes;
    // Expected types are listed in the same order as the yielded values:
    // lhs, rhs, then the accumulator when present.
    newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
    newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
    VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
    if (dpasOp.getAcc())
      newDpasOperandExpectedTypes.push_back(distributedResultTy);
    // Resolve the types yielded by the warp op to the expected operand types.
    for (unsigned i = 0; i < newRetIndices.size(); i++) {
      newDpasOperands.push_back(
          resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
                               newDpasOperandExpectedTypes[i], rewriter));
    }
    auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
                                           distributedResultTy, newDpasOperands,
                                           dpasOp->getAttrs());
    xegpu::removeLayoutAttrs(newDpasOp);

    Value distributedVal = newWarpOp.getResult(operandIdx);
    // Resolve the output type back to what the warp op result uses.
    Value typeResolved =
        resolveDistributedTy(newDpasOp.getResult(),
                             distResultTypeByWarpOpOrFailure.value(), rewriter);
    rewriter.replaceAllUsesWith(distributedVal, typeResolved);
    return success();
  }
};
/// Distribute a prefetch_nd op at the end of enclosing
/// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed
/// through the warp op interface they would be propagated as returned values.
/// Tensor descriptor shape is not distributed because it is a uniform value
/// across all work items within the subgroup. Appropriate cast ops are inserted
/// if the distributed types does not match expected xegpu SIMT types.
///
/// Example:
///
/// ```
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
/// ...
/// xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0>
/// }
/// ```
/// To
/// ```
/// %r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (
/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
/// gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index,
/// index
/// }
/// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
/// xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;

  /// Matches a prefetch_nd that is the last op before the warp op terminator
  /// and re-creates it after the warp op on a layout-free tensor descriptor.
  ///
  /// All preconditions are verified before any IR is created or modified, so
  /// a failed match leaves the IR untouched.
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
    if (!prefetchOp)
      return failure();

    SmallVector<OpFoldResult> offsets = prefetchOp.getMixedOffsets();
    // PrefetchNdOp must have offsets.
    if (offsets.empty())
      return rewriter.notifyMatchFailure(prefetchOp,
                                         "the prefetch op must have offsets");

    xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          prefetchOp, "the source tensor descriptor lacks layout attribute");

    // No failure exits below this point; IR is modified from here on.
    // getAsValues is given the rewriter and may create ops for attribute
    // offsets, which is why it is only called after all checks.
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });

    // Yield the tensor descriptor and the offsets.
    SmallVector<Value> newYieldValues = {prefetchOp.getTensorDesc()};
    SmallVector<Type> newYieldTypes = {prefetchOp.getTensorDescType()};
    newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldTypes.append(offsetTypes.begin(), offsetTypes.end());
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);

    // Create a new prefetch op outside the warp op with updated tensor
    // descriptor type. Source tensor descriptor requires type resolution.
    xegpu::TensorDescType newTensorDescTy =
        prefetchOp.getTensorDescType().dropLayouts();
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
    // Collect offsets (yielded after the descriptor).
    for (size_t i = 1; i < newRetIndices.size(); ++i)
      newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));

    Operation *newPrefetchOp = xegpu::PrefetchNdOp::create(
        rewriter, newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands,
        prefetchOp->getAttrs());
    xegpu::removeLayoutAttrs(newPrefetchOp);
    rewriter.eraseOp(prefetchOp);
    return success();
  }
};
/// Sink a gpu::BarrierOp at the end of enclosing `gpu.warp_execute_on_lane_0`
/// region. This will simply move the barrier op outside of the warp op.
struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;

  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only a barrier sitting directly in front of the terminator is handled.
    gpu::YieldOp terminator = warpOp.getTerminator();
    auto innerBarrier =
        dyn_cast_or_null<gpu::BarrierOp>(terminator->getPrevNode());
    if (!innerBarrier)
      return failure();

    // Re-create the barrier right after the warp op, then remove the one
    // inside the region.
    rewriter.setInsertionPointAfter(warpOp);
    gpu::BarrierOp::create(rewriter, innerBarrier.getLoc(),
                           innerBarrier->getResultTypes(),
                           innerBarrier->getOperands(),
                           innerBarrier->getAttrs());
    rewriter.eraseOp(innerBarrier);
    return success();
  }
};
/// Distribute a scattered store op. The offsets argument is required.
/// Both offset and mask vectors must be 1D and have #subgroup_size elements.
/// The layouts are fixed and implicit: one offset/mask per lane.
/// The pass changes the offset/mask vector shapes to a
/// single-element vector, **it is assumed that their producer will also be
/// distributed**. The payload vector also has a fixed distribution:
///   no chunk size -> vector of one element.
///   chunk size    -> vector of the innermost dimension of the SG-payload.
/// Example 1 (no chunk size):
///    %mask = producer_op : vector<16xi1>
///    %offset = producer_op : vector<16xindex>
///    xegpu.store %payload, %src[%offset], %mask : vector<16xf16>,
///      memref<256xf16>, vector<16xindex>, vector<16xi1>
/// To
///    %mask = producer_op : vector<1xi1>
///    %offset = producer_op : vector<1xindex>
///    xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
///      memref<256xf16>, vector<1xindex>, vector<1xi1>
/// Example 2 (chunk size, same mask and offsets):
///    xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
///      vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
/// To
///    xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
///      vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
///
/// Note that the store distribution pattern also handles leading unit
/// dimensions in the payload, mask and offsets vectors. In this case the store
/// distribution will only change the dimensions corresponding to the SG
/// distribution and keep the leading unit dimensions unchanged.
/// For example, a store with payload vector<1x16xf16> with lane layout [1, 16]
/// will be distributed as vector<1x1xf16>. Shapecast ops are inserted for the
/// offset/mask/payload when necessary so that the distributed store is working
/// on 1D shape vector to match the HW capability.
struct StoreDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only a scattered store immediately preceding the terminator is handled.
    Operation *lastNode = warpOp.getTerminator()->getPrevNode();
    auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
    if (!storeScatterOp)
      return failure();
    auto offsets = storeScatterOp.getOffsets();
    if (!offsets || !isa<VectorType>(offsets.getType()))
      return rewriter.notifyMatchFailure(
          storeScatterOp, "Store op must have a vector of offsets argument");
    // The mask type is cast to VectorType below; reject a non-vector mask with
    // a match failure (as the scattered load pattern does) instead of hitting
    // the cast assertion.
    if (!isa<VectorType>(storeScatterOp.getMask().getType()))
      return rewriter.notifyMatchFailure(
          storeScatterOp, "Store op must have a vector mask argument");
    VectorType offsetsTy = cast<VectorType>(offsets.getType());
    VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
    VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
    // Leading unit dimensions support: without a chunk size the payload is
    // effectively 1D, with a chunk size it is effectively 2D. Every dimension
    // in front of those must be a unit dimension.
    int chunkSize = storeScatterOp.getChunkSize().value_or(1);
    int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
    for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
      if (storeVecTy.getShape()[i] != 1) {
        return rewriter.notifyMatchFailure(
            storeScatterOp, "Only unit dimensions allowed for the leading "
                            "dimensions of the store vector!");
      }
    }

    // Operand order of the store: payload (0), destination (1), offsets (2),
    // mask (3). Derive the lane-level types of the vector operands from their
    // layouts.
    auto layoutPayload =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(0));
    auto layoutOffsets =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2));
    auto layoutMask =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3));

    FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
    FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
    FailureOr<VectorType> distMaskByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
    if (failed(distStoreVecByWarpOpOrFailure) ||
        failed(distOffsetsByWarpOpOrFailure) ||
        failed(distMaskByWarpOpOrFailure)) {
      return rewriter.notifyMatchFailure(
          storeScatterOp,
          "Some vector operands have no layouts, using defaults instead.");
    }

    VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
    VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
    VectorType distMaskTy = distMaskByWarpOpOrFailure.value();

    // Yield all store operands from a new warp op; the vector operands are
    // yielded with their distributed types, the destination keeps its type.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands = storeScatterOp->getOperands();
    SmallVector<Type> operandTypesToYield = {
        distPayloadTy, operands[1].getType(), distOffsetsTy, distMaskTy};

    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypesToYield, newRetIndices);

    rewriter.setInsertionPointAfter(newWarpOp);

    // Distributed store payload type is always 1D without leading unit dims
    VectorType payloadTy1D = VectorType::get({distPayloadTy.getNumElements()},
                                             distPayloadTy.getElementType());

    VectorType distOffsetsTy1D = VectorType::get(
        {distOffsetsTy.getNumElements()}, distOffsetsTy.getElementType());
    VectorType distMaskTy1D = VectorType::get({distMaskTy.getNumElements()},
                                              distMaskTy.getElementType());

    // Resolve distributed types to 1D for SIMT execution
    Value distPayloadVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]), payloadTy1D, rewriter);
    Value distOffsetVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[2]), distOffsetsTy1D, rewriter);
    Value distMaskVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[3]), distMaskTy1D, rewriter);

    SmallVector<Value> newStoreScatterOpOperands = {
        distPayloadVal, newWarpOp.getResult(newRetIndices[1]), distOffsetVal,
        distMaskVal};

    // Create the lane-level store with the original attributes, drop its
    // layout attributes, and remove the subgroup-level store.
    xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
        rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
        storeScatterOp->getAttrs());
    xegpu::removeLayoutAttrs(newOp);
    rewriter.eraseOp(storeScatterOp);
    return success();
  }
};
/// Computes the coordinates a lane should use for a distributed matrix
/// load/store: the coordinates derived by `layout` for `laneId` over
/// `payloadShape` are combined with `origOffsets` through
/// xegpu::addWithRightAligned. Returns an empty vector when the layout fails
/// to compute the distributed coordinates.
static SmallVector<Value> computeDistributedCoordinatesForMatrixOp(
    PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout,
    Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) {
  auto laneCoordsOrFailure =
      layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
  if (failed(laneCoordsOrFailure))
    return {};

  // Exactly one coordinate set is expected here.
  auto &coordSets = laneCoordsOrFailure.value();
  assert(coordSets.size() == 1 &&
         "Expected one set of distributed offsets");

  // Add the lane coordinates to the original offsets and turn the folded
  // results back into values.
  SmallVector<OpFoldResult> shiftedCoords = xegpu::addWithRightAligned(
      rewriter, loc, getAsOpFoldResult(coordSets[0]),
      getAsOpFoldResult(origOffsets));
  SmallVector<Value> distributedCoords =
      llvm::map_to_vector(shiftedCoords, llvm::CastTo<Value>);
  return distributedCoords;
}
/// Pattern for distributing xegpu::LoadMatrixOp.
/// The pattern matches when the op right before the warp terminator is a
/// LoadMatrixOp whose result is yielded by the warp op. The memory descriptor
/// and the offsets are yielded from a new warp op, and a new LoadMatrixOp
/// producing the lane-level payload type (derived from the op's layout
/// attribute) is created after it. Unless the op carries the subgroup block IO
/// attribute, the offsets are adjusted with per-lane coordinates computed from
/// the layout and the lane id.
struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode);
    if (!matrixOp)
      return failure();

    // The load result must be yielded by the warp op.
    OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
      return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op;
    });
    if (!producedByLastLoad)
      return rewriter.notifyMatchFailure(
          warpOp, "The last op is not xegpu::LoadMatrixOp");
    const int operandIdx = producedByLastLoad->getOperandNumber();

    VectorType sgPayloadTy =
        dyn_cast<VectorType>(matrixOp.getResult().getType());
    VectorType warpResultTy =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    if (!sgPayloadTy)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix op payload must be a vector type");

    auto loc = matrixOp.getLoc();
    auto offsets = matrixOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(matrixOp,
                                         "the load op must have offsets");

    auto layout = matrixOp.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix operation lacks layout attribute");

    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
    if (failed(distPayloadByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          matrixOp, "Failed to distribute matrix op payload based on layout.");

    // All match checks have passed. Only now convert the mixed offsets to
    // values: the conversion takes the rewriter and may materialize ops for
    // static offsets, and a pattern must not change the IR before returning
    // failure.
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);

    // Yield the memory descriptor followed by the offsets, with unchanged
    // types, from a new warp op.
    SmallVector<Value> operands = {matrixOp.getMemDesc()};
    const unsigned offsetsStartIdx = operands.size();
    operands.append(offsetsAsValues);

    SmallVector<Type> operandTypes =
        llvm::map_to_vector(operands, [](Value v) { return v.getType(); });

    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);
    SmallVector<Value> newOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    // Every offset is passed as a value to the new op, so all constant offset
    // entries are marked dynamic.
    SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
                                         ShapedType::kDynamic);
    DenseI64ArrayAttr newConstOffsetsAttr =
        rewriter.getDenseI64ArrayAttr(newConstOffsets);
    ValueRange currentOffsets =
        ValueRange(newOperands).drop_front(offsetsStartIdx);

    SmallVector<Value> newCoords = currentOffsets;
    rewriter.setInsertionPointAfter(newWarpOp);

    // Without the subgroup block IO attribute the offsets are replaced by
    // coordinates computed per lane.
    if (!matrixOp.getSubgroupBlockIoAttr()) {
      newCoords = computeDistributedCoordinatesForMatrixOp(
          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
          currentOffsets);
    }
    xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create(
        rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure,
        newOperands[0], ValueRange(newCoords), newConstOffsetsAttr,
        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
    // Resolve the output type and replace all uses.
    rewriter.replaceAllUsesWith(
        newWarpOp.getResult(operandIdx),
        resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter));
    return success();
  }
};
/// Pattern for distributing xegpu::StoreMatrixOp.
/// The pattern matches when the op right before the warp terminator is a
/// StoreMatrixOp. The payload (with its lane-level type derived from the op's
/// layout attribute), the memory descriptor and the offsets are yielded from a
/// new warp op, and a new StoreMatrixOp is created after it. Unless the op
/// carries the subgroup block IO attribute, the offsets are adjusted with
/// per-lane coordinates computed from the layout and the lane id.
struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode);
    if (!matrixOp)
      return failure();

    VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType());
    if (!sgPayloadTy)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix op payload must be a vector type");

    auto loc = matrixOp.getLoc();
    auto offsets = matrixOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(matrixOp,
                                         "the store op must have offsets");

    auto layout = matrixOp.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix operation lacks layout attribute");

    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
    if (failed(distPayloadByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          matrixOp, "Failed to distribute matrix op payload based on layout.");

    // All match checks have passed. Only now convert the mixed offsets to
    // values: the conversion takes the rewriter and may materialize ops for
    // static offsets, and a pattern must not change the IR before returning
    // failure.
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);

    // Yield the payload, the memory descriptor and the offsets from a new warp
    // op. Only the payload changes type (to its distributed type).
    SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()};
    const unsigned offsetsStartIdx = operands.size();
    operands.append(offsetsAsValues);

    SmallVector<Type> operandTypes =
        llvm::map_to_vector(operands, [](Value v) { return v.getType(); });
    operandTypes[0] = *distPayloadByWarpOpOrFailure;

    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);
    SmallVector<Value> newOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    // Every offset is passed as a value to the new op, so all constant offset
    // entries are marked dynamic.
    SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
                                         ShapedType::kDynamic);
    DenseI64ArrayAttr newConstOffsetsAttr =
        rewriter.getDenseI64ArrayAttr(newConstOffsets);
    ValueRange currentOffsets =
        ValueRange(newOperands).drop_front(offsetsStartIdx);

    SmallVector<Value> newCoords = currentOffsets;
    rewriter.setInsertionPointAfter(newWarpOp);

    // Without the subgroup block IO attribute the offsets are replaced by
    // coordinates computed per lane.
    if (!matrixOp.getSubgroupBlockIoAttr()) {
      newCoords = computeDistributedCoordinatesForMatrixOp(
          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
          currentOffsets);
    }

    xegpu::StoreMatrixOp::create(
        rewriter, loc, TypeRange{}, newOperands[0], newOperands[1],
        ValueRange(newCoords), newConstOffsetsAttr,
        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
    rewriter.eraseOp(matrixOp);
    return success();
  }
};
/// Distribute a scattered load op. The logic and requirements mirror the
/// scattered store distribution. The warp op's payload (result) vector is
/// expected to be distributed by the consumer of the load result.
/// Example 1 (no chunk size):
///    %mask = producer_op : vector<16xi1>
///    %offset = producer_op : vector<16xindex>
///    %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
///      vector<16xindex>, vector<16xi1> -> vector<16xf16>
/// To
///    %mask = producer_op : vector<1xi1>
///    %offset = producer_op : vector<1xindex>
///    %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
///      vector<1xindex>, vector<1xi1> -> vector<1xf16>
/// Example 2 (chunk size, same mask and offsets):
///    %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
///      memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
/// To
///    %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
///      memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
///
/// Leading unit dimensions in the payload, mask, and offsets vectors are also
/// supported. Only the dimensions that take part in the SG distribution are
/// changed; leading unit dimensions are kept. For example, a load with result
/// type vector<1x16xf16> and lane layout [1, 16] is distributed as result type
/// vector<1x1xf16>. Shape casts are inserted for the offsets/mask/payload when
/// necessary so that the distributed load itself works on 1D vectors, matching
/// the HW capability.
struct LoadDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // The yielded value must come from the *last* scattered load in the
    // region, so that the load is not sunk before barriers (memory order is
    // maintained).
    Operation *preYieldOp = warpOp.getTerminator()->getPrevNode();
    OpOperand *yieldedLoad = getWarpResult(warpOp, [&](Operation *candidate) {
      return candidate == preYieldOp && isa<xegpu::LoadGatherOp>(candidate);
    });
    if (!yieldedLoad)
      return rewriter.notifyMatchFailure(
          warpOp, "The last op is not xegpu::LoadGatherOp");

    auto gatherOp = yieldedLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
    auto gatherOffsets = gatherOp.getOffsets();
    // Offsets must be present, and both offsets and mask must be vectors.
    const bool hasVectorOffsetsAndMask =
        gatherOffsets && isa<VectorType>(gatherOffsets.getType()) &&
        isa<VectorType>(gatherOp.getMask().getType());
    if (!hasVectorOffsetsAndMask)
      return rewriter.notifyMatchFailure(
          gatherOp,
          "Load op must have a vector arguments for offsets and mask");

    auto offsetsVecTy = cast<VectorType>(gatherOffsets.getType());
    auto maskVecTy = cast<VectorType>(gatherOp.getMask().getType());
    auto sgResultTy = cast<VectorType>(gatherOp.getResult().getType());

    // Leading unit dimensions support: the result is effectively 1D without a
    // chunk size and 2D with one; all dimensions in front must be unit dims.
    const int chunkSize = gatherOp.getChunkSize().value_or(1);
    const int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
    const int64_t numLeadingDims = sgResultTy.getRank() - effectiveVecRank;
    for (int64_t dim = 0; dim < numLeadingDims; ++dim) {
      if (sgResultTy.getShape()[dim] == 1)
        continue;
      return rewriter.notifyMatchFailure(
          gatherOp, "Only unit dimensions allowed for the leading "
                    "dimensions of the load vector!");
    }

    // Operand order of the load: source (0), offsets (1), mask (2). Derive
    // the lane-level types of offsets and mask from their layouts.
    FailureOr<VectorType> laneOffsetsTyOrFailure =
        getDistVecTypeBasedOnLaneLayout(
            xegpu::getTemporaryLayout(gatherOp->getOpOperand(1)),
            offsetsVecTy);
    FailureOr<VectorType> laneMaskTyOrFailure = getDistVecTypeBasedOnLaneLayout(
        xegpu::getTemporaryLayout(gatherOp->getOpOperand(2)), maskVecTy);
    if (failed(laneOffsetsTyOrFailure) || failed(laneMaskTyOrFailure)) {
      return rewriter.notifyMatchFailure(
          gatherOp,
          "Some vector operands have no layouts, using defaults instead.");
    }
    VectorType laneOffsetsTy = laneOffsetsTyOrFailure.value();
    VectorType laneMaskTy = laneMaskTyOrFailure.value();

    // The lane-level result type is dictated by the existing warp op result.
    const unsigned yieldIdx = yieldedLoad->getOperandNumber();
    auto laneResultTy = cast<VectorType>(warpOp.getResult(yieldIdx).getType());

    // Yield all load operands from a new warp op; the source keeps its type.
    SmallVector<Value> yieldedOperands = gatherOp->getOperands();
    SmallVector<Type> yieldedTypes = {yieldedOperands[0].getType(),
                                      laneOffsetsTy, laneMaskTy};
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, yieldedOperands, yieldedTypes, newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);

    // The distributed load always works on 1D vectors: flatten each
    // lane-level type to a single dimension holding all of its elements.
    auto flattenTo1D = [](VectorType ty) {
      return VectorType::get({ty.getNumElements()}, ty.getElementType());
    };
    VectorType flatResultTy = flattenTo1D(laneResultTy);
    VectorType flatOffsetsTy = flattenTo1D(laneOffsetsTy);
    VectorType flatMaskTy = flattenTo1D(laneMaskTy);

    Value laneOffsets = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[1]), flatOffsetsTy, rewriter);
    Value laneMask = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[2]), flatMaskTy, rewriter);

    SmallVector<Value> laneLoadOperands = {
        newWarpOp.getResult(newRetIndices[0]), laneOffsets, laneMask};
    xegpu::LoadGatherOp laneLoad = xegpu::LoadGatherOp::create(
        rewriter, newWarpOp.getLoc(), flatResultTy, laneLoadOperands,
        gatherOp->getAttrs());
    xegpu::removeLayoutAttrs(laneLoad);

    // Resolve the 1D load result back to the type expected by the users of
    // the warp op result, then replace all uses.
    Value resolvedResult =
        resolveDistributedTy(laneLoad.getResult(), laneResultTy, rewriter);
    rewriter.replaceAllUsesWith(newWarpOp.getResult(yieldIdx), resolvedResult);
    return success();
  }
};
// Sink SG-uniform ops. An op is uniform if none
// of its operands/results has a distribution layout attribute.
// Non-uniform vectors are handled by dedicated patterns.
// This pattern must have a higher priority than vector dialect distribution
// patterns, because a distributable shape may be logically intended as
// uniform (i.e., no layout), so we want to omit its distribution.
//
// The op right before the terminator is cloned after a new warp op that
// yields its operands unchanged. If the op has results, the warp result fed
// by it is replaced with the corresponding result of the clone; otherwise the
// original op is erased.
struct SinkUniformOps final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Take the last op
    Operation *warpRegionPreYieldOp = warpOp.getTerminator()->getPrevNode();
    // Any ops with nested regions must be handled carefully in dedicated
    // patterns.
    if (!warpRegionPreYieldOp || warpRegionPreYieldOp->getNumRegions())
      return failure();
    int operandIdx = -1;
    // Index of the op result that feeds the matched yield operand. The yielded
    // value is not necessarily result #0 when the op has several results.
    unsigned resultIdx = 0;
    if (warpRegionPreYieldOp->getNumResults()) {
      OpOperand *operand = getWarpResult(
          warpOp, [&](Operation *op) { return warpRegionPreYieldOp == op; });
      if (!operand)
        return failure();
      operandIdx = operand->getOperandNumber();
      // The predicate above guarantees the yielded value is defined by the
      // last op, hence it is an OpResult.
      resultIdx = cast<OpResult>(operand->get()).getResultNumber();
      // A uniform result is yielded with an unchanged type.
      if (warpRegionPreYieldOp->getResult(resultIdx).getType() !=
          warpOp.getResult(operandIdx).getType())
        return rewriter.notifyMatchFailure(warpOp,
                                           "The op result is not uniform.");
    }

    // The op must have no layout-based operands or results.
    bool uniformValuesOnly =
        llvm::all_of(warpRegionPreYieldOp->getResults(), [](Value v) {
          return !xegpu::getDistributeLayoutAttr(v);
        });
    uniformValuesOnly &=
        llvm::all_of(warpRegionPreYieldOp->getOpOperands(), [](OpOperand &opr) {
          return !xegpu::getDistributeLayoutAttr(opr);
        });
    if (!uniformValuesOnly)
      return rewriter.notifyMatchFailure(warpOp,
                                         "Some values are not uniform.");

    // Yield every operand of the op, with its original type, from a new warp
    // op.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands =
        llvm::to_vector_of<Value>(warpRegionPreYieldOp->getOperands());
    SmallVector<Type> operandTypes =
        llvm::to_vector_of<Type>(warpRegionPreYieldOp->getOperandTypes());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);

    // Clone the op after the new warp op, remapping its operands to the newly
    // yielded warp results.
    rewriter.setInsertionPointAfter(newWarpOp);
    IRMapping operandMapper;
    for (auto [oldOperandIdx, newOperandIdx] : llvm::enumerate(newRetIndices))
      operandMapper.map(warpRegionPreYieldOp->getOperand(oldOperandIdx),
                        newWarpOp->getResult(newOperandIdx));
    Operation *clonedOp = rewriter.clone(*warpRegionPreYieldOp, operandMapper);
    if (!clonedOp->getNumResults())
      rewriter.eraseOp(warpRegionPreYieldOp);
    else {
      assert(operandIdx != -1 && "Expected a warp result for the operation");
      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx),
                                  clonedOp->getResult(resultIdx));
    }
    return success();
  }
};
/// Helper to rewrite a 2D VectorMultiReductionOp into a sequence of 1D
/// VectorReductionOps. We also insert layouts for the newly created ops.
///
/// \param src 2D source vector to reduce.
/// \param acc accumulator vector; element `i` seeds the reduction of slice
///        `i`.
/// \param kind combining kind used by every 1D reduction.
/// \param reductionDim dimension of `src` that is reduced. For dimension 1
///        each row is a slice; otherwise each column is a slice.
/// \param loc location used for all created ops.
/// \param rewriter rewriter used to create the ops at its current insertion
///        point.
/// \returns a vector of the accumulator's type holding one reduced scalar per
///          slice.
static Value lowerToVectorReductions(TypedValue<VectorType> src,
                                     TypedValue<VectorType> acc,
                                     vector::CombiningKind kind,
                                     int64_t reductionDim, Location loc,
                                     PatternRewriter &rewriter) {
  // Expecting a 2D source vector.
  assert(src.getType().getRank() == 2 && "expected a 2D source vector");
  VectorType sourceType = src.getType();
  int64_t sourceH = sourceType.getShape()[0];
  int64_t sourceW = sourceType.getShape()[1];
  // Keep the slice count in int64_t, the type of the shape extents.
  int64_t nSlices = (reductionDim == 0) ? sourceW : sourceH;
  // The layouts of the source and the accumulator do not change while the
  // slices are processed, so look them up once.
  auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
  auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
  // Create a constant vector to hold the result of the reduction.
  TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
  Value reductionResult = arith::ConstantOp::create(
      rewriter, loc, acc.getType(),
      DenseElementsAttr::get(acc.getType(), zeroAttr));
  // Reduction result should have the same layout as the accumulator.
  xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
  // For each slice of the source, extract the slice vector, do a reduction
  // and, insert the reduced value back to the result vector.
  for (int64_t i = 0; i < nSlices; ++i) {
    SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
    if (reductionDim == 1) {
      sliceOffsets = {i, 0};
      sliceSizes = {1, sourceW};
    } else {
      sliceOffsets = {0, i};
      sliceSizes = {sourceH, 1};
    }
    vector::ExtractStridedSliceOp extractOp =
        vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                              sliceSizes, {1, 1});

    int64_t nSliceElements = extractOp.getResult().getType().getNumElements();

    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
        rewriter, loc,
        VectorType::get({nSliceElements}, sourceType.getElementType()),
        extractOp.getResult());

    // Shape cast is currently handled in xegpu side. So layouts must be
    // retained during lowering. Shape cast output has the same layout as the
    // accumulator. Shape cast source has the same layout as the original
    // reduction source.
    // TODO: other ops generated here may also need layout attributes.
    xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
    xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
    // Extract and reduction results in scalars, so no result layout is needed.
    Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
    Value reduction = vector::ReductionOp::create(
        rewriter, loc, kind, slice.getResult(), accExtract);
    reductionResult =
        vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
  }
  return reductionResult;
}
/// This pattern distributes the `vector.multi_reduction` operation across
/// lanes in a warp. Currently only 2D to 1D reductions are supported. Given
/// layouts for the source and accumulator vectors,
/// * If the reduction dimension is distributed across lanes, the reduction is
///   non-lane-local and the reduction is done using warp shuffles. Here we
///   simply rewrite the MultiDimReductionOp to a sequence of ReductionOps in
///   the warp op body.
/// * If the reduction dimension is not distributed across lanes, the reduction
///   is lane-local. In this case, we yield the source and accumulator vectors
///   from the warp op and perform the lane-local reduction outside the warp op
///   using a sequence of ReductionOps.
/// Example 1 (Reduction is lane-local):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
///   %0 = "some_def"() : () -> (vector<16x32xf32>)
///   %acc = "some_def"() : () -> (vector<32xf32>)
///   %1 = vector.multi_reduction <add>, %0, %acc [0] : vector<16x32xf32> to
///        vector<32xf32>
///   gpu.yield %1 : vector<32xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<16x1xf32>,
/// vector<1xf32>) {
///   %0 = "some_def"() : () -> (vector<16x32xf32>)
///   %acc = "some_def"() : () -> (vector<32xf32>)
///   gpu.yield %0, %acc : vector<16x32xf32>, vector<32xf32>
/// }
/// %c = arith.constant dense<0.0> : vector<1xf32>
/// %1 = vector.shape_cast %r#0 : vector<16x1xf32> to vector<16xf32>
/// %2 = vector.reduction <add>, %1, %r#1 : vector<16xf32> to f32
/// %3 = vector.insert %2, %c[0] : f32 into vector<1xf32>
/// ```
/// Example 2 (Reduction is non-lane-local):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
///   %0 = "some_def"() : () -> (vector<2x32xf32>)
///   %acc = "some_def"() : () -> (vector<2xf32>)
///   %1 = vector.multi_reduction <add>, %0, %acc [1] : vector<2x32xf32> to
///        vector<2xf32>
///   gpu.yield %1 : vector<2xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
///   %0 = "some_def"() : () -> (vector<2x32xf32>)
///   %acc = "some_def"() : () -> (vector<2xf32>)
///   %1 = arith.constant dense<0.0> : vector<2xf32>
///   %2 = vector.extract %0[0] : vector<32xf32> from <vector<2x32xf32>>
///   %3 = ("warp.reduction %2") : f32
///   %4 = vector.insert %3, %1[0] : f32 into vector<2xf32>
///   ... repeat for row 1, producing the final vector %res
///   gpu.yield %res : vector<2xf32>
/// }
/// ```
struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yieldOperand =
        getWarpResult(warpOp, llvm::IsaPred<vector::MultiDimReductionOp>);
    if (!yieldOperand)
      return failure();
    auto reductionOp =
        cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
    const unsigned yieldIdx = yieldOperand->getOperandNumber();

    // Only 2D vectors are supported.
    VectorType srcTy = reductionOp.getSourceVectorType();
    if (srcTy.getRank() != 2)
      return rewriter.notifyMatchFailure(warpOp,
                                         "Only 2D reductions are supported.");

    // Only 1 reduction dimension supported. This also ensures that the result
    // is vector type.
    ArrayRef<int64_t> reductionDims = reductionOp.getReductionDims();
    if (reductionDims.size() != 1)
      return rewriter.notifyMatchFailure(
          warpOp, "Only 1 reduction dimension is supported.");
    const int64_t reductionDim = reductionDims.front();

    auto laneResultTy = cast<VectorType>(warpOp.getResult(yieldIdx).getType());
    auto sgResultTy = cast<VectorType>(reductionOp.getType());

    // Derive the lane-level source type from the layout of the source operand.
    xegpu::DistributeLayoutAttr srcLayout =
        xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));
    FailureOr<VectorType> laneSrcTyOrFailure =
        getDistVecTypeBasedOnLaneLayout(srcLayout, srcTy);
    if (failed(laneSrcTyOrFailure))
      return rewriter.notifyMatchFailure(
          warpOp, "Failed to distribute the source vector type.");
    VectorType laneSrcTy = laneSrcTyOrFailure.value();

    // Only single dimension distribution is supported. A dimension counts as
    // distributed when its lane-level extent differs from the SG-level one.
    ArrayRef<int64_t> sgShape = srcTy.getShape();
    ArrayRef<int64_t> laneShape = laneSrcTy.getShape();
    const bool dim0Distributed = laneShape[0] != sgShape[0];
    const bool dim1Distributed = laneShape[1] != sgShape[1];
    if (dim0Distributed && dim1Distributed)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting source to be distributed in a single dimension.");
    if (!dim0Distributed && !dim1Distributed)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting a distributed source vector.");
    const int64_t srcDistDim = dim0Distributed ? 0 : 1;

    const bool resultDistributed =
        laneResultTy.getNumElements() < sgResultTy.getNumElements();
    // If the lane owns all the data required for reduction (i.e. reduction is
    // fully parallel across lanes), then each lane owns part of the result
    // (i.e. result is distributed). If the reduction requires cross-lane
    // shuffling, then the result is shared among all lanes (broadcasted).
    // Therefore we expect the following cases:
    //
    // | Source vector        | Reduction dim  | Result vector  |
    // |----------------------|----------------|----------------|
    // |  dim-0 distributed   |       0        | broadcasted    |
    // |  dim-0 distributed   |       1        | distributed    |
    // |  dim-1 distributed   |       0        | distributed    |
    // |  dim-1 distributed   |       1        | broadcasted    |
    const bool isReductionLaneLocal =
        (srcDistDim == 0) ? (reductionDim == 1) : (reductionDim == 0);
    if (isReductionLaneLocal != resultDistributed)
      return rewriter.notifyMatchFailure(
          warpOp,
          isReductionLaneLocal
              ? "Expecting a distributed result for lane-local reduction."
              : "Expecting a broadcasted result for non-lane-local reduction.");

    if (!isReductionLaneLocal) {
      // For non-lane-local case, we simply rewrite the MultiReductionOp in
      // terms of multiple ReductionOps inside the warp op body. Actual
      // distribution is done by the WarpOpReduction pattern.
      rewriter.setInsertionPointAfter(reductionOp);
      Value inWarpResult = lowerToVectorReductions(
          cast<TypedValue<VectorType>>(reductionOp.getSource()),
          cast<TypedValue<VectorType>>(reductionOp.getAcc()),
          reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
      // Redirect the users of the multi-reduction to the lowered result.
      rewriter.replaceAllUsesWith(reductionOp.getResult(), inWarpResult);
      return success();
    }

    // Lane-local reduction: the reduction result is fully distributed. Yield
    // the source and acc vectors from the WarpOp and reduce outside of it.
    SmallVector<size_t> newRetIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
        {laneSrcTy, laneResultTy}, newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);
    Value laneLocalResult = lowerToVectorReductions(
        cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
        cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
        reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
    // Replace the warp op result with the final result.
    rewriter.replaceAllUsesWith(newWarpOp.getResult(yieldIdx), laneLocalResult);
    return success();
  }
};
/// This pattern distributes the `vector.broadcast` operation across lanes in a
/// warp. The pattern supports three use cases:
///
/// 1) Broadcast a low-rank vector to a high-rank vector: The low-rank input
/// vector must have a slice layout of the result. If the distributed source
/// and target vector types are identical, this lowers to a no-op; otherwise,
/// it remains a broadcast but operates on distributed vectors.
///
/// 2) Broadcast a same-rank vector with identical layouts for source and
/// target: The source vector must have unit dimensions, and lane_data must be
/// unit size for those unit dims. This always lowers to a no-op.
///
/// 3) Broadcast a scalar with no layout: This always lowers to a broadcast from
/// scalar to distributed result type.
///
/// Example 1 (lowering to a broadcast with distributed types):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
/// %0 = "some_def"() {layout_result_0 =
/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
/// dims = [0]> } : () -> (vector<32xf32>)
/// %2 = vector.broadcast %0 {layout_result_0 =
/// #xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>}
/// : vector<32xf32> to vector<8x32xf32>
/// gpu.yield %2 : vector<8x32xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
/// %0 = "some_def"() {layout_result_0 =
/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
/// dims = [0]> } : () -> (vector<32xf32>)
/// gpu.yield %0 : vector<32xf32>
/// }
/// %2 = vector.broadcast %r#0 : vector<1xf32> to vector<8x1xf32>
/// ```
///
/// Example 2 (no-op):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x32xf32>) {
/// %0 = "some_def"() {layout_result_0 =
/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
/// dims = [1]> } : () -> (vector<8xf32>)
/// %1 = vector.shape_cast %0
/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
/// 1]>}: vector<8xf32> to vector<8x1xf32>
/// %2 = vector.broadcast %1
/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
/// 1]>}: vector<8x1xf32> to vector<8x32xf32>
/// gpu.yield %1 : vector<8x32xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
/// %0 = "some_def"() {layout_result_0 =
/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
/// dims = [1]> } : () -> (vector<8xf32>)
/// %1 = vector.shape_cast %0
/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
/// 1]>}: vector<8xf32> to vector<8x1xf32>
/// gpu.yield %1 : vector<8x1xf32>
/// }
/// // The broadcast is implicit through layout transformation (no-op)
/// "some_use"(%r#0)
/// ```
struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
using gpu::WarpDistributionPattern::WarpDistributionPattern;
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
OpOperand *yieldOperand =
getWarpResult(warpOp, llvm::IsaPred<vector::BroadcastOp>);
if (!yieldOperand)
return failure();
auto broadcastOp =
cast<vector::BroadcastOp>(yieldOperand->get().getDefiningOp());
unsigned operandIdx = yieldOperand->getOperandNumber();
VectorType sourceType = dyn_cast<VectorType>(broadcastOp.getSourceType());
VectorType destType =
dyn_cast<VectorType>(broadcastOp.getResult().getType());
xegpu::DistributeLayoutAttr sourceLayout =
xegpu::getTemporaryLayout(broadcastOp->getOpOperand(0));
xegpu::DistributeLayoutAttr resultLayout =
xegpu::getTemporaryLayout(dyn_cast<OpResult>(broadcastOp.getResult()));
FailureOr<VectorType> sourceDistType;
Type sourceElemOrDistType;
if (sourceType) {
// Case 1 and 2: source is a vector type.
int64_t rankDiff = destType.getRank() - sourceType.getRank();
if (rankDiff > 0) {
// Case 1: source is lower-rank than result.
bool isSliceOf = sourceLayout.isSliceOf(resultLayout);
if (!isSliceOf)
return rewriter.notifyMatchFailure(
warpOp,
"Broadcast input layout must be a slice of result layout.");
}
// case 2: source and result have same rank
if (rankDiff == 0) {
auto broadcastUnitDimsSet = broadcastOp.computeBroadcastedUnitDims();
SmallVector<int64_t> broadcastUnitDims(broadcastUnitDimsSet.begin(),
broadcastUnitDimsSet.end());
bool isEqualTo = sourceLayout.isEqualTo(resultLayout);
if (!isEqualTo)
return rewriter.notifyMatchFailure(
warpOp, "For same-rank broadcast, source must be identical to "
"adjusted result layouts with unit dims.");
resultLayout = resultLayout.setUnitDimData(broadcastUnitDims);
sourceLayout = sourceLayout.setUnitDimLayout(broadcastUnitDims);
}
sourceDistType =
getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
if (failed(sourceDistType)) {
return rewriter.notifyMatchFailure(
warpOp, "Failed to distribute the source vector type.");
}
sourceElemOrDistType = sourceDistType.value();
} else {
// Case 3: source is a scalar type.
if (sourceLayout) {
return rewriter.notifyMatchFailure(
warpOp, "Broadcast from scalar must not have a layout attribute.");
}
sourceElemOrDistType = broadcastOp.getSourceType();
}
FailureOr<VectorType> destDistType =
getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
if (failed(destDistType)) {
return rewriter.notifyMatchFailure(
warpOp, "Failed to distribute the dest vector type.");
}
SmallVector<size_t> newRetIndices;
auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, {broadcastOp.getSource()}, sourceElemOrDistType,
newRetIndices);
Value distributedSource = newWarpOp.getResult(newRetIndices[0]);
Value newBroadcast = distributedSource;
if (sourceElemOrDistType != destDistType.value()) {
rewriter.setInsertionPointAfter(newWarpOp);
newBroadcast =
vector::BroadcastOp::create(rewriter, newWarpOp.getLoc(),
destDistType.value(), distributedSource);
}
rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newBroadcast);
return success();
}
};
/// Distributes a `vector.shape_cast` whose result is yielded by the enclosing
/// `gpu.warp_execute_on_lane_0` op. The cast source is yielded from a new warp
/// op with its distributed type (derived from the source layout), and an
/// equivalent shape_cast producing the already-distributed warp result type is
/// created after the warp op.
struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yielded =
        getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>);
    if (!yielded)
      return failure();

    auto castOp = cast<vector::ShapeCastOp>(yielded->get().getDefiningOp());
    unsigned resultIdx = yielded->getOperandNumber();
    // The warp op result type is already the distributed result type.
    auto distResultType =
        cast<VectorType>(warpOp.getResult(resultIdx).getType());

    // Both sides of the cast must carry a distribution layout.
    xegpu::DistributeLayoutAttr srcLayout =
        xegpu::getTemporaryLayout(castOp->getOpOperand(0));
    xegpu::DistributeLayoutAttr dstLayout =
        xegpu::getTemporaryLayout(dyn_cast<OpResult>(castOp.getResult()));
    if (!(srcLayout && dstLayout))
      return rewriter.notifyMatchFailure(
          warpOp,
          "the source or result of shape_cast op lacks distribution layout");

    FailureOr<VectorType> maybeDistSrcType =
        getDistVecTypeBasedOnLaneLayout(srcLayout,
                                        castOp.getSourceVectorType());
    if (failed(maybeDistSrcType))
      return rewriter.notifyMatchFailure(
          warpOp, "failed to get distributed vector type for source");
    VectorType distSrcType = *maybeDistSrcType;

    // Re-create the warp op so that it additionally yields the cast source.
    SmallVector<size_t> appendedIndices;
    auto updatedWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {castOp.getSource()}, {distSrcType},
        appendedIndices);

    // Materialize the shape_cast on the distributed value after the warp op.
    rewriter.setInsertionPointAfter(updatedWarpOp);
    Value distSource = updatedWarpOp.getResult(appendedIndices[0]);
    Value distCast = vector::ShapeCastOp::create(rewriter, castOp.getLoc(),
                                                 distResultType, distSource);
    rewriter.replaceAllUsesWith(updatedWarpOp.getResult(resultIdx), distCast);
    return success();
  }
};
/// Distribute a `vector.extract_strided_slice` op feeding into yield op of an
/// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers
/// advanced cases where the distributed dimension is partially extracted and
/// currently not supported by the generic vector distribution patterns.
///
/// The source of the extract is yielded from a new warp op and an equivalent
/// extract with adjusted offsets/sizes is created after the warp op. If the
/// result is distributed, the pattern requires: distribution along exactly one
/// dimension, a source layout with unit lane data, and a source size and
/// offset along the distributed dimension that are multiples of the subgroup
/// size.
struct VectorExtractStridedSliceDistribution
    : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand =
        getWarpResult(warpOp, llvm::IsaPred<vector::ExtractStridedSliceOp>);
    if (!operand)
      return failure();
    auto extractOp =
        cast<vector::ExtractStridedSliceOp>(operand->get().getDefiningOp());
    unsigned operandIdx = operand->getOperandNumber();
    auto distributedType =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    // Find the distributed dimensions.
    auto extractResultType = cast<VectorType>(operand->get().getType());
    auto distributedDims =
        getDistributedDims(extractResultType, distributedType);
    // Collect updated source type, sizes and offsets. They may be adjusted
    // later if the data is distributed to lanes (as opposed to being owned by
    // all lanes uniformly).
    VectorType sourceType = extractOp.getSourceVectorType();
    VectorType updatedSourceType = sourceType;
    SmallVector<Attribute> updatedSizes = llvm::map_to_vector(
        extractOp.getSizes(), [](Attribute attr) { return attr; });
    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
        extractOp.getOffsets(), [](Attribute attr) { return attr; });
    SmallVector<Attribute> updatedStrides = llvm::map_to_vector(
        extractOp.getStrides(), [](Attribute attr) { return attr; });
    // If the provided sizes, offsets, strides are less than the rank, pad them
    // with full sizes, zero offsets, and unit strides. This makes it easier to
    // adjust them later.
    int64_t sourceRank = sourceType.getRank();
    for (int64_t i = extractOp.getSizes().size(); i < sourceRank; ++i) {
      updatedSizes.push_back(
          rewriter.getI64IntegerAttr(sourceType.getDimSize(i)));
      updatedOffsets.push_back(rewriter.getI64IntegerAttr(0));
      updatedStrides.push_back(
          rewriter.getI64IntegerAttr(1)); // stride is always 1.
    }
    // If the result is distributed, it must be distributed in exactly one
    // dimension. In this case, we adjust updatedSourceType, updatedSizes and
    // updatedOffsets accordingly.
    if (distributedDims.size() > 0) {
      if (distributedDims.size() != 1)
        return rewriter.notifyMatchFailure(
            warpOp, "Source can not be distributed in multiple dimensions.");
      int64_t distributedDim = distributedDims[0];
      int64_t sourceDistrDimSize = sourceType.getShape()[distributedDim];
      auto sourceLayout = xegpu::getTemporaryLayout(extractOp->getOpOperand(0));
      if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
        return rewriter.notifyMatchFailure(
            warpOp, "the source of extract_strided_slice op lacks distribution "
                    "layout");
      auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
      // Because only single dimension distribution is supported, lane layout
      // size at the distributed dim must be the subgroup size.
      int64_t subgroupSize = sourceLaneLayout[distributedDim];
      // Check if the source size in the distributed dimension is a multiple of
      // subgroup size.
      if (sourceDistrDimSize % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Source size along distributed dimension is not a multiple of "
            "subgroup size.");
      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
      // We expect lane data to be all ones in this case.
      if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
        return rewriter.notifyMatchFailure(
            warpOp, "Expecting unit lane data in source layout");
      // The offsets in the distributed dimension must be a multiple of
      // subgroup size.
      int64_t distrDimOffset =
          cast<IntegerAttr>(updatedOffsets[distributedDim]).getInt();
      if (distrDimOffset % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp, "Offset along distributed dimension "
                    "is not a multiple of subgroup size.");
      // Computing the distributed source type can fail; report a match
      // failure rather than unwrapping a failed result.
      FailureOr<VectorType> distSourceTypeOrFailure =
          getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
      if (failed(distSourceTypeOrFailure))
        return rewriter.notifyMatchFailure(
            warpOp, "Failed to distribute the source vector type of "
                    "extract_strided_slice op.");
      updatedSourceType = distSourceTypeOrFailure.value();
      // Update the distributed sizes to match the distributed type.
      updatedSizes[distributedDim] = rewriter.getI64IntegerAttr(
          distributedType.getDimSize(distributedDim));
      // Update the distributed offsets to match round robin distribution (i.e.
      // each lane owns data at `subgroupSize` stride given unit lane data).
      updatedOffsets[distributedDim] =
          rewriter.getI64IntegerAttr(distrDimOffset / subgroupSize);
    }
    // Do the distribution by yielding the source of the extract op from
    // the warp op and creating a new extract op outside the warp op.
    SmallVector<size_t> newRetIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {extractOp.getSource()}, {updatedSourceType},
        newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);
    Value source = newWarpOp.getResult(newRetIndices[0]);
    // Create a new extract op outside the warp op.
    Value newExtractOp = vector::ExtractStridedSliceOp::create(
        rewriter, extractOp.getLoc(), distributedType, source,
        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
        ArrayAttr::get(rewriter.getContext(), updatedSizes),
        ArrayAttr::get(rewriter.getContext(), updatedStrides));
    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newExtractOp);
    return success();
  }
};
/// Distribute a `vector.insert_strided_slice` op feeding into yield op of an
/// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers
/// advanced cases where the distributed dimension is partially inserted and
/// currently not supported by the generic vector distribution patterns.
///
/// The pattern only matches when the insert op is the last op before the
/// yield. Both the value to store and the dest are yielded from a new warp op,
/// and an equivalent insert with adjusted offsets is created after it. If the
/// dest is distributed, the pattern requires: distribution along exactly one
/// dimension that is also covered by the source, unit lane data in both
/// layouts, and a source size and dest offset along the distributed dimension
/// that are multiples of the subgroup size.
struct VectorInsertStridedSliceDistribution
    : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
      // Check if the InsertStridedSliceOp is the last op before yield op
      return llvm::IsaPred<vector::InsertStridedSliceOp>(op) &&
             warpOp.getTerminator()->getPrevNode() == op;
    });
    if (!operand)
      return failure();
    unsigned int operandNumber = operand->getOperandNumber();
    auto insertOp =
        operand->get().getDefiningOp<vector::InsertStridedSliceOp>();
    auto distributedType =
        cast<VectorType>(warpOp.getResult(operandNumber).getType());
    // Find the distributed dimensions of the dest vector.
    auto insertResultType = cast<VectorType>(operand->get().getType());
    auto destDistributedDims =
        getDistributedDims(insertResultType, distributedType);
    // Collect updated offsets, source type and dest type. They may be adjusted
    // later if the data is distributed to lanes (as opposed to being owned by
    // all lanes uniformly).
    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
        insertOp.getOffsets(), [](Attribute attr) { return attr; });
    VectorType srcType = insertOp.getSourceVectorType();
    VectorType destType = insertOp.getDestVectorType();
    VectorType updatedSourceType = srcType;
    VectorType updatedDestType = destType;
    if (destDistributedDims.size() > 0) {
      // Only single dimension distribution is supported.
      if (destDistributedDims.size() != 1)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Expecting source to be distributed in a single dimension.");
      int64_t destDistributedDim = destDistributedDims[0];
      // Currently we require that both source (kD) and dest (nD) vectors are
      // distributed. This requires that distributedDim (d) is contained in the
      // last k dims of the dest vector (d >= n - k).
      int64_t sourceDistributedDim =
          destDistributedDim - (destType.getRank() - srcType.getRank());
      if (sourceDistributedDim < 0)
        return rewriter.notifyMatchFailure(
            insertOp,
            "distributed dimension must be in the last k (i.e. source "
            "rank) dims of dest vector");
      int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim);
      // Obtain the source and dest layouts.
      auto destLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(1));
      auto sourceLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(0));
      if (!destLayout || !sourceLayout ||
          destLayout.getEffectiveLaneLayoutAsInt().empty() ||
          sourceLayout.getEffectiveLaneLayoutAsInt().empty())
        return rewriter.notifyMatchFailure(
            warpOp, "the source or dest of insert_strided_slice op lacks "
                    "distribution layout");
      // Because only single dimension distribution is supported, lane layout
      // size at the distributed dim must be the subgroup size.
      int64_t subgroupSize =
          destLayout.getEffectiveLaneLayoutAsInt()[destDistributedDim];
      // We require that source and dest lane data are all ones to ensure
      // uniform round robin distribution.
      auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
      if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
          !llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
        return rewriter.notifyMatchFailure(
            warpOp, "Expecting unit lane data in source and dest layouts");
      // Source distributed dim size must be multiples of subgroup size.
      if (srcDistrDimSize % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp, "Distributed dimension size in source is not a multiple of "
                    "subgroup size.");
      // Offsets in the distributed dimension must be multiples of subgroup
      // size.
      int64_t destDistrDimOffset =
          cast<IntegerAttr>(insertOp.getOffsets()[destDistributedDim]).getInt();
      if (destDistrDimOffset % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Offset along distributed dimension in dest is not a multiple of "
            "subgroup size.");
      // Update the source and dest types based on their layouts. Either
      // computation can fail; report a match failure rather than unwrapping a
      // failed result.
      FailureOr<VectorType> distSourceTypeOrFailure =
          getDistVecTypeBasedOnLaneLayout(sourceLayout, srcType);
      if (failed(distSourceTypeOrFailure))
        return rewriter.notifyMatchFailure(
            warpOp, "Failed to distribute the source vector type of "
                    "insert_strided_slice op.");
      FailureOr<VectorType> distDestTypeOrFailure =
          getDistVecTypeBasedOnLaneLayout(destLayout, destType);
      if (failed(distDestTypeOrFailure))
        return rewriter.notifyMatchFailure(
            warpOp, "Failed to distribute the dest vector type of "
                    "insert_strided_slice op.");
      updatedSourceType = distSourceTypeOrFailure.value();
      updatedDestType = distDestTypeOrFailure.value();
      // Update the distributed offsets to match round robin distribution (i.e.
      // each lane owns data at `subgroupSize` stride given unit lane data).
      updatedOffsets[destDistributedDim] =
          rewriter.getI64IntegerAttr(destDistrDimOffset / subgroupSize);
    }
    // Do the distribution by yielding the source and dest of the insert op
    // from the warp op and creating a new insert op outside the warp op.
    SmallVector<size_t> newRetIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {insertOp.getValueToStore(), insertOp.getDest()},
        {updatedSourceType, updatedDestType}, newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);
    Value valueToStore = newWarpOp.getResult(newRetIndices[0]);
    Value dest = newWarpOp.getResult(newRetIndices[1]);
    // Create a new insert op outside the warp op.
    Value newInsertOp = vector::InsertStridedSliceOp::create(
        rewriter, insertOp.getLoc(), updatedDestType, valueToStore, dest,
        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
        insertOp.getStrides());
    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
                                newInsertOp);
    return success();
  }
};
/// Sinks a memref::ExtractAlignedPointerAsIndexOp that feeds the yield of an
/// enclosing `gpu.warp_execute_on_lane_0` region. The op's memref source is
/// yielded (with its original type) from a new warp op, and the extract op is
/// re-created right after that warp op.
struct MemrefExtractAlignedPointerAsIndexDistribution final
    : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yielded = getWarpResult(
        warpOp, llvm::IsaPred<memref::ExtractAlignedPointerAsIndexOp>);
    if (!yielded)
      return rewriter.notifyMatchFailure(
          warpOp,
          "warp result is not a memref::MemrefExtractAlignedPointerAsIndex op");

    auto pointerOp =
        yielded->get().getDefiningOp<memref::ExtractAlignedPointerAsIndexOp>();
    unsigned resultIdx = yielded->getOperandNumber();

    // Yield the memref itself from the warp op; its type is left unchanged.
    SmallVector<size_t> appendedIndices;
    gpu::WarpExecuteOnLane0Op updatedWarpOp =
        moveRegionToNewWarpOpAndAppendReturns(
            rewriter, warpOp, pointerOp.getSource(),
            TypeRange{pointerOp.getSource().getType()}, appendedIndices);

    // Re-create the pointer extraction after the warp op.
    rewriter.setInsertionPointAfter(updatedWarpOp);
    Value yieldedMemref = updatedWarpOp.getResult(appendedIndices[0]);
    auto sunkPointerOp = memref::ExtractAlignedPointerAsIndexOp::create(
        rewriter, updatedWarpOp.getLoc(), pointerOp.getType(), yieldedMemref);

    rewriter.replaceAllUsesWith(updatedWarpOp.getResult(resultIdx),
                                sunkPointerOp.getResult());
    return success();
  }
};
/// Distributes a vector::BitCastOp that feeds the yield of an enclosing
/// `gpu.warp_execute_on_lane_0` region. Bitcast only impacts the innermost
/// dimension of the source/result vectors. An equivalent vector::BitCastOp is
/// created after the warp op; its operand is the bitcast source yielded with
/// the distributed source vector type (computed from the assigned layout), and
/// its result type is the existing distributed warp result type.
struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yielded =
        getWarpResult(warpOp, llvm::IsaPred<vector::BitCastOp>);
    if (!yielded)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a vector::BitCast op");

    auto castOp = yielded->get().getDefiningOp<vector::BitCastOp>();
    unsigned resultIdx = yielded->getOperandNumber();

    // Derive the distributed source type from the layout on the operand. A
    // failed computation is mapped to a null type, which is rejected below.
    FailureOr<VectorType> maybeDistSrcType = getDistVecTypeBasedOnLaneLayout(
        xegpu::getTemporaryLayout(castOp->getOpOperand(0)),
        castOp.getSourceVectorType());
    VectorType distSrcType = maybeDistSrcType.value_or(VectorType());
    if (!distSrcType)
      return rewriter.notifyMatchFailure(
          castOp, "Failed to distribute the source vector type in "
                  "vector::BitCast op");

    // The warp op result type is already the distributed result type.
    auto distResultType =
        cast<VectorType>(warpOp.getResult(resultIdx).getType());

    SmallVector<size_t> appendedIndices;
    gpu::WarpExecuteOnLane0Op updatedWarpOp =
        moveRegionToNewWarpOpAndAppendReturns(rewriter, warpOp,
                                              castOp.getSource(),
                                              TypeRange{distSrcType},
                                              appendedIndices);

    // Re-create the bitcast on the distributed value after the warp op.
    rewriter.setInsertionPointAfter(updatedWarpOp);
    Value distSource = updatedWarpOp.getResult(appendedIndices[0]);
    auto distCastOp = vector::BitCastOp::create(
        rewriter, updatedWarpOp.getLoc(), distResultType, distSource);

    rewriter.replaceAllUsesWith(updatedWarpOp.getResult(resultIdx),
                                distCastOp.getResult());
    return success();
  }
};
/// Distributes a vector::TransposeOp that feeds the yield of an enclosing
/// `gpu.warp_execute_on_lane_0` region. Currently only 2D transposes are
/// supported. In most cases, transpose is a no op because it is entirely
/// handled using the layouts (e.g. 16x1 -> 1x16). However, if each lane owns
/// multiple slices of data after distribution (e.g. 16x2 -> 2x16), a lane-local
/// transpose (i.e. shuffle) is needed. Therefore, an equivalent
/// vector::TransposeOp is created after the warp op, operating on the source
/// yielded with its distributed vector type (computed from the assigned
/// layout).
struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yielded =
        getWarpResult(warpOp, llvm::IsaPred<vector::TransposeOp>);
    if (!yielded)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a vector::Transpose op");

    auto xposeOp = yielded->get().getDefiningOp<vector::TransposeOp>();
    unsigned resultIdx = yielded->getOperandNumber();

    // Both the operand and the result need a layout.
    xegpu::DistributeLayoutAttr srcLayout =
        xegpu::getTemporaryLayout(xposeOp->getOpOperand(0));
    xegpu::DistributeLayoutAttr dstLayout =
        xegpu::getTemporaryLayout(xposeOp->getOpResult(0));
    if (!(srcLayout && dstLayout))
      return rewriter.notifyMatchFailure(
          xposeOp,
          "the source or result vector of the transpose op lacks layout "
          "attribute");

    // Only 2D transposes are supported for now.
    // TODO: Support nD transposes.
    VectorType srcVecType = xposeOp.getSourceVectorType();
    bool is2DTranspose = srcVecType.getRank() == 2 &&
                         xposeOp.getResultVectorType().getRank() == 2;
    if (!is2DTranspose)
      return rewriter.notifyMatchFailure(
          xposeOp, "the source or result vector of the transpose op "
                   "does not have 2D layout");

    // Result layout must be a transpose of source layout.
    ArrayRef<int64_t> permutation = xposeOp.getPermutation();
    if (!dstLayout.isTransposeOf(srcLayout, permutation))
      return rewriter.notifyMatchFailure(
          xposeOp,
          "the source or result vector layouts must be 2D transposes of each "
          "other");

    FailureOr<VectorType> maybeDistSrcType =
        getDistVecTypeBasedOnLaneLayout(srcLayout, srcVecType);
    if (failed(maybeDistSrcType))
      return rewriter.notifyMatchFailure(
          xposeOp, "Failed to distribute the source vector type in "
                   "vector::Transpose op");

    SmallVector<size_t> appendedIndices;
    gpu::WarpExecuteOnLane0Op updatedWarpOp =
        moveRegionToNewWarpOpAndAppendReturns(
            rewriter, warpOp, xposeOp.getVector(),
            TypeRange{maybeDistSrcType.value()}, appendedIndices);

    // Re-create the transpose on the distributed value after the warp op.
    rewriter.setInsertionPointAfter(updatedWarpOp);
    Value distSource = updatedWarpOp.getResult(appendedIndices[0]);
    auto distXposeOp = vector::TransposeOp::create(
        rewriter, updatedWarpOp.getLoc(), distSource, permutation);

    rewriter.replaceAllUsesWith(updatedWarpOp.getResult(resultIdx),
                                distXposeOp.getResult());
    return success();
  }
};
} // namespace
namespace {
/// Pass that drives XeGPU subgroup distribution. It derives from the
/// `XeGPUSubgroupDistributeBase` pass base class; all of the work is done in
/// `runOnOperation`, which is defined out of line.
struct XeGPUSubgroupDistributePass final
: public xegpu::impl::XeGPUSubgroupDistributeBase<
XeGPUSubgroupDistributePass> {
/// Entry point of the pass, invoked on the operation the pass is run on.
void runOnOperation() override;
};
} // namespace
/// Registers the XeGPU subgroup distribution patterns into `patterns`.
/// Patterns are added in two groups: one at `PatternHierarchy::Regular`
/// benefit, and one at `PatternHierarchy::AboveRegular` benefit for the
/// patterns that must take precedence over the regular vector distribution
/// patterns.
void xegpu::populateXeGPUSubgroupDistributePatterns(
    RewritePatternSet &patterns) {
  MLIRContext *context = patterns.getContext();

  // Patterns registered with the regular benefit.
  patterns.add<CreateNdDescDistribution, StoreNdDistribution,
               LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
               GpuBarrierDistribution, VectorMultiReductionDistribution,
               LoadDistribution, StoreDistribution, VectorTransposeDistribution,
               VectorBitcastDistribution, LoadMatrixDistribution,
               StoreMatrixDistribution,
               MemrefExtractAlignedPointerAsIndexDistribution>(
      context, /*pattern benefit=*/PatternHierarchy::Regular);

  // The following patterns need to override the regular vector distribution
  // patterns, so they are registered with a higher benefit.
  patterns.add<VectorShapeCastDistribution,
               VectorExtractStridedSliceDistribution,
               VectorInsertStridedSliceDistribution,
               VectorBroadcastDistribution, SinkUniformOps>(
      context, /*pattern benefit=*/PatternHierarchy::AboveRegular);
}
/// Registers the `MoveFuncBodyToWarpOp` pattern into `patterns`.
void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(
    RewritePatternSet &patterns) {
  MLIRContext *context = patterns.getContext();
  patterns.add<MoveFuncBodyToWarpOp>(context);
}
void XeGPUSubgroupDistributePass::runOnOperation() {
// Step 1: Attach layouts to op operands.
// TODO: Following assumptions are made:
// 1) It is assumed that there are no layout conflicts.
// 2) Any existing layout attributes attached to the operands are ignored.
Operation *op = getOperation();
if (!xegpu::recoverTemporaryLayouts(op)) {
signalPassFailure();
return;
}
// Step 2: Move all operations of a GPU function inside
// gpu.warp_execute_on_lane_0 operation.
{
RewritePatternSet patterns(&getContext());
xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
signalPassFailure();
return;
}
// At this point, we have moved the entire function body inside the
// warpOp. Now move any scalar uniform code outside of the warpOp (like
// GPU index ops, scalar constants, etc.). This will simplify the
// later lowering and avoid custom patterns for these ops.
getOperation()->walk([&](Operation *op) {
if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
vector::moveScalarUniformCode(warpOp);
});
}
// Step 3: Apply subgroup to workitem distribution patterns.
RewritePatternSet patterns(&getContext());
xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
// distributionFn is used by vector distribution patterns to determine the
// distributed vector type for a given vector value. In XeGPU subgroup
// distribution context, we compute this based on lane layout.
auto distributionFn = [](Value val) {
VectorType vecType = dyn_cast<VectorType>(val.getType());
int64_t vecRank = vecType ? vecType.getRank() : 0;
if (vecRank == 0)
return AffineMap::get(val.getContext());
// Get the layout of the vector type.
xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
// If no layout is specified, assume uniform case (no distribution).
if (!layout)
return AffineMap::get(val.getContext());
// Expecting vector and layout rank to match.
assert(layout.getRank() == vecRank &&
"Expecting vector and layout rank to match");
// A dimension is distributed only if layout suggests there are
// multiple lanes assigned for this dimension and the shape can be evenly
// distributed to those lanes.
SmallVector<unsigned int> distributedDims;
for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
if (v > 1 && vecType.getShape()[i] % v == 0)
distributedDims.push_back(i);
}
return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
val.getContext());
};
// TODO: shuffleFn is not used.
auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
int64_t warpSz) { return Value(); };
auto warpReduction = [](Location loc, OpBuilder &builder, Value input,
vector::CombiningKind kind, uint32_t size) {
// First reduce on a single thread to get per lane reduction value.
Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
// Parallel reduction using butterfly shuffles.
for (uint64_t i = 1; i < size; i <<= 1) {
Value shuffled = gpu::ShuffleOp::create(builder, loc, laneVal, i,
/*width=*/size,
/*mode=*/gpu::ShuffleMode::XOR)
.getShuffleResult();
laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
}
return laneVal;
};
vector::populateDistributeReduction(
patterns, warpReduction,
/*pattern benefit=*/PatternHierarchy::Regular);
vector::populatePropagateWarpVectorDistributionPatterns(
patterns, distributionFn, shuffleFn,
/*pattern benefit=*/PatternHierarchy::Regular);
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
signalPassFailure();
return;
}
// Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
// due to tensor desc type mismatches created by using upstream distribution
// patterns (scf.for). This cleanup should only be done if all the ops are
// distributed successfully, if some ops are still not distributed and remains
// inside any WarpExecuteOnLane0Op we avoid this simplication step to avoid
// breaking the IR.
bool foundWarpOp = false;
getOperation()->walk([&](gpu::WarpExecuteOnLane0Op warpOp) {
// Look for WarpOps that are not trivially dead.
if (isOpTriviallyDead(warpOp))
return WalkResult::advance();
foundWarpOp = true;
return WalkResult::interrupt();
});
if (foundWarpOp)
return;
getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
// We are only interested in UnrealizedConversionCastOps there were added
// for resolving SIMT type mismatches.
if (!op->getAttr(resolveSIMTTypeMismatch))
return WalkResult::skip();
Value input = op.getOperand(0);
Value output = op.getResult(0);
// Both input and output must have tensor descriptor types.
xegpu::TensorDescType inputDescType =
mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
xegpu::TensorDescType outputDescType =
mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
assert(inputDescType && outputDescType &&
"Unrealized conversion cast must have tensor descriptor types");
// tensor_desc<shape, layout> -> tensor_desc<shape> Type of conversions.
// This occurs inside scf.for body to resolve the block argument type to
// SIMT type.
if (inputDescType.getLayout()) {
auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
if (argument) {
argument.setType(output.getType());
output.replaceAllUsesWith(argument);
if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
argument.getOwner()->getParentOp())) {
auto result = loopOp.getTiedLoopResult(argument);
result.setType(output.getType());
}
}
}
// tensor_desc<shape> -> tensor_desc<shape, layout> Type of
// conversions. This occurs at the yield op of scf.for body to go back
// from SIMT type to original type.
if (outputDescType.getLayout())
output.replaceAllUsesWith(input);
if (op->use_empty())
op->erase();
return WalkResult::advance();
});
}