This PR adds support for lowering `vector.reduction` and `vector.multi_reduction` ops in subgroup to work-item (WI) distribution. The following cases are currently handled (more support will be added later): * `vector.reduction`: This assumes the source vector is distributed to all lanes, so the lanes must shuffle data to perform a collaborative reduction. The result is shared among all lanes. This is done by emitting `gpu::ShuffleOp`s that perform a butterfly reduction. Refer to `VectorDistribution` for more details. * `vector.multi_reduction`: two cases are considered. 1. **Reduction is lane-local**: simply lower to a lane-local multi-reduction op. Each lane does its own reduction, and the result is distributed. 2. **Reduction is not lane-local**: This case is handled indirectly. The reduction is rewritten in terms of `vector.reduction` ops (plus extract/insert ops) before the WI distribution even begins. The whole thing is then distributed later using `gpu::ShuffleOp`s (not fully supported yet).
2164 lines
100 KiB
C++
2164 lines
100 KiB
C++
//===- XeGPUSubgroupDistribute.cpp - XeGPU Subgroup Distribute Pass -------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
|
|
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
|
|
#include "mlir/Dialect/Index/IR/IndexDialect.h"
|
|
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
|
#include "mlir/Dialect/Vector/IR/VectorOps.h"
|
|
#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
|
|
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
|
|
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
|
|
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
|
|
#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
|
|
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
|
|
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
|
|
#include "mlir/IR/AffineMap.h"
|
|
#include "mlir/IR/Attributes.h"
|
|
#include "mlir/IR/Builders.h"
|
|
#include "mlir/IR/BuiltinAttributes.h"
|
|
#include "mlir/IR/BuiltinOps.h"
|
|
#include "mlir/IR/BuiltinTypes.h"
|
|
#include "mlir/IR/Operation.h"
|
|
#include "mlir/IR/PatternMatch.h"
|
|
#include "mlir/IR/TypeRange.h"
|
|
#include "mlir/IR/Value.h"
|
|
#include "mlir/IR/Visitors.h"
|
|
#include "mlir/Interfaces/FunctionInterfaces.h"
|
|
#include "mlir/Support/LLVM.h"
|
|
#include "mlir/Transforms/DialectConversion.h"
|
|
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
|
#include "mlir/Transforms/InliningUtils.h"
|
|
#include "llvm/ADT/ArrayRef.h"
|
|
#include "llvm/ADT/STLExtras.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/SmallVectorExtras.h"
|
|
|
|
namespace mlir {
|
|
namespace xegpu {
|
|
#define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE
|
|
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
|
|
} // namespace xegpu
|
|
} // namespace mlir
|
|
|
|
#define DEBUG_TYPE "xegpu-subgroup-distribute"
|
|
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
|
|
|
|
using namespace mlir;
|
|
|
|
// Name of the attribute that marks the UnrealizedConversionCastOps inserted to
// resolve SIMT type mismatches.
static const char *const resolveSIMTTypeMismatch =
    "resolve_simt_type_mismatch";
|
|
|
|
namespace {
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// SIMT Distribution Patterns
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// In certain cases, we may need to favor XeGPU specific distribution patterns
|
|
/// over generic vector distribution patterns. In such cases, we can assign
|
|
/// priorities to patterns.
|
|
enum PatternHierarchy : unsigned {
  Regular = 1,      // Baseline pattern priority.
  AboveRegular = 2, // Higher priority than `Regular`.
};
|
|
|
|
/// Helper function to resolve types if the distributed type out of
|
|
/// gpu.warp_execute_on_lane0 is different from the expected xegpu SIMT type.
|
|
/// Example 1:
|
|
/// distributed type: vector<8x1xf32>
|
|
/// expected type: vector<8xf32>
|
|
/// resolved using,
|
|
/// %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
|
|
/// Example 2:
|
|
/// distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
|
|
/// expected type: xegpu.tensor_desc<8x16xf32>
|
|
/// resolved using,
|
|
/// %0 = unrealized_conversion_cast %1 :
|
|
/// xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
|
|
/// xegpu.tensor_desc<8x16xf32>
|
|
template <typename T>
|
|
static Value resolveDistributedTy(Value orig, T expected,
|
|
PatternRewriter &rewriter) {
|
|
// If orig and expected types are the same, return orig.
|
|
if (orig.getType() == expected)
|
|
return orig;
|
|
// If orig is a vector type, create a shape cast op to reconcile the types.
|
|
if (isa<VectorType>(orig.getType())) {
|
|
auto castOp =
|
|
vector::ShapeCastOp::create(rewriter, orig.getLoc(), expected, orig);
|
|
return castOp.getResult();
|
|
}
|
|
// If orig is a tensor descriptor type, create an unrealized conversion cast
|
|
// op to reconcile the types.
|
|
if (isa<xegpu::TensorDescType>(orig.getType())) {
|
|
auto castOp = UnrealizedConversionCastOp::create(rewriter, orig.getLoc(),
|
|
expected, orig);
|
|
castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
|
|
return castOp.getResult(0);
|
|
}
|
|
llvm_unreachable("Unsupported type for reconciliation");
|
|
return orig;
|
|
}
|
|
|
|
/// Given a vector type and its distributed vector type, return the list of
|
|
/// dimensions that are distributed.
|
|
static SmallVector<int64_t> getDistributedDims(VectorType originalType,
|
|
VectorType distributedType) {
|
|
assert(originalType.getRank() == distributedType.getRank() &&
|
|
"sequential and distributed vector types must have the same rank");
|
|
SmallVector<int64_t> distributedDims;
|
|
for (int64_t i = 0; i < originalType.getRank(); ++i) {
|
|
if (distributedType.getDimSize(i) != originalType.getDimSize(i)) {
|
|
distributedDims.push_back(i);
|
|
}
|
|
}
|
|
return distributedDims;
|
|
}
|
|
|
|
/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
|
|
/// of the original GPUFuncOp to the new GPUFuncOp such that entire body is
|
|
/// contained within a WarpExecuteOnLane0Op.
|
|
/// Example:
|
|
///
|
|
/// ```
|
|
/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
|
|
/// ...
|
|
/// ...
|
|
/// gpu.return %result: vector<8x16xf32>
|
|
/// }
|
|
/// ```
|
|
/// To
|
|
/// ```
|
|
/// gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
|
|
/// %laneid = gpu.lane_id : index
|
|
/// %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
|
|
/// ...
|
|
/// ...
|
|
/// gpu.yield %result: vector<8x16xf32>
|
|
/// }
|
|
/// return %0
|
|
/// }
|
|
struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
|
|
using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
|
|
LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
|
|
PatternRewriter &rewriter) const override {
|
|
auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
|
|
if (!uArch)
|
|
return rewriter.notifyMatchFailure(
|
|
gpuFuncOp, "Subgroup distribution requires target attribute attached "
|
|
"to set the warp size");
|
|
// If the function only contains a single void return, skip.
|
|
if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
|
|
return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
|
|
}))
|
|
return failure();
|
|
// If the function already moved inside a warp_execute_on_lane0, skip.
|
|
if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
|
|
return isa<gpu::WarpExecuteOnLane0Op>(op);
|
|
}))
|
|
return failure();
|
|
// Create a new function with the same signature and same attributes.
|
|
SmallVector<Type> workgroupAttributionsTypes =
|
|
llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(),
|
|
[](BlockArgument arg) { return arg.getType(); });
|
|
SmallVector<Type> privateAttributionsTypes =
|
|
llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(),
|
|
[](BlockArgument arg) { return arg.getType(); });
|
|
auto newGpuFunc = gpu::GPUFuncOp::create(
|
|
rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
|
|
gpuFuncOp.getFunctionType(), workgroupAttributionsTypes,
|
|
privateAttributionsTypes);
|
|
newGpuFunc->setAttrs(gpuFuncOp->getAttrs());
|
|
// Create a WarpExecuteOnLane0Op with same arguments and results as the
|
|
// original gpuFuncOp.
|
|
rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
|
|
auto laneId = gpu::LaneIdOp::create(
|
|
rewriter, newGpuFunc.getLoc(), rewriter.getIndexType(),
|
|
/** upperBound = **/ mlir::IntegerAttr());
|
|
ArrayRef<Type> gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
|
|
auto warpOp = gpu::WarpExecuteOnLane0Op::create(
|
|
rewriter, laneId.getLoc(), gpuFuncResultType, laneId,
|
|
uArch->getSubgroupSize(), newGpuFunc.getArguments(),
|
|
newGpuFunc.getArgumentTypes());
|
|
Block &warpBodyBlock = warpOp.getBodyRegion().front();
|
|
// Replace the ReturnOp of the original gpu function with a YieldOp.
|
|
auto origRetunOp =
|
|
cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
|
|
rewriter.setInsertionPointAfter(origRetunOp);
|
|
gpu::YieldOp::create(rewriter, origRetunOp.getLoc(),
|
|
origRetunOp.getOperands());
|
|
rewriter.eraseOp(origRetunOp);
|
|
// Move the original function body to the WarpExecuteOnLane0Op body.
|
|
rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
|
|
warpOp.getBodyRegion().begin());
|
|
rewriter.eraseBlock(&warpBodyBlock);
|
|
// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
|
|
rewriter.setInsertionPointAfter(warpOp);
|
|
gpu::ReturnOp::create(rewriter, newGpuFunc.getLoc(), warpOp.getResults());
|
|
rewriter.replaceOp(gpuFuncOp, newGpuFunc);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Distribute a create_nd_tdesc feeding into vector.yield op of the enclosing
|
|
/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op will
|
|
/// still contain the original op that will not be used by the yield op (and
|
|
/// should be cleaned up later). The yield op will bypass the create_nd_tdesc's
|
|
/// arguments. Tensor descriptor shape is not distributed because it is a
|
|
/// uniform value across all work items within the subgroup. However, the
|
|
/// layout information is dropped in the new tensor descriptor type.
|
|
///
|
|
/// Example:
|
|
///
|
|
/// ```
|
|
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
|
|
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
|
|
/// (!xegpu.tensor_desc<4x8xf32, #layout0>) {
|
|
/// ...
|
|
/// %td = xegpu.create_nd_tdesc %arg0
|
|
/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
|
|
/// vector.yield %td
|
|
/// }
|
|
/// ```
|
|
/// To
|
|
/// ```
|
|
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
|
|
/// ...
|
|
/// %dead = xegpu.create_nd_tdesc %arg0
|
|
/// : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
|
|
/// vector.yield %arg0, %dead
|
|
/// }
|
|
/// %td = xegpu.create_nd_tdesc %r#0: memref<4x8xf32>
|
|
/// -> !xegpu.tensor_desc<4x8xf32>
|
|
///
|
|
/// ```
|
|
struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *operand =
|
|
getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
|
|
if (!operand)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "warp result is not a xegpu::CreateNdDesc op");
|
|
auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
|
|
unsigned operandIdx = operand->getOperandNumber();
|
|
|
|
xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
|
|
if (!layout)
|
|
return rewriter.notifyMatchFailure(
|
|
descOp, "the tensor descriptor lacks layout attribute");
|
|
// CreateNdOp must not have offsets.
|
|
if (descOp.getMixedOffsets().size())
|
|
return rewriter.notifyMatchFailure(
|
|
descOp, "xegpu::CreateNdDescOp must not have offsets");
|
|
|
|
SmallVector<size_t> newRetIndices;
|
|
rewriter.setInsertionPoint(warpOp);
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, /* new yieled values = */ descOp->getOperands(),
|
|
/* new yielded types = */ descOp.getOperandTypes(), newRetIndices);
|
|
|
|
SmallVector<Value> newDescOperands = llvm::map_to_vector(
|
|
newRetIndices, [&](size_t i) { return newWarpOp.getResult(i); });
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
xegpu::TensorDescType distributedTensorDescTy =
|
|
descOp.getType().dropLayouts(); // Distributed tensor descriptor type
|
|
// does not contain layout info.
|
|
Value newDescOp = xegpu::CreateNdDescOp::create(
|
|
rewriter, newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
|
|
descOp->getAttrs());
|
|
|
|
Value distributedVal = newWarpOp.getResult(operandIdx);
|
|
// Resolve the distributed type to the expected type.
|
|
newDescOp =
|
|
resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
|
|
rewriter.replaceAllUsesWith(distributedVal, newDescOp);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Distribute a store_nd op at the end of enclosing
|
|
/// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed
|
|
/// through the warp op interface they would be propagated as returned values.
|
|
/// Source vector is distributed based on lane layout. Appropriate cast ops are
|
|
/// inserted if the distributed types does not match expected xegpu SIMT types.
|
|
///
|
|
/// Example:
|
|
///
|
|
/// ```
|
|
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
|
|
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
|
|
/// ...
|
|
/// xegpu.store_nd %arg0, %arg1 [%x, %y]: vector<4x8xf32>,
|
|
/// !xegpu.tensor_desc<4x8xf32, #layout0>
|
|
/// }
|
|
/// ```
|
|
/// To
|
|
/// ```
|
|
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
|
|
/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
|
|
/// ...
|
|
/// gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>,
|
|
/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index
|
|
/// }
|
|
/// %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
|
|
/// %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
|
|
/// #layout0>
|
|
/// -> !xegpu.tensor_desc<4x8xf32>
|
|
/// xegpu.store_nd %0, %1 [%r#2, %r#3]: vector<4xf32>,
|
|
/// !xegpu.tensor_desc<4x8xf32>
|
|
///
|
|
/// ```
|
|
struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
gpu::YieldOp yield = warpOp.getTerminator();
|
|
Operation *lastNode = yield->getPrevNode();
|
|
auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
|
|
if (!storeOp)
|
|
return failure();
|
|
|
|
SmallVector<OpFoldResult> offsets = storeOp.getMixedOffsets();
|
|
// Expecting offsets to be present.
|
|
if (offsets.empty())
|
|
return rewriter.notifyMatchFailure(storeOp,
|
|
"the store op must have offsets");
|
|
SmallVector<Value> offsetsAsValues =
|
|
vector::getAsValues(rewriter, storeOp.getLoc(), offsets);
|
|
SmallVector<Type> offsetTypes = llvm::map_to_vector(
|
|
offsetsAsValues, [](Value v) { return v.getType(); });
|
|
xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
|
|
xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
|
|
if (!layout)
|
|
return rewriter.notifyMatchFailure(
|
|
storeOp, "the source tensor descriptor lacks layout attribute");
|
|
|
|
FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
|
|
xegpu::getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
|
|
if (failed(distributedTypeByWarpOpOrFailure))
|
|
return rewriter.notifyMatchFailure(storeOp,
|
|
"Failed to distribute the type");
|
|
VectorType distributedTypeByWarpOp =
|
|
distributedTypeByWarpOpOrFailure.value();
|
|
|
|
SmallVector<size_t> newRetIndices;
|
|
SmallVector<Value> newYieldedValues = {storeOp.getValue(),
|
|
storeOp.getTensorDesc()};
|
|
SmallVector<Type> newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy};
|
|
newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
|
|
newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
|
|
// Create a new store op outside the warp op with the distributed vector
|
|
// type. Tensor descriptor is not distributed.
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
SmallVector<Value> newStoreOperands;
|
|
|
|
// For the value operand, there can be a mismatch between the vector type
|
|
// distributed by the warp op and (xegpu-specific) distributed type
|
|
// supported by the store op. Type mismatch must be resolved using
|
|
// appropriate cast op.
|
|
FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
|
|
xegpu::getDistributedVectorType(storeOp.getTensorDescType());
|
|
if (failed(storeNdDistributedValueTyOrFailure))
|
|
return rewriter.notifyMatchFailure(
|
|
storeOp, "Failed to get distributed vector type for the store op");
|
|
newStoreOperands.push_back(resolveDistributedTy(
|
|
newWarpOp.getResult(newRetIndices[0]),
|
|
storeNdDistributedValueTyOrFailure.value(), rewriter));
|
|
// For the tensor descriptor operand, the layout attribute is dropped after
|
|
// distribution. Types needs to be resolved in this case also.
|
|
xegpu::TensorDescType distributedTensorDescTy =
|
|
storeOp.getTensorDescType().dropLayouts();
|
|
newStoreOperands.push_back(
|
|
resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
|
|
distributedTensorDescTy, rewriter));
|
|
// Collect offsets.
|
|
for (size_t i = 2; i < newRetIndices.size(); ++i)
|
|
newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
|
|
|
|
auto newStoreOp =
|
|
xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
|
|
newStoreOperands, storeOp->getAttrs());
|
|
xegpu::removeLayoutAttrs(newStoreOp);
|
|
rewriter.eraseOp(storeOp);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Distribute a load_nd op feeding into vector.yield op for the enclosing
|
|
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
|
|
/// The warp op will still contain the original op that will not be used by
|
|
/// the yield op (and should be cleaned up later). The yield op will
|
|
/// bypass the load's arguments. Only the loaded vector is distributed
|
|
/// according to lane layout and, tensor descriptor types is not
|
|
/// distributed. Appropriate cast ops are inserted if the distributed types does
|
|
/// not match expected xegpu SIMT types.
|
|
///
|
|
/// Example:
|
|
///
|
|
/// ```
|
|
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
|
|
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
|
|
/// (vector<4x1xf32>) {
|
|
/// ...
|
|
/// %ld = xegpu.load_nd %arg0, %arg1: !xegpu.tensor_desc<4x8xf32, #layout0>
|
|
/// ->
|
|
/// vector<4x8xf32>
|
|
/// gpu.yield %ld
|
|
/// }
|
|
/// ```
|
|
/// To
|
|
/// ```
|
|
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
|
|
/// !xegpu.tensor_desc<4x8xf32, #layout0>) {
|
|
/// ...
|
|
/// %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32, #layout0> ->
|
|
/// vector<4x8xf32> gpu.yield %dead, %arg0
|
|
/// }
|
|
/// %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
|
|
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
|
|
/// %1 = xegpu.load_nd %0: !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
|
|
/// %2 = vector.shape_cast %r#0: vector<4xf32> to vector<4x1xf32>
|
|
///
|
|
/// ```
|
|
struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
|
|
if (!isa<xegpu::LoadNdOp>(op))
|
|
return false;
|
|
// Make sure the same load op is the last operation in the warp op body.
|
|
// This ensure that load op is not sinked earlier violating any barrier
|
|
// synchronizations.
|
|
gpu::YieldOp yield = warpOp.getTerminator();
|
|
return yield->getPrevNode() == op;
|
|
});
|
|
|
|
if (!operand)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "warp result is not a xegpu::LoadNd op");
|
|
|
|
auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
|
|
auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
|
|
if (!uArch)
|
|
return rewriter.notifyMatchFailure(
|
|
loadOp, "xegpu::LoadNdOp require target attribute attached to "
|
|
"determine transpose "
|
|
"requirement");
|
|
// Chip information is required to decide if the layout requires transpose
|
|
// effect.
|
|
// Expecting offsets to be present.
|
|
SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
|
|
if (offsets.empty())
|
|
return rewriter.notifyMatchFailure(loadOp,
|
|
"the load op must have offsets");
|
|
SmallVector<Value> offsetsAsValues =
|
|
vector::getAsValues(rewriter, loadOp.getLoc(), offsets);
|
|
SmallVector<Type> offsetTypes = llvm::map_to_vector(
|
|
offsetsAsValues, [](Value v) { return v.getType(); });
|
|
|
|
xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
|
|
xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
|
|
if (!layout)
|
|
return rewriter.notifyMatchFailure(
|
|
loadOp, "the source tensor descriptor lacks layout attribute");
|
|
|
|
unsigned operandIdx = operand->getOperandNumber();
|
|
VectorType distributedTypeByWarpOp =
|
|
cast<VectorType>(warpOp.getResult(operandIdx).getType());
|
|
|
|
SmallVector<size_t> newRetIndices;
|
|
SmallVector<Value> newYieldedValues = {loadOp.getTensorDesc()};
|
|
SmallVector<Type> newYieldedTypes = {tensorDescTy};
|
|
newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
|
|
newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);
|
|
|
|
// Create a new load op outside the warp op with the distributed vector
|
|
// type.
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
FailureOr<VectorType> loadNdDistValueTyOrFailure =
|
|
xegpu::getDistributedVectorType(loadOp.getTensorDescType());
|
|
if (failed(loadNdDistValueTyOrFailure))
|
|
return rewriter.notifyMatchFailure(
|
|
loadOp, "Failed to get distributed vector type for the load op");
|
|
xegpu::TensorDescType distributedTensorDescTy =
|
|
loadOp.getTensorDescType().dropLayouts(); // Distributed tensor
|
|
// descriptor type does not
|
|
// contain layout info.
|
|
SmallVector<Value> newLoadOperands{
|
|
resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]),
|
|
distributedTensorDescTy, rewriter)};
|
|
// Collect offsets.
|
|
for (size_t i = 1; i < newRetIndices.size(); ++i)
|
|
newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
|
|
auto newLoadOp = xegpu::LoadNdOp::create(
|
|
rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
|
|
newLoadOperands, loadOp->getAttrs());
|
|
xegpu::removeLayoutAttrs(newLoadOp);
|
|
// Set the packed attribute if the layout requires it.
|
|
newLoadOp.setPacked(xegpu::requirePacked(layout));
|
|
// Set the transpose attribute if the layout requires it.
|
|
if (xegpu::requireTranspose(layout, uArch))
|
|
newLoadOp.setTranspose(
|
|
DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
|
|
Value distributedVal = newWarpOp.getResult(operandIdx);
|
|
// There can be a conflict between the vector type distributed by the
|
|
// warp op and (xegpu-specific) distributed type supported by the load
|
|
// op. Resolve these mismatches by inserting a cast.
|
|
Value tyResolvedVal = resolveDistributedTy(
|
|
newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
|
|
rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Distribute a dpas op feeding into vector.yield op for the enclosing
|
|
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
|
|
/// The warp op will still contain the original op that will not be used by
|
|
/// the yield op (and should be cleaned up later). The yield op will
|
|
/// bypass the dpas's arguments. Appropriate cast ops are inserted if the
|
|
/// distributed types does not match expected xegpu SIMT types.
|
|
/// Example:
|
|
/// ```
|
|
/// #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
|
|
/// #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
|
|
/// #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
|
|
/// %r = gpu.warp_execute_on_lane_0(%laneid) ->
|
|
/// (vector<8x1xf32>) {
|
|
/// ...
|
|
/// %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
|
|
/// vector<8x16xf32>
|
|
/// gpu.yield %dpas
|
|
/// }
|
|
/// ```
|
|
/// To
|
|
/// ```
|
|
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
|
|
/// vector<8x1xf16>, vector<16x1xf16>) {
|
|
/// ...
|
|
/// %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
|
|
/// -> vector<8x16xf32>
|
|
/// gpu.yield %dead, %arg0, %arg1
|
|
/// }
|
|
/// %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
|
|
/// %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
|
|
/// %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
|
|
/// vector<8xf32>
|
|
/// %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
|
|
/// ```
|
|
struct DpasDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
|
|
if (!operand)
|
|
return rewriter.notifyMatchFailure(warpOp,
|
|
"warp result is not a xegpu::Dpas op");
|
|
|
|
auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
|
|
unsigned operandIdx = operand->getOperandNumber();
|
|
|
|
xegpu::LayoutAttr layoutA =
|
|
dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutAAttr());
|
|
xegpu::LayoutAttr layoutB =
|
|
dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutBAttr());
|
|
xegpu::LayoutAttr layoutOut =
|
|
dyn_cast<xegpu::LayoutAttr>(dpasOp.getLayoutCdAttr());
|
|
|
|
if (!layoutA || !layoutB || !layoutOut)
|
|
return rewriter.notifyMatchFailure(
|
|
dpasOp,
|
|
"the xegpu::Dpas op lacks layout attribute for A, B or output");
|
|
|
|
FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
|
|
FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
|
|
FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(layoutOut, dpasOp.getResultType());
|
|
|
|
if (failed(distLhsTypeByWarpOpOrFailure) ||
|
|
failed(distRhsTypeByWarpOpOrFailure) ||
|
|
failed(distResultTypeByWarpOpOrFailure))
|
|
return rewriter.notifyMatchFailure(
|
|
dpasOp,
|
|
"Failed to distribute the A, B or output types in xegpu::Dpas op");
|
|
|
|
llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
|
|
dpasOp.getRhs()};
|
|
llvm::SmallVector<Type, 3> newYieldTypes{
|
|
distLhsTypeByWarpOpOrFailure.value(),
|
|
distRhsTypeByWarpOpOrFailure.value()};
|
|
// Dpas acc operand is optional.
|
|
if (dpasOp.getAcc()) {
|
|
newYieldValues.push_back(dpasOp.getAcc());
|
|
newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
|
|
}
|
|
// Create a new warp op without the dpas.
|
|
SmallVector<size_t> newRetIndices;
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
|
|
|
|
FailureOr<VectorType> expectedDistLhsTyOrFailure =
|
|
xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
|
|
FailureOr<VectorType> expectedDistRhsTyOrFailure =
|
|
xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
|
|
FailureOr<VectorType> expectedDistResultTyOrFailure =
|
|
xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);
|
|
|
|
if (failed(expectedDistLhsTyOrFailure) ||
|
|
failed(expectedDistRhsTyOrFailure) ||
|
|
failed(expectedDistResultTyOrFailure))
|
|
return rewriter.notifyMatchFailure(
|
|
dpasOp,
|
|
"Failed to get distributed vector type for the dpas operands.");
|
|
// Create a new dpas op outside the warp op.
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
SmallVector<Value> newDpasOperands;
|
|
SmallVector<VectorType> newDpasOperandExpectedTypes;
|
|
|
|
// Resolve the distributed types with the original types.
|
|
newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
|
|
newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
|
|
VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
|
|
if (dpasOp.getAcc())
|
|
newDpasOperandExpectedTypes.push_back(distributedResultTy);
|
|
|
|
for (unsigned i = 0; i < newRetIndices.size(); i++) {
|
|
newDpasOperands.push_back(
|
|
resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
|
|
newDpasOperandExpectedTypes[i], rewriter));
|
|
}
|
|
auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
|
|
distributedResultTy, newDpasOperands,
|
|
dpasOp->getAttrs());
|
|
xegpu::removeLayoutAttrs(newDpasOp);
|
|
Value distributedVal = newWarpOp.getResult(operandIdx);
|
|
// Resolve the output type.
|
|
Value typeResolved =
|
|
resolveDistributedTy(newDpasOp.getResult(),
|
|
distResultTypeByWarpOpOrFailure.value(), rewriter);
|
|
rewriter.replaceAllUsesWith(distributedVal, typeResolved);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Distribute a prefetch_nd op at the end of enclosing
|
|
/// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed
|
|
/// through the warp op interface they would be propagated as returned values.
|
|
/// Tensor descriptor shape is not distributed because it is a uniform value
|
|
/// across all work items within the subgroup. Appropriate cast ops are inserted
|
|
/// if the distributed types does not match expected xegpu SIMT types.
|
|
///
|
|
/// Example:
|
|
///
|
|
/// ```
|
|
/// #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
|
|
/// gpu.warp_execute_on_lane_0(%laneid) -> () {
|
|
/// ...
|
|
/// xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0>
|
|
/// }
|
|
/// ```
|
|
/// To
|
|
/// ```
|
|
/// %r:1 = gpu.warp_execute_on_lane_0(%laneid) -> (
|
|
/// !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
|
|
/// gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index,
|
|
/// index
|
|
/// }
|
|
/// %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
|
|
/// #layout0> -> !xegpu.tensor_desc<4x8xf32>
|
|
/// xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32>
|
|
///
|
|
/// ```
|
|
struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
gpu::YieldOp yield = warpOp.getTerminator();
|
|
Operation *lastNode = yield->getPrevNode();
|
|
auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
|
|
if (!prefetchOp)
|
|
return failure();
|
|
|
|
SmallVector<OpFoldResult> offsets = prefetchOp.getMixedOffsets();
|
|
// PrefetchNdOp must have offsets.
|
|
if (offsets.empty())
|
|
return rewriter.notifyMatchFailure(prefetchOp,
|
|
"the prefetch op must have offsets");
|
|
SmallVector<Value> offsetsAsValues =
|
|
vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets);
|
|
SmallVector<Type> offsetTypes = llvm::map_to_vector(
|
|
offsetsAsValues, [](Value v) { return v.getType(); });
|
|
|
|
xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
|
|
if (!layout)
|
|
return rewriter.notifyMatchFailure(
|
|
prefetchOp, "the source tensor descriptor lacks layout attribute");
|
|
|
|
SmallVector<Value> newYieldValues = {prefetchOp.getTensorDesc()};
|
|
SmallVector<Type> newYieldTypes = {prefetchOp.getTensorDescType()};
|
|
newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
|
|
newYieldTypes.append(offsetTypes.begin(), offsetTypes.end());
|
|
SmallVector<size_t> newRetIndices;
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
|
|
// Create a new prefetch op outside the warp op with updated tensor
|
|
// descriptor type. Source tensor descriptor require type resolution.
|
|
xegpu::TensorDescType newTensorDescTy =
|
|
prefetchOp.getTensorDescType().dropLayouts();
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
|
|
newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
|
|
// Collect offsets.
|
|
for (size_t i = 1; i < newRetIndices.size(); ++i)
|
|
newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
|
|
Operation *newPrefetchOp = xegpu::PrefetchNdOp::create(
|
|
rewriter, newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands,
|
|
prefetchOp->getAttrs());
|
|
xegpu::removeLayoutAttrs(newPrefetchOp);
|
|
rewriter.eraseOp(prefetchOp);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Sink a gpu::BarrierOp at the end of enclosing `gpu.warp_execute_on_lane_0`
|
|
/// region. This will simply move the barrier op outside of the warp op.
|
|
struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
gpu::YieldOp yield = warpOp.getTerminator();
|
|
Operation *lastNode = yield->getPrevNode();
|
|
// The last node must be a gpu::BarrierOp.
|
|
auto barrierOp = dyn_cast_or_null<gpu::BarrierOp>(lastNode);
|
|
if (!barrierOp)
|
|
return failure();
|
|
// Move the barrier op outside of the warp op.
|
|
rewriter.setInsertionPointAfter(warpOp);
|
|
gpu::BarrierOp::create(rewriter, barrierOp.getLoc(),
|
|
barrierOp->getResultTypes(),
|
|
barrierOp->getOperands(), barrierOp->getAttrs());
|
|
rewriter.eraseOp(barrierOp);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Distribute a scattered store op. The offsets argument is required.
|
|
/// Both offset and mask vectors must be 1D and have #subgroup_size elements.
|
|
/// The layouts are fixed and implicit: one offset/mask per lane.
|
|
/// The pass changes the offset/mask vector shapes to a
|
|
/// single-element vector, **it is assumed that their producer will also be
|
|
/// distributed**. The payload vector also has a fixed distribution:
|
|
/// no chunk size -> vector of one element.
|
|
/// chunk size -> vector of the innermost dimension of the SG-payload.
|
|
/// Example 1 (no chunk size):
|
|
/// %mask = producer_op : vector<16xi1>
|
|
/// %offset = producer_op : vector<16xindex>
|
|
/// xegpu.store %payload, %src[%offset], %mask : vector<16xf16>,
|
|
/// memref<256xf16>, vector<16xindex>, vector<16xi1>
|
|
/// To
|
|
/// %mask = producer_op : vector<1xi1>
|
|
/// %offset = producer_op : vector<1xindex>
|
|
/// xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
|
|
/// memref<256xf16>, vector<1xindex>, vector<1xi1>
|
|
/// Example 2 (chunk size, same mask and offsets):
|
|
/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
|
|
/// vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
|
|
/// To
|
|
/// xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
|
|
/// vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
|
|
///
|
|
/// Note that the store distribution pattern also handles leading unit
|
|
/// dimensions in the payload, mask and offsets vectors. In this case the store
|
|
/// distribution will only change the dimensions corresponding to the SG
|
|
/// distribution and keep the leading unit dimensions unchanged.
|
|
/// For example, a store with payload vector<1x16xf16> with lane layout [1, 16 ]
|
|
/// will be distributed as vector<1x1xf16>. Shapecast ops are inserted for the
|
|
/// offset/mask/payload when necessary so that the distributed store is workign
|
|
/// on 1D shape vector to match the HW capability.
|
|
struct StoreDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
Operation *lastNode = warpOp.getTerminator()->getPrevNode();
|
|
auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
|
|
if (!storeScatterOp)
|
|
return failure();
|
|
auto offsets = storeScatterOp.getOffsets();
|
|
if (!offsets || !isa<VectorType>(offsets.getType()))
|
|
return rewriter.notifyMatchFailure(
|
|
storeScatterOp, "Store op must have a vector of offsets argument");
|
|
VectorType offsetsTy = cast<VectorType>(offsets.getType());
|
|
VectorType maskTy = cast<VectorType>(storeScatterOp.getMask().getType());
|
|
VectorType storeVecTy = cast<VectorType>(storeScatterOp.getValueType());
|
|
|
|
// Add handling for leading unit dimensions support
|
|
int chunkSize = storeScatterOp.getChunkSize().value_or(1);
|
|
int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
|
|
|
|
// Check that all leading dimensions are unit dimensions
|
|
for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
|
|
if (storeVecTy.getShape()[i] != 1) {
|
|
return rewriter.notifyMatchFailure(
|
|
storeScatterOp, "Only unit dimensions allowed for the leading "
|
|
"dimensions of the store vector!");
|
|
}
|
|
}
|
|
|
|
auto layoutPayload =
|
|
xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(0));
|
|
auto layoutOffsets =
|
|
xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2));
|
|
auto layoutMask =
|
|
xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3));
|
|
|
|
FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
|
|
FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
|
|
FailureOr<VectorType> distMaskByWarpOpOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
|
|
if (failed(distStoreVecByWarpOpOrFailure) ||
|
|
failed(distOffsetsByWarpOpOrFailure) ||
|
|
failed(distMaskByWarpOpOrFailure)) {
|
|
return rewriter.notifyMatchFailure(
|
|
storeScatterOp,
|
|
"Some vector operands have no layouts, using defaults instead.");
|
|
}
|
|
|
|
VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
|
|
VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
|
|
VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
|
|
|
|
SmallVector<size_t> newRetIndices;
|
|
SmallVector<Value> operands = storeScatterOp->getOperands();
|
|
SmallVector<Type> operandTypesToYield = {
|
|
distPayloadTy, operands[1].getType(), distOffsetsTy, distMaskTy};
|
|
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
|
|
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
|
|
// Distributed store payload type is always 1D without leading unit dims
|
|
VectorType payloadTy1D = VectorType::get({distPayloadTy.getNumElements()},
|
|
distPayloadTy.getElementType());
|
|
|
|
VectorType distOffsetsTy1D = VectorType::get(
|
|
{distOffsetsTy.getNumElements()}, distOffsetsTy.getElementType());
|
|
VectorType distMaskTy1D = VectorType::get({distMaskTy.getNumElements()},
|
|
distMaskTy.getElementType());
|
|
|
|
// Resolve distributed types to 1D for SIMT execution
|
|
Value distPayloadVal = resolveDistributedTy(
|
|
newWarpOp.getResult(newRetIndices[0]), payloadTy1D, rewriter);
|
|
Value distOffsetVal = resolveDistributedTy(
|
|
newWarpOp.getResult(newRetIndices[2]), distOffsetsTy1D, rewriter);
|
|
Value distMaskVal = resolveDistributedTy(
|
|
newWarpOp.getResult(newRetIndices[3]), distMaskTy1D, rewriter);
|
|
|
|
SmallVector<Value> newStoreScatterOpOperands = {
|
|
distPayloadVal, newWarpOp.getResult(newRetIndices[1]), distOffsetVal,
|
|
distMaskVal};
|
|
|
|
xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
|
|
rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
|
|
storeScatterOp->getAttrs());
|
|
xegpu::removeLayoutAttrs(newOp);
|
|
rewriter.eraseOp(storeScatterOp);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Asks `layout` for the distributed coordinates of lane `laneId` over
/// `payloadShape` and adds them (right-aligned) to `origOffsets`.
/// Returns an empty vector when the layout fails to compute the coordinates.
static SmallVector<Value> computeDistributedCoordinatesForMatrixOp(
    PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout,
    Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) {
  auto distCoordsOrFailure =
      layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
  if (failed(distCoordsOrFailure))
    return {};
  assert((*distCoordsOrFailure).size() == 1 &&
         "Expected one set of distributed offsets");
  // Combine the single per-lane coordinate set with the original offsets.
  SmallVector<OpFoldResult> combinedOffsets = xegpu::addWithRightAligned(
      rewriter, loc, getAsOpFoldResult((*distCoordsOrFailure)[0]),
      getAsOpFoldResult(origOffsets));
  SmallVector<Value> distributedCoords =
      llvm::map_to_vector(combinedOffsets, llvm::CastTo<Value>);
  return distributedCoords;
}
|
|
|
|
/// Pattern for distributing xegpu::LoadMatrixOp.
|
|
struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
gpu::YieldOp yield = warpOp.getTerminator();
|
|
Operation *lastNode = yield->getPrevNode();
|
|
auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode);
|
|
if (!matrixOp)
|
|
return failure();
|
|
|
|
OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
|
|
return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op;
|
|
});
|
|
if (!producedByLastLoad)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "The last op is not xegpu::LoadMatrixOp");
|
|
const int operandIdx = producedByLastLoad->getOperandNumber();
|
|
|
|
VectorType sgPayloadTy =
|
|
dyn_cast<VectorType>(matrixOp.getResult().getType());
|
|
VectorType warpResultTy =
|
|
cast<VectorType>(warpOp.getResult(operandIdx).getType());
|
|
if (!sgPayloadTy)
|
|
return rewriter.notifyMatchFailure(
|
|
matrixOp, "the matrix op payload must be a vector type");
|
|
|
|
auto loc = matrixOp.getLoc();
|
|
auto offsets = matrixOp.getMixedOffsets();
|
|
if (offsets.empty())
|
|
return rewriter.notifyMatchFailure(matrixOp,
|
|
"the load op must have offsets");
|
|
SmallVector<Value> offsetsAsValues =
|
|
vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
|
|
|
|
auto layout = matrixOp.getLayoutAttr();
|
|
if (!layout)
|
|
return rewriter.notifyMatchFailure(
|
|
matrixOp, "the matrix operation lacks layout attribute");
|
|
|
|
FailureOr<VectorType> distPayloadByWarpOpOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
|
|
if (failed(distPayloadByWarpOpOrFailure))
|
|
return rewriter.notifyMatchFailure(
|
|
matrixOp, "Failed to distribute matrix op payload based on layout.");
|
|
|
|
SmallVector<Value> operands = {matrixOp.getMemDesc()};
|
|
const unsigned offsetsStartIdx = operands.size();
|
|
operands.append(offsetsAsValues);
|
|
|
|
SmallVector<Type> operandTypes =
|
|
llvm::map_to_vector(operands, [](Value v) { return v.getType(); });
|
|
|
|
SmallVector<size_t> newRetIndices;
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, operands, operandTypes, newRetIndices);
|
|
SmallVector<Value> newOperands = llvm::map_to_vector(
|
|
newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
|
|
|
|
SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
|
|
ShapedType::kDynamic);
|
|
DenseI64ArrayAttr newConstOffsetsAttr =
|
|
rewriter.getDenseI64ArrayAttr(newConstOffsets);
|
|
ValueRange currentOffsets =
|
|
ValueRange(newOperands).drop_front(offsetsStartIdx);
|
|
|
|
SmallVector<Value> newCoords = currentOffsets;
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
|
|
if (!matrixOp.getSubgroupBlockIoAttr()) {
|
|
newCoords = computeDistributedCoordinatesForMatrixOp(
|
|
rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
|
|
currentOffsets);
|
|
}
|
|
xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create(
|
|
rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure,
|
|
newOperands[0], ValueRange(newCoords), newConstOffsetsAttr,
|
|
matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
|
|
// Resolve the output type and replace all uses.
|
|
rewriter.replaceAllUsesWith(
|
|
newWarpOp.getResult(operandIdx),
|
|
resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter));
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Pattern for distributing xegpu::StoreMatrixOp.
|
|
struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
gpu::YieldOp yield = warpOp.getTerminator();
|
|
Operation *lastNode = yield->getPrevNode();
|
|
auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode);
|
|
if (!matrixOp)
|
|
return failure();
|
|
|
|
VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType());
|
|
if (!sgPayloadTy)
|
|
return rewriter.notifyMatchFailure(
|
|
matrixOp, "the matrix op payload must be a vector type");
|
|
|
|
auto loc = matrixOp.getLoc();
|
|
auto offsets = matrixOp.getMixedOffsets();
|
|
if (offsets.empty())
|
|
return rewriter.notifyMatchFailure(matrixOp,
|
|
"the store op must have offsets");
|
|
SmallVector<Value> offsetsAsValues =
|
|
vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
|
|
|
|
auto layout = matrixOp.getLayoutAttr();
|
|
if (!layout)
|
|
return rewriter.notifyMatchFailure(
|
|
matrixOp, "the matrix operation lacks layout attribute");
|
|
|
|
FailureOr<VectorType> distPayloadByWarpOpOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
|
|
if (failed(distPayloadByWarpOpOrFailure))
|
|
return rewriter.notifyMatchFailure(
|
|
matrixOp, "Failed to distribute matrix op payload based on layout.");
|
|
|
|
SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()};
|
|
const unsigned offsetsStartIdx = operands.size();
|
|
operands.append(offsetsAsValues);
|
|
|
|
SmallVector<Type> operandTypes =
|
|
llvm::map_to_vector(operands, [](Value v) { return v.getType(); });
|
|
operandTypes[0] = *distPayloadByWarpOpOrFailure;
|
|
|
|
SmallVector<size_t> newRetIndices;
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, operands, operandTypes, newRetIndices);
|
|
SmallVector<Value> newOperands = llvm::map_to_vector(
|
|
newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
|
|
|
|
SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
|
|
ShapedType::kDynamic);
|
|
DenseI64ArrayAttr newConstOffsetsAttr =
|
|
rewriter.getDenseI64ArrayAttr(newConstOffsets);
|
|
ValueRange currentOffsets =
|
|
ValueRange(newOperands).drop_front(offsetsStartIdx);
|
|
|
|
SmallVector<Value> newCoords = currentOffsets;
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
|
|
if (!matrixOp.getSubgroupBlockIoAttr()) {
|
|
newCoords = computeDistributedCoordinatesForMatrixOp(
|
|
rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
|
|
currentOffsets);
|
|
}
|
|
|
|
xegpu::StoreMatrixOp::create(
|
|
rewriter, loc, TypeRange{}, newOperands[0], newOperands[1],
|
|
ValueRange(newCoords), newConstOffsetsAttr,
|
|
matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
|
|
rewriter.eraseOp(matrixOp);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Distribute a scattered load op. The logic and requirements are the same as
|
|
/// for the scattered store distribution. The warpOp's payload vector is
|
|
/// expected to be distributed by the load's result consumer.
|
|
/// Example 1 (no chunk size):
|
|
/// %mask = producer_op : vector<16xi1>
|
|
/// %offset = producer_op : vector<16xindex>
|
|
/// %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>,
|
|
/// vector<16xindex>, vector<16xi1> -> vector<16xf16>
|
|
/// To
|
|
/// %mask = producer_op : vector<1xi1>
|
|
/// %offset = producer_op : vector<1xindex>
|
|
/// %0 = xegpu.load %payload, %src[%offset], %mask : memref<256xf16>,
|
|
/// vector<1xindex>, vector<1xi1> -> vector<1xf16>
|
|
/// Example 2 (chunk size, same mask and offsets):
|
|
/// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> :
|
|
/// memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
|
|
/// To
|
|
/// %0 = xegpu.load %payload, %src[%offset], %mask <{chunk_size=8}> :
|
|
/// memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
|
|
///
|
|
/// Note that the load distribution pattern also handles leading unit dimensions
|
|
/// in the payload, mask, and offsets vector.The load distribution will only
|
|
/// change the dimensions corresponding to the SG distribution and keep the
|
|
/// leading unit dimensions unchanged. For example, a load with result type
|
|
/// vector<1x16xf16> with lane layout [1, 16 ] will be distributed
|
|
/// as result type vector<1x1xf16>. Shapecast ops are inserted for the
|
|
/// offset/mask/payload when necessary so that the distributed load is workign
|
|
/// on 1D shape vector to match the HW capability.
|
|
struct LoadDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
|
|
// Check if the yield operand that was produced by the *last* scattered
|
|
// load op to avoid sinking it before barriers (maintain memory order).
|
|
return isa<xegpu::LoadGatherOp>(op) &&
|
|
warpOp.getTerminator()->getPrevNode() == op;
|
|
});
|
|
if (!producedByLastLoad)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "The last op is not xegpu::LoadGatherOp");
|
|
|
|
auto loadGatherOp =
|
|
producedByLastLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
|
|
auto offsets = loadGatherOp.getOffsets();
|
|
if (!offsets || !isa<VectorType>(offsets.getType()) ||
|
|
!isa<VectorType>(loadGatherOp.getMask().getType()))
|
|
return rewriter.notifyMatchFailure(
|
|
loadGatherOp,
|
|
"Load op must have a vector arguments for offsets and mask");
|
|
VectorType offsetsTy = cast<VectorType>(offsets.getType());
|
|
VectorType maskTy = cast<VectorType>(loadGatherOp.getMask().getType());
|
|
VectorType resultVecTy =
|
|
cast<VectorType>(loadGatherOp.getResult().getType());
|
|
// add handling leading unit dimensions support
|
|
int chunkSize = loadGatherOp.getChunkSize().value_or(1);
|
|
int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
|
|
for (int i = 0; i < resultVecTy.getRank() - effectiveVecRank; i++) {
|
|
if (resultVecTy.getShape()[i] != 1) {
|
|
return rewriter.notifyMatchFailure(
|
|
loadGatherOp, "Only unit dimensions allowed for the leading "
|
|
"dimensions of the load vector!");
|
|
}
|
|
}
|
|
|
|
auto layoutOffsets =
|
|
xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(1));
|
|
auto layoutMask = xegpu::getTemporaryLayout(loadGatherOp->getOpOperand(2));
|
|
|
|
FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
|
|
FailureOr<VectorType> distMaskByWarpOpOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
|
|
if (failed(distOffsetsByWarpOpOrFailure) ||
|
|
failed(distMaskByWarpOpOrFailure)) {
|
|
return rewriter.notifyMatchFailure(
|
|
loadGatherOp,
|
|
"Some vector operands have no layouts, using defaults instead.");
|
|
}
|
|
|
|
SmallVector<size_t> newRetIndices;
|
|
SmallVector<Value> operands = loadGatherOp->getOperands();
|
|
|
|
const unsigned operandIdx = producedByLastLoad->getOperandNumber();
|
|
VectorType distResultTy =
|
|
cast<VectorType>(warpOp.getResult(operandIdx).getType());
|
|
VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
|
|
VectorType distMaskTy = distMaskByWarpOpOrFailure.value();
|
|
|
|
SmallVector<Type> operandTypesToYield = {operands[0].getType(),
|
|
distOffsetsTy, distMaskTy};
|
|
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, operands, operandTypesToYield, newRetIndices);
|
|
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
|
|
// Distributed load op will always be 1D.
|
|
VectorType loadVecTy1D = VectorType::get({distResultTy.getNumElements()},
|
|
distResultTy.getElementType());
|
|
|
|
VectorType distOffsetsTy1D =
|
|
VectorType::get({distOffsetsByWarpOpOrFailure.value().getNumElements()},
|
|
distOffsetsByWarpOpOrFailure.value().getElementType());
|
|
VectorType distMaskTy1D =
|
|
VectorType::get({distMaskByWarpOpOrFailure.value().getNumElements()},
|
|
distMaskByWarpOpOrFailure.value().getElementType());
|
|
|
|
Value distOffsetVal = resolveDistributedTy(
|
|
newWarpOp.getResult(newRetIndices[1]), distOffsetsTy1D, rewriter);
|
|
Value distmaskVal = resolveDistributedTy(
|
|
newWarpOp.getResult(newRetIndices[2]), distMaskTy1D, rewriter);
|
|
|
|
SmallVector<Value> newLoadGatherOperands = {
|
|
newWarpOp.getResult(newRetIndices[0]), distOffsetVal, distmaskVal};
|
|
|
|
xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
|
|
rewriter, newWarpOp.getLoc(), loadVecTy1D, newLoadGatherOperands,
|
|
loadGatherOp->getAttrs());
|
|
xegpu::removeLayoutAttrs(newOp);
|
|
Value distributedVal = newWarpOp.getResult(operandIdx);
|
|
// Resolve the output type and replace all uses.
|
|
rewriter.replaceAllUsesWith(
|
|
distributedVal,
|
|
resolveDistributedTy(newOp.getResult(), distResultTy, rewriter));
|
|
return success();
|
|
}
|
|
};
|
|
|
|
// Sink SG-uniform ops. An op is uniform if none
|
|
// of its operands/results has a distribution layout attribute.
|
|
// Non-uniform vectors are handled by dedicated patterns.
|
|
// This pattern must have a higher priority than vector dialect distribution
|
|
// patterns, because a distributable shape may be logically intended as
|
|
// uniform (i.e., no layout), so we want to omit its distribution.
|
|
struct SinkUniformOps final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
// Take the last op
|
|
Operation *warpRegionPreYieldOp = warpOp.getTerminator()->getPrevNode();
|
|
// Any ops with nested regions must be handled carefully in dedicated
|
|
// patterns.
|
|
if (!warpRegionPreYieldOp || warpRegionPreYieldOp->getNumRegions())
|
|
return failure();
|
|
int operandIdx = -1;
|
|
if (warpRegionPreYieldOp->getNumResults()) {
|
|
OpOperand *operand = getWarpResult(
|
|
warpOp, [&](Operation *op) { return warpRegionPreYieldOp == op; });
|
|
if (!operand)
|
|
return failure();
|
|
operandIdx = operand->getOperandNumber();
|
|
if (warpRegionPreYieldOp->getResult(0).getType() !=
|
|
warpOp.getResult(operandIdx).getType())
|
|
return rewriter.notifyMatchFailure(warpOp,
|
|
"The op result is not uniform.");
|
|
}
|
|
|
|
// The op must have no layout-based operands or results.
|
|
bool uniformValuesOnly =
|
|
llvm::all_of(warpRegionPreYieldOp->getResults(), [](Value v) {
|
|
return !xegpu::getDistributeLayoutAttr(v);
|
|
});
|
|
uniformValuesOnly &=
|
|
llvm::all_of(warpRegionPreYieldOp->getOpOperands(), [](OpOperand &opr) {
|
|
return !xegpu::getDistributeLayoutAttr(opr);
|
|
});
|
|
if (!uniformValuesOnly)
|
|
return rewriter.notifyMatchFailure(warpOp,
|
|
"Some values are not uniform.");
|
|
SmallVector<size_t> newRetIndices;
|
|
SmallVector<Value> operands =
|
|
llvm::to_vector_of<Value>(warpRegionPreYieldOp->getOperands());
|
|
SmallVector<Type> operandTypes =
|
|
llvm::to_vector_of<Type>(warpRegionPreYieldOp->getOperandTypes());
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, operands, operandTypes, newRetIndices);
|
|
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
IRMapping operandMapper;
|
|
for (auto [oldOperandIdx, newOperandIdx] : llvm::enumerate(newRetIndices))
|
|
operandMapper.map(warpRegionPreYieldOp->getOperand(oldOperandIdx),
|
|
newWarpOp->getResult(newOperandIdx));
|
|
Operation *clonedOp = rewriter.clone(*warpRegionPreYieldOp, operandMapper);
|
|
if (!clonedOp->getNumResults())
|
|
rewriter.eraseOp(warpRegionPreYieldOp);
|
|
else {
|
|
assert(operandIdx != -1 && "Expected a warp result for the operation");
|
|
rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx),
|
|
clonedOp->getResult(0));
|
|
}
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// This patterns distribute the `vector.multi_reduction` operation across
|
|
/// lanes in a warp. Currently only 2D to 1D reductions are supported. Given
|
|
/// layouts for the source and accumulator vectors,
|
|
/// * If the reduction dimension is distributed across lanes, the reduction is
|
|
/// non-lane-local and the reduction is done using warp shuffles. Here we
|
|
/// simply rewrite the MultiDimReductionOp to a sequence of ReductionOps in
|
|
/// the warp op body.
|
|
/// * If the reduction dimension is not distributed across lanes, the reduction
|
|
/// is lane-local. In this case, we yield the source and accumulator vectors
|
|
/// from the warp op and perform the lane-local reduction outside the warp op
|
|
/// using a sequence of ReductionOps.
|
|
/// Example 1 (Reduction is lane-local):
|
|
/// ```
|
|
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
|
|
/// %0 = "some_def"() : () -> (vector<16x32xf32>)
|
|
/// %acc = "some_def"() : () -> (vector<32xf32>)
|
|
/// %1 = vector.multi_reduction <add>, %0, %acc [0] : vector<16x32xf32> to
|
|
/// vector<32xf32> gpu.yield %1 : vector<32xf32>
|
|
/// }
|
|
/// ```
|
|
/// is lowered to:
|
|
/// ```
|
|
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<16x1xf32>,
|
|
/// vector<1xf32>) {
|
|
/// %0 = "some_def"() : () -> (vector<16x32xf32>)
|
|
/// %acc = "some_def"() : () -> (vector<32xf32>)
|
|
/// gpu.yield %0, %acc : vector<16x32xf32>, vector<32xf32>
|
|
/// }
|
|
/// %c = arith.constant dense<0.0> : vector<1xf32>
|
|
/// %1 = vector.shape_cast %r#0 : vector<16x1xf32> to vector<16xf32>
|
|
/// %2 = vector.reduction <add>, %1, %r#1 : vector<16xf32> to f32
|
|
/// %3 = vector.insert %2, %c[0] : f32 into vector<1xf32>
|
|
/// ```
|
|
/// Example 2 (Reduction is non-lane-local):
|
|
/// ```
|
|
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
|
|
/// %0 = "some_def"() : () -> (vector<2x32xf32>)
|
|
/// %acc = "some_def"() : () -> (vector<2xf32>)
|
|
/// %1 = vector.multi_reduction <add>, %0, %acc [1] : vector<2x32xf32> to
|
|
/// vector<2xf32>
|
|
/// gpu.yield %1 : vector<2xf32>
|
|
/// }
|
|
/// ```
|
|
/// is lowered to:
|
|
/// ```
|
|
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
|
|
/// %0 = "some_def"() : () -> (vector<2x32xf32>)
|
|
/// %acc = "some_def"() : () -> (vector<2xf32>)
|
|
/// %1 = arith.constant dense<0.0> : vector<2xf32>
|
|
/// %2 = vector.extract %0[0] : vector<32xf32> from <vector<2x32xf32>>
|
|
/// %3 = ("warp.reduction %2") : f32
|
|
/// %4 = vector.insert %3, %1[0] : f32 into vector<2xf32>
|
|
/// ... repeat for row 1
|
|
/// gpu.yield %1 : vector<2xf32>
|
|
/// }
|
|
struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *yieldOperand =
|
|
getWarpResult(warpOp, llvm::IsaPred<vector::MultiDimReductionOp>);
|
|
if (!yieldOperand)
|
|
return failure();
|
|
auto reductionOp =
|
|
cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
|
|
unsigned operandIdx = yieldOperand->getOperandNumber();
|
|
VectorType sourceType = reductionOp.getSourceVectorType();
|
|
// Only 2D vectors are supported.
|
|
if (sourceType.getRank() != 2)
|
|
return rewriter.notifyMatchFailure(warpOp,
|
|
"Only 2D reductions are supported.");
|
|
ArrayRef<int64_t> reductionDims = reductionOp.getReductionDims();
|
|
// Only 1 reduction dimension supported. This also ensures that the result
|
|
// is vector type.
|
|
if (reductionDims.size() != 1)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Only 1 reduction dimension is supported.");
|
|
int64_t reductionDim = reductionDims[0];
|
|
VectorType distributedResultType =
|
|
cast<VectorType>(warpOp.getResult(operandIdx).getType());
|
|
VectorType resultType = cast<VectorType>(reductionOp.getType());
|
|
xegpu::DistributeLayoutAttr sourceLayout =
|
|
xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));
|
|
|
|
FailureOr<VectorType> sourceDistTypeOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
|
|
if (failed(sourceDistTypeOrFailure))
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Failed to distribute the source vector type.");
|
|
VectorType sourceDistType = sourceDistTypeOrFailure.value();
|
|
// Only single dimension distribution is supported.
|
|
bool dim0Distributed =
|
|
sourceDistType.getShape()[0] != sourceType.getShape()[0];
|
|
bool dim1Distributed =
|
|
sourceDistType.getShape()[1] != sourceType.getShape()[1];
|
|
if (dim0Distributed && dim1Distributed)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Expecting source to be distributed in a single dimension.");
|
|
int64_t sourceDistDim = dim0Distributed ? 0 : (dim1Distributed ? 1 : -1);
|
|
if (sourceDistDim == -1)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Expecting a distributed source vector.");
|
|
bool resultDistributed =
|
|
distributedResultType.getNumElements() < resultType.getNumElements();
|
|
// If the lane owns all the data required for reduction (i.e. reduction is
|
|
// fully parallel accross lanes), then each lane owns part of the result
|
|
// (i.e. result is distributed). If the reduction require cross-lane
|
|
// shuffling, then the result is shared among all lanes (broadcasted).
|
|
// Therefore we expect following cases:
|
|
//
|
|
// | Source vector | Reduction dim | Result vector |
|
|
// |----------------------|----------------|----------------|
|
|
// | dim-0 distributed | 0 | broadcasted |
|
|
// | dim-0 distributed | 1 | distributed |
|
|
// | dim-1 distributed | 0 | distributed |
|
|
// | dim-1 distributed | 1 | broadcasted |
|
|
|
|
bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
|
|
(sourceDistDim == 1 && reductionDim == 0);
|
|
if (isReductionLaneLocal && !resultDistributed)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Expecting a distributed result for lane-local reduction.");
|
|
|
|
if (!isReductionLaneLocal && resultDistributed)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp,
|
|
"Expecting a broadcasted result for non-lane-local reduction.");
|
|
|
|
// Handle lane-local reduction case. In this case we fully distribute the
|
|
// reduction result.
|
|
if (isReductionLaneLocal) {
|
|
// Yield the source and acc vectors from the WarpOp.
|
|
SmallVector<size_t> newRetIndices;
|
|
auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
|
|
{sourceDistType, distributedResultType}, newRetIndices);
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
Value result = xegpu::lowerToVectorReductions(
|
|
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
|
|
cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
|
|
reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
|
|
// Replace the warp op result with the final result.
|
|
rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
|
|
return success();
|
|
}
|
|
// For non-lane-local case, we simply rewrite the MultiReductionOp in terms
|
|
// of multiple ReductionOps. Actual distribution is done by the
|
|
// WarpOpReduction pattern.
|
|
rewriter.setInsertionPointAfter(reductionOp);
|
|
Value result = xegpu::lowerToVectorReductions(
|
|
cast<TypedValue<VectorType>>(reductionOp.getSource()),
|
|
cast<TypedValue<VectorType>>(reductionOp.getAcc()),
|
|
reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
|
|
// Replace the reduction op result with the final result.
|
|
rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// This pattern distributes the `vector.broadcast` operation across lanes in a
|
|
/// warp. The pattern supports three use cases:
|
|
///
|
|
/// 1) Broadcast a low-rank vector to high-rank vector: The low-rank input
|
|
/// vector
|
|
/// must have a slice layout of the result. If the distributed source and
|
|
/// target vector types are identical, this lowers to a no-op; otherwise, it
|
|
/// remains a broadcast but operates on distributed vectors.
|
|
///
|
|
/// 2) Broadcast a same-rank vector with identical layouts for source and
|
|
/// target:
|
|
/// The source vector must have unit dimensions, and lane_data must be unit
|
|
/// size for those unit dims. This always lowers to a no-op.
|
|
///
|
|
/// 3) Broadcast a scalar with no layout: This always lowers to a broadcast from
|
|
/// scalar to distributed result type.
|
|
///
|
|
/// Example 1 (lowering to a broadcast with distributed types):
|
|
/// ```
|
|
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
|
|
/// %0 = "some_def"() {layout_result_0 =
|
|
/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
|
|
/// dims = [0]> } : () -> (vector<32xf32>)
|
|
/// %2 = vector.broadcast %0 {layout_result_0 =
|
|
/// #xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>}
|
|
/// : vector<32xf32> to vector<8x32xf32>
|
|
/// gpu.yield %1 : vector<8x32xf32>
|
|
/// }
|
|
/// ```
|
|
/// is lowered to:
|
|
/// ```
|
|
/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
|
|
/// %0 = "some_def"() {layout_result_0 =
|
|
/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
|
|
/// dims = [0]> } : () -> (vector<32xf32>)
|
|
/// gpu.yield %0 : vector<32xf32>
|
|
/// }
|
|
/// %2 = vector.broadcast %r#0 : vector<1xf32> to vector<8x1xf32>
|
|
///
|
|
/// Example 2 (no-op):
|
|
/// ```
|
|
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x32xf32>) {
|
|
/// %0 = "some_def"() {layout_result_0 =
|
|
/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
|
|
/// dims = [1]> } : () -> (vector<8xf32>)
|
|
/// %1 = vector.shape_cast %0
|
|
/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
|
|
/// 1]>}: vector<8xf32> to vector<8x1xf32>
|
|
/// %2 = vector.broadcast %1
|
|
/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
|
|
/// 1]>}: vector<8x1xf32> to vector<8x32xf32>
|
|
/// gpu.yield %1 : vector<8x32xf32>
|
|
/// }
|
|
/// ```
|
|
/// is lowered to:
|
|
/// ```
|
|
/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
|
|
/// %0 = "some_def"() {layout_result_0 =
|
|
/// #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
|
|
/// dims = [1]> } : () -> (vector<8xf32>)
|
|
/// %1 = vector.shape_cast %0
|
|
/// {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
|
|
/// 1]>}: vector<8xf32> to vector<8x1xf32>
|
|
/// gpu.yield %1 : vector<8x1xf32>
|
|
/// }
|
|
/// // The broadcast is implicit through layout transformation (no-op)
|
|
/// "some_use"(%r#0)
|
|
/// ```
|
|
struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *yieldOperand =
|
|
getWarpResult(warpOp, llvm::IsaPred<vector::BroadcastOp>);
|
|
if (!yieldOperand)
|
|
return failure();
|
|
auto broadcastOp =
|
|
cast<vector::BroadcastOp>(yieldOperand->get().getDefiningOp());
|
|
unsigned operandIdx = yieldOperand->getOperandNumber();
|
|
|
|
VectorType sourceType = dyn_cast<VectorType>(broadcastOp.getSourceType());
|
|
VectorType destType =
|
|
dyn_cast<VectorType>(broadcastOp.getResult().getType());
|
|
|
|
xegpu::DistributeLayoutAttr sourceLayout =
|
|
xegpu::getTemporaryLayout(broadcastOp->getOpOperand(0));
|
|
xegpu::DistributeLayoutAttr resultLayout =
|
|
xegpu::getTemporaryLayout(dyn_cast<OpResult>(broadcastOp.getResult()));
|
|
|
|
FailureOr<VectorType> sourceDistType;
|
|
Type sourceElemOrDistType;
|
|
if (sourceType) {
|
|
|
|
// Case 1 and 2: source is a vector type.
|
|
int64_t rankDiff = destType.getRank() - sourceType.getRank();
|
|
if (rankDiff > 0) {
|
|
// Case 1: source is lower-rank than result.
|
|
bool isSliceOf = sourceLayout.isSliceOf(resultLayout);
|
|
if (!isSliceOf)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp,
|
|
"Broadcast input layout must be a slice of result layout.");
|
|
}
|
|
// case 2: source and result have same rank
|
|
if (rankDiff == 0) {
|
|
auto broadcastUnitDimsSet = broadcastOp.computeBroadcastedUnitDims();
|
|
SmallVector<int64_t> broadcastUnitDims(broadcastUnitDimsSet.begin(),
|
|
broadcastUnitDimsSet.end());
|
|
bool isEqualTo = sourceLayout.isEqualTo(resultLayout);
|
|
if (!isEqualTo)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "For same-rank broadcast, source must be identical to "
|
|
"adjusted result layouts with unit dims.");
|
|
resultLayout = resultLayout.setUnitDimData(broadcastUnitDims);
|
|
sourceLayout = sourceLayout.setUnitDimLayout(broadcastUnitDims);
|
|
}
|
|
|
|
sourceDistType =
|
|
getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
|
|
if (failed(sourceDistType)) {
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Failed to distribute the source vector type.");
|
|
}
|
|
sourceElemOrDistType = sourceDistType.value();
|
|
|
|
} else {
|
|
// Case 3: source is a scalar type.
|
|
if (sourceLayout) {
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Broadcast from scalar must not have a layout attribute.");
|
|
}
|
|
sourceElemOrDistType = broadcastOp.getSourceType();
|
|
}
|
|
FailureOr<VectorType> destDistType =
|
|
getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
|
|
if (failed(destDistType)) {
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Failed to distribute the dest vector type.");
|
|
}
|
|
|
|
SmallVector<size_t> newRetIndices;
|
|
auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, {broadcastOp.getSource()}, sourceElemOrDistType,
|
|
newRetIndices);
|
|
|
|
Value distributedSource = newWarpOp.getResult(newRetIndices[0]);
|
|
|
|
Value newBroadcast = distributedSource;
|
|
|
|
if (sourceElemOrDistType != destDistType.value()) {
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
newBroadcast =
|
|
vector::BroadcastOp::create(rewriter, newWarpOp.getLoc(),
|
|
destDistType.value(), distributedSource);
|
|
}
|
|
|
|
rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newBroadcast);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Distribute a `vector.shape_cast` op feeding into yield op of an enclosing
|
|
/// `gpu.warp_execute_on_lane_0` region.
|
|
struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *yieldOperand =
|
|
getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>);
|
|
if (!yieldOperand)
|
|
return failure();
|
|
auto shapeCastOp =
|
|
cast<vector::ShapeCastOp>(yieldOperand->get().getDefiningOp());
|
|
unsigned operandNumber = yieldOperand->getOperandNumber();
|
|
auto resultDistTy =
|
|
cast<VectorType>(warpOp.getResult(operandNumber).getType());
|
|
xegpu::DistributeLayoutAttr sourceLayout =
|
|
xegpu::getTemporaryLayout(shapeCastOp->getOpOperand(0));
|
|
xegpu::DistributeLayoutAttr resultLayout =
|
|
xegpu::getTemporaryLayout(dyn_cast<OpResult>(shapeCastOp.getResult()));
|
|
if (!sourceLayout || !resultLayout)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp,
|
|
"the source or result of shape_cast op lacks distribution layout");
|
|
|
|
FailureOr<VectorType> sourceDistTypeOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(sourceLayout,
|
|
shapeCastOp.getSourceVectorType());
|
|
if (failed(sourceDistTypeOrFailure))
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "failed to get distributed vector type for source");
|
|
VectorType sourceDistType = sourceDistTypeOrFailure.value();
|
|
// Create a new warp op that yields the source of the shape_cast op.
|
|
SmallVector<size_t> newRetIndices;
|
|
auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, {shapeCastOp.getSource()}, {sourceDistType},
|
|
newRetIndices);
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
Value source = newWarpOp.getResult(newRetIndices[0]);
|
|
// Create a new shape_cast op outside the warp op.
|
|
Value newShapeCast = vector::ShapeCastOp::create(
|
|
rewriter, shapeCastOp.getLoc(), resultDistTy, source);
|
|
rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
|
|
newShapeCast);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
// Distribute a `vector.extract_strided_slice` op feeding into yield op of an
|
|
// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers
|
|
// advanced cases where the distributed dimension is partially extracted and
|
|
// currently not supported by the generic vector distribution patterns.
|
|
struct VectorExtractStridedSliceDistribution
|
|
: public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *operand =
|
|
getWarpResult(warpOp, llvm::IsaPred<vector::ExtractStridedSliceOp>);
|
|
if (!operand)
|
|
return failure();
|
|
auto extractOp =
|
|
cast<vector::ExtractStridedSliceOp>(operand->get().getDefiningOp());
|
|
unsigned operandIdx = operand->getOperandNumber();
|
|
auto distributedType =
|
|
cast<VectorType>(warpOp.getResult(operandIdx).getType());
|
|
// Find the distributed dimensions.
|
|
auto extractResultType = cast<VectorType>(operand->get().getType());
|
|
auto distributedDims =
|
|
getDistributedDims(extractResultType, distributedType);
|
|
// Collect updated source type, sizes and offsets. They may be adjusted
|
|
// later if the data is distributed to lanes (as opposed to being owned by
|
|
// all lanes uniformly).
|
|
VectorType updatedSourceType = extractOp.getSourceVectorType();
|
|
SmallVector<Attribute> updatedSizes = llvm::map_to_vector(
|
|
extractOp.getSizes(), [](Attribute attr) { return attr; });
|
|
SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
|
|
extractOp.getOffsets(), [](Attribute attr) { return attr; });
|
|
SmallVector<Attribute> updatedStrides = llvm::map_to_vector(
|
|
extractOp.getStrides(), [](Attribute attr) { return attr; });
|
|
// If the provided sizes, offsets, strides are less than the rank, pad them
|
|
// with full sizes, zero offsets, and unit strides. This makes it easier to
|
|
// adjust them later.
|
|
int64_t sourceRank = extractOp.getSourceVectorType().getRank();
|
|
for (int64_t i = extractOp.getSizes().size(); i < sourceRank; ++i) {
|
|
updatedSizes.push_back(rewriter.getI64IntegerAttr(
|
|
extractOp.getSourceVectorType().getDimSize(i)));
|
|
updatedOffsets.push_back(rewriter.getI64IntegerAttr(0));
|
|
updatedStrides.push_back(
|
|
rewriter.getI64IntegerAttr(1)); // stride is always 1.
|
|
}
|
|
// If the result is distributed, it must be distributed in exactly one
|
|
// dimension. In this case, we adjust the sourceDistType, distributedSizes
|
|
// and distributedOffsets accordingly.
|
|
if (distributedDims.size() > 0) {
|
|
if (distributedDims.size() != 1)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Source can not be distributed in multiple dimensions.");
|
|
int64_t distributedDim = distributedDims[0];
|
|
int sourceDistrDimSize =
|
|
extractOp.getSourceVectorType().getShape()[distributedDim];
|
|
auto sourceLayout = xegpu::getTemporaryLayout(extractOp->getOpOperand(0));
|
|
if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "the source of extract_strided_slice op lacks distribution "
|
|
"layout");
|
|
auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
|
|
// Because only single dimension distribution is supported, lane layout
|
|
// size at the distributed dim must be the subgroup size.
|
|
int subgroupSize = sourceLaneLayout[distributedDim];
|
|
// Check if the source size in the distributed dimension is a multiple of
|
|
// subgroup size.
|
|
if (sourceDistrDimSize % subgroupSize != 0)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp,
|
|
"Source size along distributed dimension is not a multiple of "
|
|
"subgroup size.");
|
|
auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
|
|
// We expect lane data to be all ones in this case.
|
|
if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Expecting unit lane data in source layout");
|
|
// The offsets in the distributed dimention must be a multiple of subgroup
|
|
// size.
|
|
int64_t distrDimOffset =
|
|
cast<IntegerAttr>(updatedOffsets[distributedDim]).getInt();
|
|
if (distrDimOffset % subgroupSize != 0)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Offset along distributed dimension "
|
|
"is not a multiple of subgroup size.");
|
|
updatedSourceType = getDistVecTypeBasedOnLaneLayout(
|
|
sourceLayout, extractOp.getSourceVectorType())
|
|
.value();
|
|
// Update the distributed sizes to match the distributed type.
|
|
updatedSizes[distributedDim] = rewriter.getI64IntegerAttr(
|
|
distributedType.getDimSize(distributedDim));
|
|
// Update the distributed offsets to match round robin distribution (i.e.
|
|
// each lane owns data at `subgroupSize` stride given unit lane data).
|
|
updatedOffsets[distributedDim] =
|
|
rewriter.getI64IntegerAttr(distrDimOffset / subgroupSize);
|
|
}
|
|
// Do the distribution by yielding the source of the extract op from
|
|
// the warp op and creating a new extract op outside the warp op.
|
|
SmallVector<size_t> newRetIndices;
|
|
auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, {extractOp.getSource()}, {updatedSourceType},
|
|
newRetIndices);
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
Value source = newWarpOp.getResult(newRetIndices[0]);
|
|
// Create a new extract op outside the warp op.
|
|
Value newExtractOp = vector::ExtractStridedSliceOp::create(
|
|
rewriter, extractOp.getLoc(), distributedType, source,
|
|
ArrayAttr::get(rewriter.getContext(), updatedOffsets),
|
|
ArrayAttr::get(rewriter.getContext(), updatedSizes),
|
|
ArrayAttr::get(rewriter.getContext(), updatedStrides));
|
|
rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newExtractOp);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Distribute a `vector.insert_strided_slice` op feeding into yield op of an
|
|
/// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers
|
|
/// advanced cases where the distributed dimension is partially inserted and
|
|
/// currently not supported by the generic vector distribution patterns.
|
|
struct VectorInsertStridedSliceDistribution
|
|
: public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
|
|
// Check if the InsertStridedSliceOp is the last op before yield op
|
|
return llvm::IsaPred<vector::InsertStridedSliceOp>(op) &&
|
|
warpOp.getTerminator()->getPrevNode() == op;
|
|
});
|
|
if (!operand)
|
|
return failure();
|
|
unsigned int operandNumber = operand->getOperandNumber();
|
|
auto insertOp =
|
|
operand->get().getDefiningOp<vector::InsertStridedSliceOp>();
|
|
auto distributedType =
|
|
cast<VectorType>(warpOp.getResult(operandNumber).getType());
|
|
// Find the distributed dimensions of the dest vector.
|
|
auto insertResultType = cast<VectorType>(operand->get().getType());
|
|
auto destDistributedDims =
|
|
getDistributedDims(insertResultType, distributedType);
|
|
// Collect updated offsets, source type and dest type. They may be adjusted
|
|
// later if the data is distributed to lanes (as opposed to being owned by
|
|
// all lanes uniformly).
|
|
SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
|
|
insertOp.getOffsets(), [](Attribute attr) { return attr; });
|
|
VectorType updatedSourceType = insertOp.getSourceVectorType();
|
|
VectorType updatedDestType = insertOp.getDestVectorType();
|
|
if (destDistributedDims.size() > 0) {
|
|
// Only single dimension distribution is supported.
|
|
if (destDistributedDims.size() != 1)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp,
|
|
"Expecting source to be distributed in a single dimension.");
|
|
int64_t destDistributedDim = destDistributedDims[0];
|
|
|
|
VectorType srcType = insertOp.getSourceVectorType();
|
|
VectorType destType = insertOp.getDestVectorType();
|
|
// Currently we require that both source (kD) and dest (nD) vectors are
|
|
// distributed. This requires that distributedDim (d) is contained in the
|
|
// last k dims of the dest vector (d >= n - k).
|
|
int64_t sourceDistributedDim =
|
|
destDistributedDim - (destType.getRank() - srcType.getRank());
|
|
if (sourceDistributedDim < 0)
|
|
return rewriter.notifyMatchFailure(
|
|
insertOp,
|
|
"distributed dimension must be in the last k (i.e. source "
|
|
"rank) dims of dest vector");
|
|
int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim);
|
|
// Obtain the source and dest layouts.
|
|
auto destLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(1));
|
|
auto sourceLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(0));
|
|
if (!destLayout || !sourceLayout ||
|
|
destLayout.getEffectiveLaneLayoutAsInt().empty() ||
|
|
sourceLayout.getEffectiveLaneLayoutAsInt().empty())
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "the source or dest of insert_strided_slice op lacks "
|
|
"distribution layout");
|
|
// Because only single dimension distribution is supported, lane layout
|
|
// size at the distributed dim must be the subgroup size.
|
|
int subgroupSize =
|
|
destLayout.getEffectiveLaneLayoutAsInt()[destDistributedDim];
|
|
// We require that source and dest lane data are all ones to ensure
|
|
// uniform round robin distribution.
|
|
auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
|
|
auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
|
|
if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
|
|
!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Expecting unit lane data in source and dest layouts");
|
|
// Source distributed dim size must be multiples of subgroup size.
|
|
if (srcDistrDimSize % subgroupSize != 0)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "Distributed dimension size in source is not a multiple of "
|
|
"subgroup size.");
|
|
// Offsets in the distributed dimension must be multiples of subgroup
|
|
// size.
|
|
int64_t destDistrDimOffset =
|
|
cast<IntegerAttr>(insertOp.getOffsets()[destDistributedDim]).getInt();
|
|
if (destDistrDimOffset % subgroupSize != 0)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp,
|
|
"Offset along distributed dimension in dest is not a multiple of "
|
|
"subgroup size.");
|
|
// Update the source and dest types based on their layouts.
|
|
updatedSourceType = getDistVecTypeBasedOnLaneLayout(
|
|
sourceLayout, insertOp.getSourceVectorType())
|
|
.value();
|
|
updatedDestType = getDistVecTypeBasedOnLaneLayout(
|
|
destLayout, insertOp.getDestVectorType())
|
|
.value();
|
|
// Update the distributed offsets to match round robin distribution (i.e.
|
|
// each lane owns data at `subgroupSize` stride given unit lane data).
|
|
updatedOffsets[destDistributedDim] =
|
|
rewriter.getI64IntegerAttr(destDistrDimOffset / subgroupSize);
|
|
}
|
|
// Do the distribution by yielding the source and dest of the insert op
|
|
// from the warp op and creating a new insert op outside the warp op.
|
|
SmallVector<size_t> newRetIndices;
|
|
auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, {insertOp.getValueToStore(), insertOp.getDest()},
|
|
{updatedSourceType, updatedDestType}, newRetIndices);
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
|
|
Value valueToStore = newWarpOp.getResult(newRetIndices[0]);
|
|
Value dest = newWarpOp.getResult(newRetIndices[1]);
|
|
// Create a new insert op outside the warp op.
|
|
Value newInsertOp = vector::InsertStridedSliceOp::create(
|
|
rewriter, insertOp.getLoc(), updatedDestType, valueToStore, dest,
|
|
ArrayAttr::get(rewriter.getContext(), updatedOffsets),
|
|
insertOp.getStrides());
|
|
rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
|
|
newInsertOp);
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Sink a memref::ExtractAlignedPointerAsIndex op feeding into yield op of an
|
|
/// enclosing `gpu.warp_execute_on_lane_0` region. This will simply move the op
|
|
/// outside of the warp op.
|
|
struct MemrefExtractAlignedPointerAsIndexDistribution final
|
|
: public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *operand = getWarpResult(
|
|
warpOp, llvm::IsaPred<memref::ExtractAlignedPointerAsIndexOp>);
|
|
if (!operand)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp,
|
|
"warp result is not a memref::MemrefExtractAlignedPointerAsIndex op");
|
|
auto extractOp =
|
|
operand->get().getDefiningOp<memref::ExtractAlignedPointerAsIndexOp>();
|
|
unsigned operandIdx = operand->getOperandNumber();
|
|
SmallVector<size_t> newRetIndices;
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, extractOp.getSource(),
|
|
TypeRange{extractOp.getSource().getType()}, newRetIndices);
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
auto newExtractOp = memref::ExtractAlignedPointerAsIndexOp::create(
|
|
rewriter, newWarpOp.getLoc(), extractOp.getType(),
|
|
newWarpOp.getResult(newRetIndices[0]));
|
|
Value resultVal = newWarpOp.getResult(operandIdx);
|
|
rewriter.replaceAllUsesWith(resultVal, newExtractOp.getResult());
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Distribute a vector::BitCastOp feeding into yield op of an enclosing
|
|
/// `gpu.warp_execute_on_lane_0` region. Bitcast only impacts the innermost
|
|
/// diemension of the source/result vectors. Equivalent vector::BitCastOp is
|
|
/// created outside of the warp op with distributed source vector type (computed
|
|
/// using assigned layout).
|
|
struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *operand =
|
|
getWarpResult(warpOp, llvm::IsaPred<vector::BitCastOp>);
|
|
if (!operand)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "warp result is not a vector::BitCast op");
|
|
auto bitcastOp = operand->get().getDefiningOp<vector::BitCastOp>();
|
|
unsigned operandIdx = operand->getOperandNumber();
|
|
VectorType distributedSourceType =
|
|
getDistVecTypeBasedOnLaneLayout(
|
|
xegpu::getTemporaryLayout(bitcastOp->getOpOperand(0)),
|
|
bitcastOp.getSourceVectorType())
|
|
.value_or(VectorType());
|
|
if (!distributedSourceType)
|
|
return rewriter.notifyMatchFailure(
|
|
bitcastOp, "Failed to distribute the source vector type in "
|
|
"vector::BitCast op");
|
|
VectorType distributedResultType =
|
|
cast<VectorType>(warpOp.getResult(operandIdx).getType());
|
|
SmallVector<size_t> newRetIndices;
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, bitcastOp.getSource(),
|
|
TypeRange{distributedSourceType}, newRetIndices);
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
auto newBitcastOp = vector::BitCastOp::create(
|
|
rewriter, newWarpOp.getLoc(), distributedResultType,
|
|
newWarpOp.getResult(newRetIndices[0]));
|
|
Value distributedVal = newWarpOp.getResult(operandIdx);
|
|
rewriter.replaceAllUsesWith(distributedVal, newBitcastOp.getResult());
|
|
return success();
|
|
}
|
|
};
|
|
|
|
/// Distribute a vector::TransposeOp feeding into yield op of an enclosing
|
|
/// `gpu.warp_execute_on_lane_0` region. Currently only 2D transposes are
|
|
/// supported. In most cases, transpose is a no op because it is entirely
|
|
/// handled using the layouts (e.g. 16x1 -> 1x16). However, if each lane owns
|
|
/// multiple slices of data after distribution (e.g. 16x2 -> 2x16), a lane-local
|
|
/// transpose (i.e. shuffle) is needed. Therefore, we create an equivalent
|
|
/// vector::TransposeOp outside of the warp op with distributed source vector
|
|
/// type (computed using assigned layout).
|
|
struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern {
|
|
using gpu::WarpDistributionPattern::WarpDistributionPattern;
|
|
LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
|
|
PatternRewriter &rewriter) const override {
|
|
OpOperand *operand =
|
|
getWarpResult(warpOp, llvm::IsaPred<vector::TransposeOp>);
|
|
if (!operand)
|
|
return rewriter.notifyMatchFailure(
|
|
warpOp, "warp result is not a vector::Transpose op");
|
|
auto transposeOp = operand->get().getDefiningOp<vector::TransposeOp>();
|
|
unsigned operandIdx = operand->getOperandNumber();
|
|
xegpu::DistributeLayoutAttr sourceLayout =
|
|
xegpu::getTemporaryLayout(transposeOp->getOpOperand(0));
|
|
xegpu::DistributeLayoutAttr resultLayout =
|
|
xegpu::getTemporaryLayout(transposeOp->getOpResult(0));
|
|
if (!sourceLayout || !resultLayout)
|
|
return rewriter.notifyMatchFailure(
|
|
transposeOp,
|
|
"the source or result vector of the transpose op lacks layout "
|
|
"attribute");
|
|
int64_t sourceRank = transposeOp.getSourceVectorType().getRank();
|
|
int64_t resultRank = transposeOp.getResultVectorType().getRank();
|
|
// Only 2D transposes are supported for now.
|
|
// TODO: Support nD transposes.
|
|
if (sourceRank != 2 || resultRank != 2)
|
|
return rewriter.notifyMatchFailure(
|
|
transposeOp, "the source or result vector of the transpose op "
|
|
"does not have 2D layout");
|
|
ArrayRef<int64_t> perm = transposeOp.getPermutation();
|
|
// Result layout must be a transpose of source layout.
|
|
if (!resultLayout.isTransposeOf(sourceLayout, perm))
|
|
return rewriter.notifyMatchFailure(
|
|
transposeOp,
|
|
"the source or result vector layouts must be 2D transposes of each "
|
|
"other");
|
|
FailureOr<VectorType> distributedSourceTypeOrFailure =
|
|
getDistVecTypeBasedOnLaneLayout(sourceLayout,
|
|
transposeOp.getSourceVectorType());
|
|
if (failed(distributedSourceTypeOrFailure))
|
|
return rewriter.notifyMatchFailure(
|
|
transposeOp, "Failed to distribute the source vector type in "
|
|
"vector::Transpose op");
|
|
SmallVector<size_t> newRetIndices;
|
|
gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
|
|
rewriter, warpOp, transposeOp.getVector(),
|
|
TypeRange{distributedSourceTypeOrFailure.value()}, newRetIndices);
|
|
rewriter.setInsertionPointAfter(newWarpOp);
|
|
auto newTransposeOp = vector::TransposeOp::create(
|
|
rewriter, newWarpOp.getLoc(), newWarpOp.getResult(newRetIndices[0]),
|
|
perm);
|
|
Value distributedVal = newWarpOp.getResult(operandIdx);
|
|
rewriter.replaceAllUsesWith(distributedVal, newTransposeOp.getResult());
|
|
return success();
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
namespace {
|
|
struct XeGPUSubgroupDistributePass final
|
|
: public xegpu::impl::XeGPUSubgroupDistributeBase<
|
|
XeGPUSubgroupDistributePass> {
|
|
void runOnOperation() override;
|
|
};
|
|
} // namespace
|
|
|
|
/// Adds the XeGPU subgroup distribution patterns to `patterns` in two benefit
/// tiers: `PatternHierarchy::Regular` for the first group and
/// `PatternHierarchy::AboveRegular` for the second.
void xegpu::populateXeGPUSubgroupDistributePatterns(
    RewritePatternSet &patterns) {
  auto *ctx = patterns.getContext();

  // Patterns registered at the regular benefit.
  patterns.add<CreateNdDescDistribution, StoreNdDistribution,
               LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
               GpuBarrierDistribution, VectorMultiReductionDistribution,
               LoadDistribution, StoreDistribution, VectorTransposeDistribution,
               VectorBitcastDistribution, LoadMatrixDistribution,
               StoreMatrixDistribution,
               MemrefExtractAlignedPointerAsIndexDistribution>(
      ctx, /*pattern benefit=*/PatternHierarchy::Regular);

  // The patterns below must take precedence over the regular vector
  // distribution patterns, so they are registered with a higher benefit.
  patterns.add<VectorShapeCastDistribution,
               VectorExtractStridedSliceDistribution,
               VectorInsertStridedSliceDistribution,
               VectorBroadcastDistribution, SinkUniformOps>(
      ctx, /*pattern benefit=*/PatternHierarchy::AboveRegular);
}
|
|
|
|
/// Adds the `MoveFuncBodyToWarpOp` pattern to `patterns`.
void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(
    RewritePatternSet &patterns) {
  auto *ctx = patterns.getContext();
  patterns.add<MoveFuncBodyToWarpOp>(ctx);
}
|
|
|
|
void XeGPUSubgroupDistributePass::runOnOperation() {
|
|
// Step 1: Attach layouts to op operands.
|
|
// TODO: Following assumptions are made:
|
|
// 1) It is assumed that there are no layout conflicts.
|
|
// 2) Any existing layout attributes attached to the operands are ignored.
|
|
Operation *op = getOperation();
|
|
if (!xegpu::recoverTemporaryLayouts(op)) {
|
|
signalPassFailure();
|
|
return;
|
|
}
|
|
|
|
// Step 2: Move all operations of a GPU function inside
|
|
// gpu.warp_execute_on_lane_0 operation.
|
|
{
|
|
RewritePatternSet patterns(&getContext());
|
|
xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);
|
|
|
|
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
|
|
signalPassFailure();
|
|
return;
|
|
}
|
|
// At this point, we have moved the entire function body inside the
|
|
// warpOp. Now move any scalar uniform code outside of the warpOp (like
|
|
// GPU index ops, scalar constants, etc.). This will simplify the
|
|
// later lowering and avoid custom patterns for these ops.
|
|
getOperation()->walk([&](Operation *op) {
|
|
if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
|
|
vector::moveScalarUniformCode(warpOp);
|
|
});
|
|
}
|
|
// Step 3: Apply subgroup to workitem distribution patterns.
|
|
RewritePatternSet patterns(&getContext());
|
|
xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
|
|
// distributionFn is used by vector distribution patterns to determine the
|
|
// distributed vector type for a given vector value. In XeGPU subgroup
|
|
// distribution context, we compute this based on lane layout.
|
|
auto distributionFn = [](Value val) {
|
|
VectorType vecType = dyn_cast<VectorType>(val.getType());
|
|
int64_t vecRank = vecType ? vecType.getRank() : 0;
|
|
if (vecRank == 0)
|
|
return AffineMap::get(val.getContext());
|
|
// Get the layout of the vector type.
|
|
xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
|
|
// If no layout is specified, assume uniform case (no distribution).
|
|
if (!layout)
|
|
return AffineMap::get(val.getContext());
|
|
// Expecting vector and layout rank to match.
|
|
assert(layout.getRank() == vecRank &&
|
|
"Expecting vector and layout rank to match");
|
|
// A dimension is distributed only if layout suggests there are
|
|
// multiple lanes assigned for this dimension and the shape can be evenly
|
|
// distributed to those lanes.
|
|
SmallVector<unsigned int> distributedDims;
|
|
for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
|
|
if (v > 1 && vecType.getShape()[i] % v == 0)
|
|
distributedDims.push_back(i);
|
|
}
|
|
return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
|
|
val.getContext());
|
|
};
|
|
// TODO: shuffleFn is not used.
|
|
auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
|
|
int64_t warpSz) { return Value(); };
|
|
|
|
vector::populateDistributeReduction(
|
|
patterns, xegpu::subgroupReduction,
|
|
/*pattern benefit=*/PatternHierarchy::Regular);
|
|
|
|
vector::populatePropagateWarpVectorDistributionPatterns(
|
|
patterns, distributionFn, shuffleFn,
|
|
/*pattern benefit=*/PatternHierarchy::Regular);
|
|
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
|
|
signalPassFailure();
|
|
return;
|
|
}
|
|
|
|
// Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
|
|
// due to tensor desc type mismatches created by using upstream distribution
|
|
// patterns (scf.for). This cleanup should only be done if all the ops are
|
|
// distributed successfully, if some ops are still not distributed and remains
|
|
// inside any WarpExecuteOnLane0Op we avoid this simplication step to avoid
|
|
// breaking the IR.
|
|
bool foundWarpOp = false;
|
|
getOperation()->walk([&](gpu::WarpExecuteOnLane0Op warpOp) {
|
|
// Look for WarpOps that are not trivially dead.
|
|
if (isOpTriviallyDead(warpOp))
|
|
return WalkResult::advance();
|
|
foundWarpOp = true;
|
|
return WalkResult::interrupt();
|
|
});
|
|
if (foundWarpOp)
|
|
return;
|
|
|
|
getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
|
|
// We are only interested in UnrealizedConversionCastOps there were added
|
|
// for resolving SIMT type mismatches.
|
|
if (!op->getAttr(resolveSIMTTypeMismatch))
|
|
return WalkResult::skip();
|
|
|
|
Value input = op.getOperand(0);
|
|
Value output = op.getResult(0);
|
|
|
|
// Both input and output must have tensor descriptor types.
|
|
xegpu::TensorDescType inputDescType =
|
|
mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
|
|
xegpu::TensorDescType outputDescType =
|
|
mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
|
|
assert(inputDescType && outputDescType &&
|
|
"Unrealized conversion cast must have tensor descriptor types");
|
|
|
|
// tensor_desc<shape, layout> -> tensor_desc<shape> Type of conversions.
|
|
// This occurs inside scf.for body to resolve the block argument type to
|
|
// SIMT type.
|
|
if (inputDescType.getLayout()) {
|
|
auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
|
|
if (argument) {
|
|
argument.setType(output.getType());
|
|
output.replaceAllUsesWith(argument);
|
|
if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
|
|
argument.getOwner()->getParentOp())) {
|
|
auto result = loopOp.getTiedLoopResult(argument);
|
|
result.setType(output.getType());
|
|
}
|
|
}
|
|
}
|
|
|
|
// tensor_desc<shape> -> tensor_desc<shape, layout> Type of
|
|
// conversions. This occurs at the yield op of scf.for body to go back
|
|
// from SIMT type to original type.
|
|
if (outputDescType.getLayout())
|
|
output.replaceAllUsesWith(input);
|
|
|
|
if (op->use_empty())
|
|
op->erase();
|
|
return WalkResult::advance();
|
|
});
|
|
}
|