[MLIR][XeGPU] Add Layout Propagation support for multi-reduction/reduction op with scalar result (#189133)

This PR adds layout propagation support for multi-reduction/reduction ops
with scalar results:
1) Enhance setupMultiReductionResultLayout() and
LayoutInfoPropagation::visitVectorMultiReductionOp() to support scalar
results.
2) Add propagation support for the vector.reduction op at the lane level
only, since the op is only introduced at the lane level.
This commit is contained in:
Jianhui Li 2026-04-01 13:01:34 -07:00 committed by GitHub
parent 75c6f4791c
commit 401ba6df84
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 227 additions and 53 deletions

View File

@ -974,7 +974,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
OptionalAttr<DistributeLayoutAttr>:$layout);
let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value);
let results = (outs XeGPU_ValueOrScalarType:$value);
let extraClassDeclaration = extraBaseClassDeclaration # [{
@ -1134,7 +1134,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
}];
let arguments = (ins AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value,
let arguments = (ins XeGPU_ValueOrScalarType:$value,
XeGPU_GatherScatterSourceType:$dest,
Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>:$offsets,
AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size,
@ -1521,10 +1521,10 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
: vector<128x128xf16>
```
}];
let arguments = (ins XeGPU_ConvertLayoutType: $source,
let arguments = (ins XeGPU_VectorOrScalarType: $source,
DistributeLayoutAttr: $input_layout,
DistributeLayoutAttr: $target_layout);
let results = (outs XeGPU_ConvertLayoutType: $result);
let results = (outs XeGPU_VectorOrScalarType: $result);
let assemblyFormat = [{
$source prop-dict attr-dict `:` type($source)
}];
@ -1584,7 +1584,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
OptionalAttr<UnitAttr>:$subgroup_block_io,
OptionalAttr<DistributeLayoutAttr>:$layout
);
let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res);
let results = (outs XeGPU_ValueOrScalarType:$res);
let assemblyFormat = [{
$mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
prop-dict attr-dict `` `:` type(operands) `->` type(results)
@ -1652,7 +1652,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
AllElementTypesMatch<["mem_desc", "data"]>, AnchorLayoutInterface]> {
let arguments = (ins
AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data,
XeGPU_ValueOrScalarType:$data,
XeGPU_MemDesc:$mem_desc,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,

View File

@ -25,11 +25,9 @@ def XeGPU_DpasOprType: FixedVectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
def XeGPU_VectorOrOffsetVectorType
: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>;
def XeGPU_ConvertLayoutType
def XeGPU_ValueType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
def XeGPU_ValueOrScalarType : AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>;
def XeGPU_VectorOrScalarType
: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>, XeGPU_ScalarType]>;
def XeGPU_GatherScatterBaseAddrType
: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>;

View File

@ -82,6 +82,10 @@ DistributeLayoutAttr
inferMultiReductionSourceLayout(DistributeLayoutAttr resLayout,
SmallVector<int64_t> reduceDims);
/// Infers the source layout attribute for a reduction operation given the
/// result layout attribute; dim 0 is implicitly the reduced dimension.
DistributeLayoutAttr inferReductionSourceLayout(DistributeLayoutAttr resLayout);
/// Infers the source layout attribute for a transpose operation given the
/// result layout attribute and permutation.
DistributeLayoutAttr inferTransposeSourceLayout(DistributeLayoutAttr resLayout,
@ -108,8 +112,8 @@ inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout,
ArrayRef<int64_t> resShape,
ArrayRef<int64_t> srcShape);
/// Sets up layout for reduction operations by creating a SliceAttr for the
/// result.
/// Sets up layout for Multi-Reduction operations by creating a SliceAttr for
/// the result.
///
/// This function first attempts to construct a source layout that, when
/// sliced along reduction dimensions, produces a result layout compatible
@ -120,7 +124,13 @@ SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind,
VectorType srcVectorTy,
DistributeLayoutAttr consumerLayout,
SmallVector<int64_t> reductionDims,
const uArch::uArch *uArch);
int numSg, const uArch::uArch *uArch);
/// Sets up layout for Reduction operations by creating a SliceAttr for the
/// result.
SliceAttr setupReductionResultLayout(LayoutKind layoutKind,
VectorType srcVectorTy,
const uArch::uArch *uArch);
/// Setup the result layout attribute for a bitcast operation based on element
/// type bitwidths. This ensures the source layout can always be derived from
@ -170,8 +180,8 @@ DistributeLayoutAttr setupStoreMatrixAnchorLayout(LayoutKind layoutKind,
std::optional<std::tuple<DistributeLayoutAttr, DistributeLayoutAttr,
DistributeLayoutAttr>>
setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy,
VectorType cdTy, DistributeLayoutAttr consumerLayout,
const uArch::uArch *uArch, int numSg);
VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg,
const uArch::uArch *uArch);
/// Gets the expected layout for a given consumer operand. This will check if
/// the owning operation of the consumer operand is one of the special layout

View File

@ -183,6 +183,11 @@ xegpu::inferMultiReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout,
return sliceLayout.getParent();
}
/// Infers the source layout of a reduction op from its result layout by
/// delegating to the multi-reduction inference with dim 0 as the single
/// reduced dimension.
xegpu::DistributeLayoutAttr
xegpu::inferReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout) {
  // The reduction collapses the only dimension of its source, i.e. dim 0.
  SmallVector<int64_t> reducedDims = {0};
  return xegpu::inferMultiReductionSourceLayout(resLayout, reducedDims);
}
/// Infers the source layout attribute for a transpose operation given the
/// result layout attribute and permutation.
xegpu::DistributeLayoutAttr
@ -399,7 +404,7 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
/// Examples:
/// 1. Subgroup layout - Row reduction on 2D tensor:
/// srcShape=[32, 128], reductionDims=[1], resShape=[32], subgroupSize=16,
/// workgroupSize=32
/// NumSg=32
/// * Consumer Layout:
/// #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>, dims =
/// [1]>}
@ -440,15 +445,11 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
xegpu::LayoutKind layoutKind, VectorType srcVecTy,
DistributeLayoutAttr consumerLayout, SmallVector<int64_t> reductionDims,
const xegpu::uArch::uArch *uArch) {
int numSg, const xegpu::uArch::uArch *uArch) {
auto srcShape = srcVecTy.getShape();
int srcRank = srcShape.size();
auto context = consumerLayout.getContext();
// Reduction layout requires at least 2D tensors
if (srcRank < 2)
return nullptr;
auto context = srcVecTy.getContext();
// Helper lambda to convert int64 vectors to int32 DenseArrayAttr
auto toInt32Attr = [&](ArrayRef<int64_t> vec) {
@ -456,21 +457,12 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
return DenseI32ArrayAttr::get(context, vec32);
};
const int workgroupSize = consumerLayout.getNumSubgroups();
const int subgroupSize = uArch->getSubgroupSize();
int64_t maxReduceVectorSize = 1; // could extend to spirv vector Size
SmallVector<int64_t> consumerSgLayout =
consumerLayout.getEffectiveSgLayoutAsInt();
SmallVector<int64_t> consumerLaneLayout =
consumerLayout.getEffectiveLaneLayoutAsInt();
SmallVector<int64_t> consumerOrder = consumerLayout.getEffectiveOrderAsInt();
DenseI32ArrayAttr orderAttr = consumerLayout.getOrder();
xegpu::DistributeLayoutAttr srcLayout;
if (layoutKind == xegpu::LayoutKind::Subgroup) {
xegpu::SliceAttr consumerSliceLayout =
dyn_cast<xegpu::SliceAttr>(consumerLayout);
dyn_cast_if_present<xegpu::SliceAttr>(consumerLayout);
if (consumerSliceLayout &&
consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) {
srcLayout = consumerSliceLayout.getParent();
@ -482,9 +474,17 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
srcLayout = srcLayout.setDimData(dim, srcSgData.value()[dim], -1, -1);
}
} else {
SmallVector<int64_t> consumerSgLayout =
consumerLayout ? consumerLayout.getEffectiveSgLayoutAsInt()
: SmallVector<int64_t>();
SmallVector<int64_t> consumerOrder =
consumerLayout ? consumerLayout.getEffectiveOrderAsInt()
: SmallVector<int64_t>();
DenseI32ArrayAttr orderAttr =
consumerLayout ? consumerLayout.getOrder() : nullptr;
SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank), order(srcRank);
int remainingSgCount = workgroupSize;
int remainingSgCount =
consumerLayout ? consumerLayout.getNumSubgroups() : numSg;
int consumerIdx = 0;
// First pass: Match consumer's layout on non-reduction dimensions
@ -502,6 +502,7 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
}
// Second pass: Distribute remaining subgroups across reduction dimensions
// the reduction to scalar case is handled only by this loop
int64_t remainOrder = consumerSgLayout.size();
for (int i = 0; i < srcRank; i++) {
if (llvm::is_contained(reductionDims, i)) {
@ -525,19 +526,20 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
} else if (layoutKind == xegpu::LayoutKind::InstData) {
SmallVector<int64_t> instData(srcRank, 1);
instData[srcRank - 2] =
std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
if (srcRank >= 2)
instData[srcRank - 2] =
std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
instData[srcRank - 1] =
std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
} else if (layoutKind == xegpu::LayoutKind::Lane) {
SmallVector<int64_t> laneLayout(srcRank, 1), laneData(srcRank, 1);
laneLayout[srcRank - 1] =
std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
laneData[srcRank - 2] =
std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
if (srcRank >= 2)
laneData[srcRank - 2] =
std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
toInt32Attr(laneData));
}
@ -546,6 +548,38 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
DenseI64ArrayAttr::get(context, reductionDims));
}
/// Sets up layout for Reduction operations by creating a SliceAttr for the
/// result.
///
/// Only `LayoutKind::Lane` is supported: the source layout distributes dim 0
/// across min(subgroupSize, srcShape[0]) lanes with one element per lane, and
/// the returned SliceAttr slices that layout along dim 0.
///
/// For `LayoutKind::Subgroup` and `LayoutKind::InstData` the op is not
/// expected to exist, so an assertion fires in assert-enabled builds. In
/// builds without assertions the function falls through and returns a
/// SliceAttr built from a null parent layout, exactly as before.
///
/// \param layoutKind  Level at which the layout is being assigned.
/// \param srcVecTy    Type of the reduction source vector; only dim 0 is read.
/// \param uArch       Target micro-architecture; provides the subgroup size.
/// \returns A SliceAttr over the source layout with dims = [0].
xegpu::SliceAttr
xegpu::setupReductionResultLayout(xegpu::LayoutKind layoutKind,
                                  VectorType srcVecTy,
                                  const xegpu::uArch::uArch *uArch) {
  auto srcShape = srcVecTy.getShape();
  auto context = srcVecTy.getContext();
  auto subgroupSize = uArch->getSubgroupSize();
  xegpu::LayoutAttr srcLayout;

  if (layoutKind == xegpu::LayoutKind::Subgroup) {
    // `assert(true && ...)` can never fire; use `false` so that the
    // unsupported level is actually reported in assert-enabled builds.
    assert(false &&
           "subgroup layout assignment not supported for reduction (op "
           "is not expected at this level).");
  } else if (layoutKind == xegpu::LayoutKind::InstData) {
    assert(false &&
           "instData layout assignment not supported for reduction (op "
           "is not expected at this level).");
  } else if (layoutKind == xegpu::LayoutKind::Lane) {
    SmallVector<int32_t> laneLayout(1), laneData(1);
    // Use at most one subgroup's worth of lanes, one element per lane.
    laneLayout[0] = std::min(subgroupSize, static_cast<int32_t>(srcShape[0]));
    laneData[0] = 1;
    srcLayout = xegpu::LayoutAttr::get(
        context, DenseI32ArrayAttr::get(context, laneLayout),
        DenseI32ArrayAttr::get(context, laneData));
  }

  // The result layout is the source layout sliced along the reduced dim 0.
  auto result = xegpu::SliceAttr::get(context, srcLayout,
                                      DenseI64ArrayAttr::get(context, 0));
  return result;
}
/// Sets up the result layout for a bitcast operation.
/// When casting to a smaller bitwidth, adjusts the layout dimensions (sgData,
/// instData, or laneData) by multiplying by the bitwidth ratio to ensure the
@ -656,7 +690,6 @@ xegpu::DistributeLayoutAttr xegpu::setupInsertStridedSliceResultLayout(
"srcShape must be divisible by laneLayout for all dimensions");
laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
consumerLaneData[dim]);
requiredResLayout =
requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
}
@ -930,8 +963,8 @@ std::optional<
xegpu::DistributeLayoutAttr>>
xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
VectorType bTy, VectorType cdTy,
xegpu::DistributeLayoutAttr consumerLayout,
const xegpu::uArch::uArch *uArch, int numSg) {
xegpu::DistributeLayoutAttr consumerLayout, int numSg,
const xegpu::uArch::uArch *uArch) {
auto context = aTy.getContext();
const auto *uArchInstruction =
dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
@ -1079,7 +1112,7 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
Operation *op = operand.getOwner();
unsigned idx = operand.getOperandNumber();
xegpu::DistributeLayoutAttr resLayout;
if (op->getNumResults() == 1 && isa<VectorType>(op->getResult(0).getType()))
if (op->getNumResults() == 1)
resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
// For vector::BroadcastOp, infer the source layout from the result layout.
@ -1108,6 +1141,12 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
return resLayout;
}
if (auto reduction = dyn_cast<vector::ReductionOp>(op)) {
if (!resLayout)
return xegpu::DistributeLayoutAttr();
return xegpu::inferReductionSourceLayout(resLayout);
}
// For vector::BitCastOp, infer source layout from result layout using
// element type bitwidths.
if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {

View File

@ -365,6 +365,10 @@ private:
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
void visitVectorReductionOp(vector::ReductionOp reduction,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
void visitVectorBroadCastOp(vector::BroadcastOp broadcast,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results);
@ -461,6 +465,9 @@ LogicalResult LayoutInfoPropagation::visitOperation(
.Case([&](vector::MultiDimReductionOp reductionOp) {
visitVectorMultiReductionOp(reductionOp, operands, results);
})
.Case([&](vector::ReductionOp reductionOp) {
visitVectorReductionOp(reductionOp, operands, results);
})
.Case([&](vector::BroadcastOp broadcastOp) {
visitVectorBroadCastOp(broadcastOp, operands, results);
})
@ -625,10 +632,17 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
vector::MultiDimReductionOp reduction,
ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
Type resultTy = reduction.getDestType();
// The layout of the result must be present.
LayoutInfo resLayoutInfo = results[0]->getValue();
if (!resLayoutInfo.isAssigned())
return;
xegpu::DistributeLayoutAttr consumerLayoutAttr;
if (!resultTy.isIntOrFloat()) {
if (!resLayoutInfo.isAssigned())
return;
consumerLayoutAttr =
dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
}
VectorType sourceTy = reduction.getSourceVectorType();
SmallVector<int64_t> reductionDims(reduction.getReductionDims());
@ -636,8 +650,12 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
if (!uArch)
return;
auto consumerLayoutAttr =
dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
int numSg = 0;
if (layoutKind == xegpu::LayoutKind::Subgroup) {
auto numSgOrErr = getNumSg(reduction, uArch->getSubgroupSize());
if (succeeded(numSgOrErr))
numSg = numSgOrErr.value();
}
// The result layout represents the layout requirements of the operation.
// it is recorded to anchor layout or temporary layout.
@ -645,7 +663,7 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
// propagated from consumer op, the conflict is resolved in later phase by
// converting the required result layout to the consumer layout
auto requiredResLayoutAttr = xegpu::setupMultiReductionResultLayout(
layoutKind, sourceTy, consumerLayoutAttr, reductionDims, uArch);
layoutKind, sourceTy, consumerLayoutAttr, reductionDims, numSg, uArch);
xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);
@ -659,6 +677,26 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
}
/// Assigns and propagates layouts for a vector.reduction op.
///
/// The required result layout is computed from the source vector type and the
/// target uArch, recorded as a temporary layout on the op result, and then
/// pushed to the operands: the source operand receives the layout inferred
/// from the result layout, and the accumulator (when present) receives the
/// result layout itself. Nothing is done if no uArch can be resolved for the
/// op's chip string.
void LayoutInfoPropagation::visitVectorReductionOp(
    vector::ReductionOp reduction, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  const uArch *targetUArch =
      getUArch(xegpu::getChipStr(reduction).value_or(""));
  if (!targetUArch)
    return;

  // Layout the op itself requires on its result.
  xegpu::SliceAttr resultLayout = xegpu::setupReductionResultLayout(
      layoutKind, reduction.getSourceVectorType(), targetUArch);
  xegpu::setTemporaryLayout(reduction->getResult(0), resultLayout);

  // Source operand: layout derived back from the result layout.
  xegpu::DistributeLayoutAttr sourceLayout =
      xegpu::inferReductionSourceLayout(resultLayout);
  propagateIfChanged(operands[0],
                     operands[0]->meet(LayoutInfo(sourceLayout)));

  // Accumulator operand (optional): same layout as the result.
  if (!reduction.getAcc())
    return;
  propagateIfChanged(operands[1],
                     operands[1]->meet(LayoutInfo(resultLayout)));
}
void LayoutInfoPropagation::visitVectorBroadCastOp(
vector::BroadcastOp broadcast, ArrayRef<LayoutInfoLattice *> operands,
ArrayRef<const LayoutInfoLattice *> results) {
@ -765,7 +803,7 @@ void LayoutInfoPropagation::visitDpasOp(
numSg = numSgOrErr.value();
}
auto layouts = xegpu::setupDpasLayout(layoutKind, aTy, bTy, cdTy,
consumerLayoutAttr, uArch, numSg);
consumerLayoutAttr, numSg, uArch);
if (!layouts.has_value()) {
dpas.emitWarning(
"Failed to determine required layouts for DPAS operands.");
@ -1286,6 +1324,7 @@ private:
OpBuilder builder;
LogicalResult resolveTensorDescConsumer(OpOperand &operand);
LogicalResult resolveVectorConsumer(OpOperand &operand);
LogicalResult assignResultLayout(OpResult &result);
};
} // namespace
@ -1294,6 +1333,21 @@ LogicalResult ResolveLayoutConflicts::run() {
// Scan all operations in the parent op and resolve layout conflicts at
// tensor descriptor and vector use points.
auto r = parentOp->walk([&](Operation *op) -> WalkResult {
// if the operation inputs vector and output scalar, like multi-reduction we
// need to check if the result has layout and add a convert_layout to serve
// as anchor op for the reduction op's layout.
if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
for (OpResult result : op->getResults()) {
if (result.getType().isIntOrFloat()) {
auto res = assignResultLayout(result);
if (failed(res)) {
DBGS() << "Failed to resolve vector consumer for multi-reduction "
<< *op << "\n";
return WalkResult::interrupt();
}
}
}
}
for (OpOperand &operand : op->getOpOperands()) {
// Handle conflicts in tensor descriptor operands.
Type operandType = operand.get().getType();
@ -1321,6 +1375,18 @@ LogicalResult ResolveLayoutConflicts::run() {
return r.wasInterrupted() ? failure() : success();
}
/// Anchors the layout of `result` by inserting an xegpu.convert_layout op
/// right after it, using the result's current layout as both the input and
/// the target layout, and rerouting all other uses of `result` through the
/// new op.
///
/// If `result` carries no layout there is nothing to anchor, and building a
/// convert_layout with null layout attributes would produce an invalid op, so
/// the result is left untouched.
///
/// \param result  Op result whose layout should be anchored.
/// \returns success() in all cases.
LogicalResult ResolveLayoutConflicts::assignResultLayout(OpResult &result) {
  Operation *producerOp = result.getDefiningOp();
  auto producerLayout = xegpu::getDistributeLayoutAttr(result);
  // convert_layout needs both an input and a target layout; skip when the
  // producer has none assigned.
  if (!producerLayout)
    return success();

  // Insert a convert_layout op to assign the layout.
  builder.setInsertionPointAfterValue(result);
  auto convertOp = xegpu::ConvertLayoutOp::create(
      builder, producerOp->getLoc(), result.getType(), result, producerLayout,
      producerLayout);
  // Keep the convert op itself consuming the original value.
  result.replaceAllUsesExcept(convertOp.getResult(), convertOp);
  return success();
}
LogicalResult
ResolveLayoutConflicts::resolveVectorConsumer(OpOperand &operand) {
Value vectorValue = operand.get();

View File

@ -128,7 +128,7 @@ gpu.module @test {
gpu.module @test {
// CHECK-LABEL: vector_row_reduction
// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [1]>}
gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
%cst = arith.constant dense<0.000000e+00> : vector<32xf32>
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
%load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
@ -140,10 +140,23 @@ gpu.module @test {
}
}
// -----
gpu.module @test {
// CHECK-LABEL: vector_row_reduction_scalar
// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [0, 1]>}
gpu.func @vector_row_reduction_scalar(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
%cst = arith.constant 0.000000e+00 : f32
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
%load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
%reduce = vector.multi_reduction <add>, %load, %cst [0, 1] : vector<32x64xf32> to f32
gpu.return
}
}
// -----
gpu.module @test {
// CHECK-LABEL: vector_nest_reduction
gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
%cst = arith.constant dense<0.000000e+00> : vector<32xf32>
%cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
@ -181,7 +194,7 @@ gpu.module @test {
// CHECK: xegpu.store %[[REDUCE2]], %{{.*}}[%[[OFFSET]]], %[[MASK]]
// CHECK-SAME: <{layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>}>
// CHECK-SAME: : vector<32xf32>, memref<32xf32>, vector<32xindex>, vector<32xi1>
gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
%cst = arith.constant dense<0.000000e+00> : vector<32xf32>
%cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>

View File

@ -746,6 +746,54 @@ func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024x
}
}
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_2d_reduction_scalar(
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<true> : vector<1xi1>
// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} : vector<1xindex>
// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
// CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>} : vector<1xf16> to vector<1x1x1xf16>
// CHECK: %[[ACC:.*]] = arith.constant 0.000000e+00 : f16
// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [0, 1, 2]>} [0, 1, 2] : vector<1x1x1xf16> to f16
// CHECK: %[[MASK:.*]] = arith.constant true
// CHECK: %[[OFF:.*]] = arith.constant 1 : index
// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1
func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
%cst = arith.constant dense<true> : vector<1xi1>
%0 = vector.step : vector<1xindex>
%1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
%2 = vector.shape_cast %1 : vector<1xf16> to vector<1x1x1xf16>
%cst_0 = arith.constant 0.000000e+00 : f16
%4 = vector.multi_reduction <add>, %2, %cst_0 [0, 1, 2] : vector<1x1x1xf16> to f16
%cst_2 = arith.constant true
%cst_3 = arith.constant 1 : index
xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
return
}
}
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_reduction_scalar(
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xindex>
// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
// CHECK: %[[RED:.*]] = vector.reduction <add>, %[[LOAD]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>} : vector<16xf16> into f16
// CHECK: %[[MASK:.*]] = arith.constant true
// CHECK: %[[OFF:.*]] = arith.constant 1 : index
// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1
func.func @vector_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
%cst = arith.constant dense<true> : vector<16xi1>
%0 = vector.step : vector<16xindex>
%1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
%4 = vector.reduction <add>, %1: vector<16xf16> into f16
%cst_2 = arith.constant true
%cst_3 = arith.constant 1 : index
xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
return
}
}
// -----
gpu.module @test {
// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(