[MLIR][XeGPU] Add Layout Propagation support for multi-reduction/reduction op with scalar result (#189133)
This PR adds layout propagation support for multi-reduction/reduction ops with a scalar result: 1) Enhance setupMultiReductionResultLayout() and LayoutInfoPropagation::visitVectorMultiReductionOp() to support a scalar result. 2) Add propagation support for the vector.reduction op at the lane level, since the op is only introduced at the lane level.
This commit is contained in:
parent
75c6f4791c
commit
401ba6df84
@ -974,7 +974,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
|
||||
OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
|
||||
OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
|
||||
OptionalAttr<DistributeLayoutAttr>:$layout);
|
||||
let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value);
|
||||
let results = (outs XeGPU_ValueOrScalarType:$value);
|
||||
|
||||
let extraClassDeclaration = extraBaseClassDeclaration # [{
|
||||
|
||||
@ -1134,7 +1134,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL
|
||||
|
||||
}];
|
||||
|
||||
let arguments = (ins AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value,
|
||||
let arguments = (ins XeGPU_ValueOrScalarType:$value,
|
||||
XeGPU_GatherScatterSourceType:$dest,
|
||||
Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>:$offsets,
|
||||
AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size,
|
||||
@ -1521,10 +1521,10 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
|
||||
: vector<128x128xf16>
|
||||
```
|
||||
}];
|
||||
let arguments = (ins XeGPU_ConvertLayoutType: $source,
|
||||
let arguments = (ins XeGPU_VectorOrScalarType: $source,
|
||||
DistributeLayoutAttr: $input_layout,
|
||||
DistributeLayoutAttr: $target_layout);
|
||||
let results = (outs XeGPU_ConvertLayoutType: $result);
|
||||
let results = (outs XeGPU_VectorOrScalarType: $result);
|
||||
let assemblyFormat = [{
|
||||
$source prop-dict attr-dict `:` type($source)
|
||||
}];
|
||||
@ -1584,7 +1584,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
|
||||
OptionalAttr<UnitAttr>:$subgroup_block_io,
|
||||
OptionalAttr<DistributeLayoutAttr>:$layout
|
||||
);
|
||||
let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res);
|
||||
let results = (outs XeGPU_ValueOrScalarType:$res);
|
||||
let assemblyFormat = [{
|
||||
$mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
|
||||
prop-dict attr-dict `` `:` type(operands) `->` type(results)
|
||||
@ -1652,7 +1652,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
|
||||
def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
|
||||
AllElementTypesMatch<["mem_desc", "data"]>, AnchorLayoutInterface]> {
|
||||
let arguments = (ins
|
||||
AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data,
|
||||
XeGPU_ValueOrScalarType:$data,
|
||||
XeGPU_MemDesc:$mem_desc,
|
||||
Variadic<Index>: $offsets,
|
||||
DenseI64ArrayAttr: $const_offsets,
|
||||
|
||||
@ -25,11 +25,9 @@ def XeGPU_DpasOprType: FixedVectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
|
||||
def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
|
||||
def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
|
||||
def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
|
||||
def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
|
||||
def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
|
||||
def XeGPU_VectorOrOffsetVectorType
|
||||
: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>;
|
||||
def XeGPU_ConvertLayoutType
|
||||
def XeGPU_ValueType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
|
||||
def XeGPU_ValueOrScalarType : AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>;
|
||||
def XeGPU_VectorOrScalarType
|
||||
: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>, XeGPU_ScalarType]>;
|
||||
def XeGPU_GatherScatterBaseAddrType
|
||||
: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>;
|
||||
|
||||
@ -82,6 +82,10 @@ DistributeLayoutAttr
|
||||
inferMultiReductionSourceLayout(DistributeLayoutAttr resLayout,
|
||||
SmallVector<int64_t> reduceDims);
|
||||
|
||||
/// Infers the source layout attribute for a reduction operation given the
|
||||
/// result layout attribute; the reduced dimension is always dim 0.
|
||||
DistributeLayoutAttr inferReductionSourceLayout(DistributeLayoutAttr resLayout);
|
||||
|
||||
/// Infers the source layout attribute for a transpose operation given the
|
||||
/// result layout attribute and permutation.
|
||||
DistributeLayoutAttr inferTransposeSourceLayout(DistributeLayoutAttr resLayout,
|
||||
@ -108,8 +112,8 @@ inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout,
|
||||
ArrayRef<int64_t> resShape,
|
||||
ArrayRef<int64_t> srcShape);
|
||||
|
||||
/// Sets up layout for reduction operations by creating a SliceAttr for the
|
||||
/// result.
|
||||
/// Sets up layout for Multi-Reduction operations by creating a SliceAttr for
|
||||
/// the result.
|
||||
///
|
||||
/// This function first attempts to construct a source layout that, when
|
||||
/// sliced along reduction dimensions, produces a result layout compatible
|
||||
@ -120,7 +124,13 @@ SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind,
|
||||
VectorType srcVectorTy,
|
||||
DistributeLayoutAttr consumerLayout,
|
||||
SmallVector<int64_t> reductionDims,
|
||||
const uArch::uArch *uArch);
|
||||
int numSg, const uArch::uArch *uArch);
|
||||
|
||||
/// Sets up layout for Reduction operations by creating a SliceAttr for the
|
||||
/// result.
|
||||
SliceAttr setupReductionResultLayout(LayoutKind layoutKind,
|
||||
VectorType srcVectorTy,
|
||||
const uArch::uArch *uArch);
|
||||
|
||||
/// Setup the result layout attribute for a bitcast operation based on element
|
||||
/// type bitwidths. This ensures the source layout can always be derived from
|
||||
@ -170,8 +180,8 @@ DistributeLayoutAttr setupStoreMatrixAnchorLayout(LayoutKind layoutKind,
|
||||
std::optional<std::tuple<DistributeLayoutAttr, DistributeLayoutAttr,
|
||||
DistributeLayoutAttr>>
|
||||
setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy,
|
||||
VectorType cdTy, DistributeLayoutAttr consumerLayout,
|
||||
const uArch::uArch *uArch, int numSg);
|
||||
VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg,
|
||||
const uArch::uArch *uArch);
|
||||
|
||||
/// Gets the expected layout for a given consumer operand. This will check if
|
||||
/// the owning operation of the consumer operand is one of the special layout
|
||||
|
||||
@ -183,6 +183,11 @@ xegpu::inferMultiReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout,
|
||||
return sliceLayout.getParent();
|
||||
}
|
||||
|
||||
/// Infers the source layout of a reduction op from the layout of its result.
/// The work is delegated to inferMultiReductionSourceLayout() with dimension 0
/// as the only reduced dimension.
xegpu::DistributeLayoutAttr
xegpu::inferReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout) {
  // A plain reduction is handled as a multi-reduction over dim 0 only.
  SmallVector<int64_t> reducedDims = {0};
  return xegpu::inferMultiReductionSourceLayout(resLayout, reducedDims);
}
|
||||
|
||||
/// Infers the source layout attribute for a transpose operation given the
|
||||
/// result layout attribute and permutation.
|
||||
xegpu::DistributeLayoutAttr
|
||||
@ -399,7 +404,7 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
|
||||
/// Examples:
|
||||
/// 1. Subgroup layout - Row reduction on 2D tensor:
|
||||
/// srcShape=[32, 128], reductionDims=[1], resShape=[32], subgroupSize=16,
|
||||
/// workgroupSize=32
|
||||
/// NumSg=32
|
||||
/// * Consumer Layout:
|
||||
/// #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>, dims =
|
||||
/// [1]>}
|
||||
@ -440,15 +445,11 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
|
||||
xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
|
||||
xegpu::LayoutKind layoutKind, VectorType srcVecTy,
|
||||
DistributeLayoutAttr consumerLayout, SmallVector<int64_t> reductionDims,
|
||||
const xegpu::uArch::uArch *uArch) {
|
||||
int numSg, const xegpu::uArch::uArch *uArch) {
|
||||
|
||||
auto srcShape = srcVecTy.getShape();
|
||||
int srcRank = srcShape.size();
|
||||
auto context = consumerLayout.getContext();
|
||||
|
||||
// Reduction layout requires at least 2D tensors
|
||||
if (srcRank < 2)
|
||||
return nullptr;
|
||||
auto context = srcVecTy.getContext();
|
||||
|
||||
// Helper lambda to convert int64 vectors to int32 DenseArrayAttr
|
||||
auto toInt32Attr = [&](ArrayRef<int64_t> vec) {
|
||||
@ -456,21 +457,12 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
|
||||
return DenseI32ArrayAttr::get(context, vec32);
|
||||
};
|
||||
|
||||
const int workgroupSize = consumerLayout.getNumSubgroups();
|
||||
const int subgroupSize = uArch->getSubgroupSize();
|
||||
int64_t maxReduceVectorSize = 1; // could extend to spirv vector Size
|
||||
|
||||
SmallVector<int64_t> consumerSgLayout =
|
||||
consumerLayout.getEffectiveSgLayoutAsInt();
|
||||
SmallVector<int64_t> consumerLaneLayout =
|
||||
consumerLayout.getEffectiveLaneLayoutAsInt();
|
||||
SmallVector<int64_t> consumerOrder = consumerLayout.getEffectiveOrderAsInt();
|
||||
DenseI32ArrayAttr orderAttr = consumerLayout.getOrder();
|
||||
|
||||
xegpu::DistributeLayoutAttr srcLayout;
|
||||
if (layoutKind == xegpu::LayoutKind::Subgroup) {
|
||||
xegpu::SliceAttr consumerSliceLayout =
|
||||
dyn_cast<xegpu::SliceAttr>(consumerLayout);
|
||||
dyn_cast_if_present<xegpu::SliceAttr>(consumerLayout);
|
||||
if (consumerSliceLayout &&
|
||||
consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) {
|
||||
srcLayout = consumerSliceLayout.getParent();
|
||||
@ -482,9 +474,17 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
|
||||
srcLayout = srcLayout.setDimData(dim, srcSgData.value()[dim], -1, -1);
|
||||
}
|
||||
} else {
|
||||
|
||||
SmallVector<int64_t> consumerSgLayout =
|
||||
consumerLayout ? consumerLayout.getEffectiveSgLayoutAsInt()
|
||||
: SmallVector<int64_t>();
|
||||
SmallVector<int64_t> consumerOrder =
|
||||
consumerLayout ? consumerLayout.getEffectiveOrderAsInt()
|
||||
: SmallVector<int64_t>();
|
||||
DenseI32ArrayAttr orderAttr =
|
||||
consumerLayout ? consumerLayout.getOrder() : nullptr;
|
||||
SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank), order(srcRank);
|
||||
int remainingSgCount = workgroupSize;
|
||||
int remainingSgCount =
|
||||
consumerLayout ? consumerLayout.getNumSubgroups() : numSg;
|
||||
int consumerIdx = 0;
|
||||
|
||||
// First pass: Match consumer's layout on non-reduction dimensions
|
||||
@ -502,6 +502,7 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
|
||||
}
|
||||
|
||||
// Second pass: Distribute remaining subgroups across reduction dimensions
|
||||
// the reduction to scalar case is handled only by this loop
|
||||
int64_t remainOrder = consumerSgLayout.size();
|
||||
for (int i = 0; i < srcRank; i++) {
|
||||
if (llvm::is_contained(reductionDims, i)) {
|
||||
@ -525,19 +526,20 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
|
||||
} else if (layoutKind == xegpu::LayoutKind::InstData) {
|
||||
|
||||
SmallVector<int64_t> instData(srcRank, 1);
|
||||
instData[srcRank - 2] =
|
||||
std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
|
||||
if (srcRank >= 2)
|
||||
instData[srcRank - 2] =
|
||||
std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
|
||||
instData[srcRank - 1] =
|
||||
std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
|
||||
srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
|
||||
|
||||
} else if (layoutKind == xegpu::LayoutKind::Lane) {
|
||||
|
||||
SmallVector<int64_t> laneLayout(srcRank, 1), laneData(srcRank, 1);
|
||||
laneLayout[srcRank - 1] =
|
||||
std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
|
||||
laneData[srcRank - 2] =
|
||||
std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
|
||||
if (srcRank >= 2)
|
||||
laneData[srcRank - 2] =
|
||||
std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
|
||||
srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
|
||||
toInt32Attr(laneData));
|
||||
}
|
||||
@ -546,6 +548,38 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
|
||||
DenseI64ArrayAttr::get(context, reductionDims));
|
||||
}
|
||||
|
||||
/// Sets up layout for Reduction operations by creating a SliceAttr for the
|
||||
/// result.
|
||||
xegpu::SliceAttr
|
||||
xegpu::setupReductionResultLayout(xegpu::LayoutKind layoutKind,
|
||||
VectorType srcVecTy,
|
||||
const xegpu::uArch::uArch *uArch) {
|
||||
|
||||
auto srcShape = srcVecTy.getShape();
|
||||
auto context = srcVecTy.getContext();
|
||||
auto subgroupSize = uArch->getSubgroupSize();
|
||||
xegpu::LayoutAttr srcLayout;
|
||||
|
||||
if (layoutKind == xegpu::LayoutKind::Subgroup) {
|
||||
assert(true && "subgroup layout assignment not supported for reduction (op "
|
||||
"is not expected at this level).");
|
||||
} else if (layoutKind == xegpu::LayoutKind::InstData) {
|
||||
assert(true && "instData layout assignment not supported for reduction (op "
|
||||
"is not expected at this level).");
|
||||
} else if (layoutKind == xegpu::LayoutKind::Lane) {
|
||||
SmallVector<int32_t> laneLayout(1), laneData(1);
|
||||
laneLayout[0] = std::min(subgroupSize, static_cast<int32_t>(srcShape[0]));
|
||||
laneData[0] = 1;
|
||||
srcLayout = xegpu::LayoutAttr::get(
|
||||
context, DenseI32ArrayAttr::get(context, laneLayout),
|
||||
DenseI32ArrayAttr::get(context, laneData));
|
||||
}
|
||||
|
||||
auto result = xegpu::SliceAttr::get(context, srcLayout,
|
||||
DenseI64ArrayAttr::get(context, 0));
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Sets up the result layout for a bitcast operation.
|
||||
/// When casting to a smaller bitwidth, adjusts the layout dimensions (sgData,
|
||||
/// instData, or laneData) by multiplying by the bitwidth ratio to ensure the
|
||||
@ -656,7 +690,6 @@ xegpu::DistributeLayoutAttr xegpu::setupInsertStridedSliceResultLayout(
|
||||
"srcShape must be divisible by laneLayout for all dimensions");
|
||||
laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
|
||||
consumerLaneData[dim]);
|
||||
|
||||
requiredResLayout =
|
||||
requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
|
||||
}
|
||||
@ -930,8 +963,8 @@ std::optional<
|
||||
xegpu::DistributeLayoutAttr>>
|
||||
xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
|
||||
VectorType bTy, VectorType cdTy,
|
||||
xegpu::DistributeLayoutAttr consumerLayout,
|
||||
const xegpu::uArch::uArch *uArch, int numSg) {
|
||||
xegpu::DistributeLayoutAttr consumerLayout, int numSg,
|
||||
const xegpu::uArch::uArch *uArch) {
|
||||
auto context = aTy.getContext();
|
||||
const auto *uArchInstruction =
|
||||
dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
|
||||
@ -1079,7 +1112,7 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
|
||||
Operation *op = operand.getOwner();
|
||||
unsigned idx = operand.getOperandNumber();
|
||||
xegpu::DistributeLayoutAttr resLayout;
|
||||
if (op->getNumResults() == 1 && isa<VectorType>(op->getResult(0).getType()))
|
||||
if (op->getNumResults() == 1)
|
||||
resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));
|
||||
|
||||
// For vector::BroadcastOp, infer the source layout from the result layout.
|
||||
@ -1108,6 +1141,12 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
|
||||
return resLayout;
|
||||
}
|
||||
|
||||
if (auto reduction = dyn_cast<vector::ReductionOp>(op)) {
|
||||
if (!resLayout)
|
||||
return xegpu::DistributeLayoutAttr();
|
||||
return xegpu::inferReductionSourceLayout(resLayout);
|
||||
}
|
||||
|
||||
// For vector::BitCastOp, infer source layout from result layout using
|
||||
// element type bitwidths.
|
||||
if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
|
||||
|
||||
@ -365,6 +365,10 @@ private:
|
||||
ArrayRef<LayoutInfoLattice *> operands,
|
||||
ArrayRef<const LayoutInfoLattice *> results);
|
||||
|
||||
void visitVectorReductionOp(vector::ReductionOp reduction,
|
||||
ArrayRef<LayoutInfoLattice *> operands,
|
||||
ArrayRef<const LayoutInfoLattice *> results);
|
||||
|
||||
void visitVectorBroadCastOp(vector::BroadcastOp broadcast,
|
||||
ArrayRef<LayoutInfoLattice *> operands,
|
||||
ArrayRef<const LayoutInfoLattice *> results);
|
||||
@ -461,6 +465,9 @@ LogicalResult LayoutInfoPropagation::visitOperation(
|
||||
.Case([&](vector::MultiDimReductionOp reductionOp) {
|
||||
visitVectorMultiReductionOp(reductionOp, operands, results);
|
||||
})
|
||||
.Case([&](vector::ReductionOp reductionOp) {
|
||||
visitVectorReductionOp(reductionOp, operands, results);
|
||||
})
|
||||
.Case([&](vector::BroadcastOp broadcastOp) {
|
||||
visitVectorBroadCastOp(broadcastOp, operands, results);
|
||||
})
|
||||
@ -625,10 +632,17 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
|
||||
vector::MultiDimReductionOp reduction,
|
||||
ArrayRef<LayoutInfoLattice *> operands,
|
||||
ArrayRef<const LayoutInfoLattice *> results) {
|
||||
Type resultTy = reduction.getDestType();
|
||||
// The layout of the result must be present.
|
||||
LayoutInfo resLayoutInfo = results[0]->getValue();
|
||||
if (!resLayoutInfo.isAssigned())
|
||||
return;
|
||||
|
||||
xegpu::DistributeLayoutAttr consumerLayoutAttr;
|
||||
if (!resultTy.isIntOrFloat()) {
|
||||
if (!resLayoutInfo.isAssigned())
|
||||
return;
|
||||
consumerLayoutAttr =
|
||||
dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
|
||||
}
|
||||
|
||||
VectorType sourceTy = reduction.getSourceVectorType();
|
||||
SmallVector<int64_t> reductionDims(reduction.getReductionDims());
|
||||
@ -636,8 +650,12 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
|
||||
const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
|
||||
if (!uArch)
|
||||
return;
|
||||
auto consumerLayoutAttr =
|
||||
dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
|
||||
int numSg = 0;
|
||||
if (layoutKind == xegpu::LayoutKind::Subgroup) {
|
||||
auto numSgOrErr = getNumSg(reduction, uArch->getSubgroupSize());
|
||||
if (succeeded(numSgOrErr))
|
||||
numSg = numSgOrErr.value();
|
||||
}
|
||||
|
||||
// The result layout represents the layout requirements of the operation.
|
||||
// it is recorded to anchor layout or temporary layout.
|
||||
@ -645,7 +663,7 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
|
||||
// propagated from consumer op, the conflict is resolved in later phase by
|
||||
// converting the required result layout to the consumer layout
|
||||
auto requiredResLayoutAttr = xegpu::setupMultiReductionResultLayout(
|
||||
layoutKind, sourceTy, consumerLayoutAttr, reductionDims, uArch);
|
||||
layoutKind, sourceTy, consumerLayoutAttr, reductionDims, numSg, uArch);
|
||||
|
||||
xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);
|
||||
|
||||
@ -659,6 +677,26 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
|
||||
operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
|
||||
}
|
||||
|
||||
/// Propagates layouts backwards through a vector.reduction op.
///
/// The required result layout is computed from the source vector type and
/// the target uArch, then recorded as a temporary layout on the op result.
/// The source operand receives the layout inferred from that result layout;
/// the accumulator operand, when present, receives the result layout itself.
/// Nothing is done when no uArch can be resolved for the op's chip string.
void LayoutInfoPropagation::visitVectorReductionOp(
    vector::ReductionOp reduction, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  const uArch *targetArch =
      getUArch(xegpu::getChipStr(reduction).value_or(""));
  if (!targetArch)
    return;

  // Layout the op demands on its (scalar) result.
  auto resultLayout = xegpu::setupReductionResultLayout(
      layoutKind, reduction.getSourceVectorType(), targetArch);
  xegpu::setTemporaryLayout(reduction->getResult(0), resultLayout);

  // Operand 0 is the vector being reduced.
  auto sourceLayout = xegpu::inferReductionSourceLayout(resultLayout);
  auto sourceChange = operands[0]->meet(LayoutInfo(sourceLayout));
  propagateIfChanged(operands[0], sourceChange);

  // No accumulator: nothing more to propagate.
  if (!reduction.getAcc())
    return;

  auto accChange = operands[1]->meet(LayoutInfo(resultLayout));
  propagateIfChanged(operands[1], accChange);
}
|
||||
|
||||
void LayoutInfoPropagation::visitVectorBroadCastOp(
|
||||
vector::BroadcastOp broadcast, ArrayRef<LayoutInfoLattice *> operands,
|
||||
ArrayRef<const LayoutInfoLattice *> results) {
|
||||
@ -765,7 +803,7 @@ void LayoutInfoPropagation::visitDpasOp(
|
||||
numSg = numSgOrErr.value();
|
||||
}
|
||||
auto layouts = xegpu::setupDpasLayout(layoutKind, aTy, bTy, cdTy,
|
||||
consumerLayoutAttr, uArch, numSg);
|
||||
consumerLayoutAttr, numSg, uArch);
|
||||
if (!layouts.has_value()) {
|
||||
dpas.emitWarning(
|
||||
"Failed to determine required layouts for DPAS operands.");
|
||||
@ -1286,6 +1324,7 @@ private:
|
||||
OpBuilder builder;
|
||||
LogicalResult resolveTensorDescConsumer(OpOperand &operand);
|
||||
LogicalResult resolveVectorConsumer(OpOperand &operand);
|
||||
LogicalResult assignResultLayout(OpResult &result);
|
||||
};
|
||||
|
||||
} // namespace
|
||||
@ -1294,6 +1333,21 @@ LogicalResult ResolveLayoutConflicts::run() {
|
||||
// Scan all operations in the parent op and resolve layout conflicts at
|
||||
// tensor descriptor and vector use points.
|
||||
auto r = parentOp->walk([&](Operation *op) -> WalkResult {
|
||||
// if the operation inputs vector and output scalar, like multi-reduction we
|
||||
// need to check if the result has layout and add a convert_layout to serve
|
||||
// as anchor op for the reduction op's layout.
|
||||
if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
|
||||
for (OpResult result : op->getResults()) {
|
||||
if (result.getType().isIntOrFloat()) {
|
||||
auto res = assignResultLayout(result);
|
||||
if (failed(res)) {
|
||||
DBGS() << "Failed to resolve vector consumer for multi-reduction "
|
||||
<< *op << "\n";
|
||||
return WalkResult::interrupt();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (OpOperand &operand : op->getOpOperands()) {
|
||||
// Handle conflicts in tensor descriptor operands.
|
||||
Type operandType = operand.get().getType();
|
||||
@ -1321,6 +1375,18 @@ LogicalResult ResolveLayoutConflicts::run() {
|
||||
return r.wasInterrupted() ? failure() : success();
|
||||
}
|
||||
|
||||
/// Anchors the layout of `result` by inserting an xegpu.convert_layout op
/// right after it, using the result's layout as both input and target
/// layout. All other uses of `result` are redirected to the new op's result.
///
/// When `result` carries no layout there is nothing to anchor, so no op is
/// inserted; both layout arguments of the convert_layout op would otherwise
/// be null.
///
/// \returns success() in all cases.
LogicalResult ResolveLayoutConflicts::assignResultLayout(OpResult &result) {
  Operation *producerOp = result.getDefiningOp();
  auto producerLayout = xegpu::getDistributeLayoutAttr(result);
  // Without a layout, skip insertion rather than build a convert_layout with
  // null layout arguments.
  if (!producerLayout)
    return success();

  // Insert a convert_layout op to assign the layout.
  builder.setInsertionPointAfterValue(result);
  auto convertOp = xegpu::ConvertLayoutOp::create(
      builder, producerOp->getLoc(), result.getType(), result, producerLayout,
      producerLayout);
  // The convert op itself must keep consuming the original value.
  result.replaceAllUsesExcept(convertOp.getResult(), convertOp);
  return success();
}
|
||||
|
||||
LogicalResult
|
||||
ResolveLayoutConflicts::resolveVectorConsumer(OpOperand &operand) {
|
||||
Value vectorValue = operand.get();
|
||||
|
||||
@ -128,7 +128,7 @@ gpu.module @test {
|
||||
gpu.module @test {
|
||||
// CHECK-LABEL: vector_row_reduction
|
||||
// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [1]>}
|
||||
gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
|
||||
gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
|
||||
%cst = arith.constant dense<0.000000e+00> : vector<32xf32>
|
||||
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
|
||||
%load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
|
||||
@ -140,10 +140,23 @@ gpu.module @test {
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
// Test: a vector.multi_reduction that reduces both dims [0, 1] of a
// vector<32x64xf32> down to an f32 scalar. The expected result layout is a
// slice over dims [0, 1] of a subgroup layout (sg_layout = [32, 1],
// sg_data = [1, 64]). The kernel declares known_block_size = [1, 32, 16];
// %dst is not used by the body.
gpu.module @test {
|
||||
// CHECK-LABEL: vector_row_reduction_scalar
|
||||
// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [0, 1]>}
|
||||
gpu.func @vector_row_reduction_scalar(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
|
||||
%cst = arith.constant 0.000000e+00 : f32
|
||||
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
|
||||
%load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
|
||||
%reduce = vector.multi_reduction <add>, %load, %cst [0, 1] : vector<32x64xf32> to f32
|
||||
gpu.return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
gpu.module @test {
|
||||
// CHECK-LABEL: vector_nest_reduction
|
||||
gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
|
||||
gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
|
||||
%cst = arith.constant dense<0.000000e+00> : vector<32xf32>
|
||||
%cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
|
||||
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
|
||||
@ -181,7 +194,7 @@ gpu.module @test {
|
||||
// CHECK: xegpu.store %[[REDUCE2]], %{{.*}}[%[[OFFSET]]], %[[MASK]]
|
||||
// CHECK-SAME: <{layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>}>
|
||||
// CHECK-SAME: : vector<32xf32>, memref<32xf32>, vector<32xindex>, vector<32xi1>
|
||||
gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
|
||||
gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
|
||||
%cst = arith.constant dense<0.000000e+00> : vector<32xf32>
|
||||
%cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
|
||||
%tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
|
||||
|
||||
@ -746,6 +746,54 @@ func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024x
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
// Test: lane-level layouts for a vector.multi_reduction that reduces a
// vector<1x1x1xf16> over dims [0, 1, 2] to an f16 scalar which is then
// stored with a scalar xegpu.store. The reduction result is expected to
// carry a slice layout over dims [0, 1, 2]; the scalar store is expected to
// carry no layout attribute.
gpu.module @test {
|
||||
// CHECK-LABEL: func.func @vector_2d_reduction_scalar(
|
||||
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<true> : vector<1xi1>
|
||||
// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} : vector<1xindex>
|
||||
// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
|
||||
// CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>} : vector<1xf16> to vector<1x1x1xf16>
|
||||
// CHECK: %[[ACC:.*]] = arith.constant 0.000000e+00 : f16
|
||||
// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [0, 1, 2]>} [0, 1, 2] : vector<1x1x1xf16> to f16
|
||||
// CHECK: %[[MASK:.*]] = arith.constant true
|
||||
// CHECK: %[[OFF:.*]] = arith.constant 1 : index
|
||||
// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1
|
||||
func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
|
||||
%cst = arith.constant dense<true> : vector<1xi1>
|
||||
%0 = vector.step : vector<1xindex>
|
||||
%1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
|
||||
%2 = vector.shape_cast %1 : vector<1xf16> to vector<1x1x1xf16>
|
||||
%cst_0 = arith.constant 0.000000e+00 : f16
|
||||
%4 = vector.multi_reduction <add>, %2, %cst_0 [0, 1, 2] : vector<1x1x1xf16> to f16
|
||||
%cst_2 = arith.constant true
|
||||
%cst_3 = arith.constant 1 : index
|
||||
xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
// Test: lane-level layouts for a vector.reduction from vector<16xf16> to an
// f16 scalar, followed by a scalar xegpu.store. The reduction result is
// expected to carry a slice over dims [0] of a layout with
// lane_layout = [16], lane_data = [1], and the same layout is expected on
// the ops producing the reduced vector.
gpu.module @test {
|
||||
// CHECK-LABEL: func.func @vector_reduction_scalar(
|
||||
// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
|
||||
// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xindex>
|
||||
// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
|
||||
// CHECK: %[[RED:.*]] = vector.reduction <add>, %[[LOAD]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>} : vector<16xf16> into f16
|
||||
// CHECK: %[[MASK:.*]] = arith.constant true
|
||||
// CHECK: %[[OFF:.*]] = arith.constant 1 : index
|
||||
// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1
|
||||
func.func @vector_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
|
||||
%cst = arith.constant dense<true> : vector<16xi1>
|
||||
%0 = vector.step : vector<16xindex>
|
||||
%1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
|
||||
%4 = vector.reduction <add>, %1: vector<16xf16> into f16
|
||||
%cst_2 = arith.constant true
|
||||
%cst_3 = arith.constant 1 : index
|
||||
xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
gpu.module @test {
|
||||
// CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user