diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 3526178ea575..e001419257d8 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -974,7 +974,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, OptionalAttr:$layout); - let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value); + let results = (outs XeGPU_ValueOrScalarType:$value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -1134,7 +1134,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL }]; - let arguments = (ins AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value, + let arguments = (ins XeGPU_ValueOrScalarType:$value, XeGPU_GatherScatterSourceType:$dest, Optional>:$offsets, AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr:$chunk_size, @@ -1521,10 +1521,10 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou : vector<128x128xf16> ``` }]; - let arguments = (ins XeGPU_ConvertLayoutType: $source, + let arguments = (ins XeGPU_VectorOrScalarType: $source, DistributeLayoutAttr: $input_layout, DistributeLayoutAttr: $target_layout); - let results = (outs XeGPU_ConvertLayoutType: $result); + let results = (outs XeGPU_VectorOrScalarType: $result); let assemblyFormat = [{ $source prop-dict attr-dict `:` type($source) }]; @@ -1584,7 +1584,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, OptionalAttr:$subgroup_block_io, OptionalAttr:$layout ); - let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res); + let results = (outs XeGPU_ValueOrScalarType:$res); let assemblyFormat = [{ $mem_desc `` custom($offsets, $const_offsets) prop-dict attr-dict `` `:` type(operands) `->` type(results) @@ -1652,7 +1652,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", 
[MemoryEffects<[MemRead]>, def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, AllElementTypesMatch<["mem_desc", "data"]>, AnchorLayoutInterface]> { let arguments = (ins - AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data, + XeGPU_ValueOrScalarType:$data, XeGPU_MemDesc:$mem_desc, Variadic: $offsets, DenseI64ArrayAttr: $const_offsets, diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index f41c0bf1fd2b..7e142b20c089 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -25,11 +25,9 @@ def XeGPU_DpasOprType: FixedVectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>; def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>; def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>; def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>; -def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>; -def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>; -def XeGPU_VectorOrOffsetVectorType - : VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>; -def XeGPU_ConvertLayoutType +def XeGPU_ValueType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>; +def XeGPU_ValueOrScalarType : AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>; +def XeGPU_VectorOrScalarType : AnyTypeOf<[VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>, XeGPU_ScalarType]>; def XeGPU_GatherScatterBaseAddrType : AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>; diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h index 55b18d4a19c5..9cf9a8705209 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h @@ -82,6 +82,10 @@ DistributeLayoutAttr inferMultiReductionSourceLayout(DistributeLayoutAttr 
resLayout, SmallVector reduceDims); +/// Infers the source layout attribute for a vector.reduction operation given +/// the result layout attribute; dim 0 of the source is the reduced dimension. +DistributeLayoutAttr inferReductionSourceLayout(DistributeLayoutAttr resLayout); + /// Infers the source layout attribute for a transpose operation given the /// result layout attribute and permutation. DistributeLayoutAttr inferTransposeSourceLayout(DistributeLayoutAttr resLayout, @@ -108,8 +112,8 @@ inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout, ArrayRef resShape, ArrayRef srcShape); -/// Sets up layout for reduction operations by creating a SliceAttr for the -/// result. +/// Sets up layout for Multi-Reduction operations by creating a SliceAttr for +/// the result. /// /// This function first attempts to construct a source layout that, when /// sliced along reduction dimensions, produces a result layout compatible @@ -120,7 +124,13 @@ SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind, VectorType srcVectorTy, DistributeLayoutAttr consumerLayout, SmallVector reductionDims, - const uArch::uArch *uArch); + int numSg, const uArch::uArch *uArch); + +/// Sets up layout for vector.reduction by creating a SliceAttr (sliced along +/// dim 0) for the result; only lane-level layouts are assigned. +SliceAttr setupReductionResultLayout(LayoutKind layoutKind, + VectorType srcVectorTy, + const uArch::uArch *uArch); /// Setup the result layout attribute for a bitcast operation based on element /// type bitwidths. This ensures the source layout can always be derived from @@ -170,8 +180,8 @@ DistributeLayoutAttr setupStoreMatrixAnchorLayout(LayoutKind layoutKind, std::optional> setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy, - VectorType cdTy, DistributeLayoutAttr consumerLayout, - const uArch::uArch *uArch, int numSg); + VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg, + const uArch::uArch *uArch); /// Gets the expected layout for a given consumer operand. 
This will check if /// the owning operation of the consumer operand is one of the special layout diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp index ec5751634fdf..55cd6ec04970 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp @@ -183,6 +183,11 @@ xegpu::inferMultiReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout, return sliceLayout.getParent(); } +xegpu::DistributeLayoutAttr +xegpu::inferReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout) { + return xegpu::inferMultiReductionSourceLayout(resLayout, {0}); +} + /// Infers the source layout attribute for a transpose operation given the /// result layout attribute and permutation. xegpu::DistributeLayoutAttr @@ -399,7 +404,7 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout, /// Examples: /// 1. Subgroup layout - Row reduction on 2D tensor: /// srcShape=[32, 128], reductionDims=[1], resShape=[32], subgroupSize=16, -/// workgroupSize=32 +/// NumSg=32 /// * Consumer Layout: /// #xegpu.slice<#xegpu.layout, dims = /// [1]>} @@ -440,15 +445,11 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout, xegpu::SliceAttr xegpu::setupMultiReductionResultLayout( xegpu::LayoutKind layoutKind, VectorType srcVecTy, DistributeLayoutAttr consumerLayout, SmallVector reductionDims, - const xegpu::uArch::uArch *uArch) { + int numSg, const xegpu::uArch::uArch *uArch) { auto srcShape = srcVecTy.getShape(); int srcRank = srcShape.size(); - auto context = consumerLayout.getContext(); - - // Reduction layout requires at least 2D tensors - if (srcRank < 2) - return nullptr; + auto context = srcVecTy.getContext(); // Helper lambda to convert int64 vectors to int32 DenseArrayAttr auto toInt32Attr = [&](ArrayRef vec) { @@ -456,21 +457,12 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout( return DenseI32ArrayAttr::get(context, 
vec32); }; - const int workgroupSize = consumerLayout.getNumSubgroups(); const int subgroupSize = uArch->getSubgroupSize(); int64_t maxReduceVectorSize = 1; // could extend to spirv vector Size - - SmallVector consumerSgLayout = - consumerLayout.getEffectiveSgLayoutAsInt(); - SmallVector consumerLaneLayout = - consumerLayout.getEffectiveLaneLayoutAsInt(); - SmallVector consumerOrder = consumerLayout.getEffectiveOrderAsInt(); - DenseI32ArrayAttr orderAttr = consumerLayout.getOrder(); - xegpu::DistributeLayoutAttr srcLayout; if (layoutKind == xegpu::LayoutKind::Subgroup) { xegpu::SliceAttr consumerSliceLayout = - dyn_cast(consumerLayout); + dyn_cast_if_present(consumerLayout); if (consumerSliceLayout && consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) { srcLayout = consumerSliceLayout.getParent(); @@ -482,9 +474,17 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout( srcLayout = srcLayout.setDimData(dim, srcSgData.value()[dim], -1, -1); } } else { - + SmallVector consumerSgLayout = + consumerLayout ? consumerLayout.getEffectiveSgLayoutAsInt() + : SmallVector(); + SmallVector consumerOrder = + consumerLayout ? consumerLayout.getEffectiveOrderAsInt() + : SmallVector(); + DenseI32ArrayAttr orderAttr = + consumerLayout ? consumerLayout.getOrder() : nullptr; SmallVector sgLayout(srcRank), sgData(srcRank), order(srcRank); - int remainingSgCount = workgroupSize; + int remainingSgCount = + consumerLayout ? 
consumerLayout.getNumSubgroups() : numSg; int consumerIdx = 0; // First pass: Match consumer's layout on non-reduction dimensions @@ -502,6 +502,7 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout( } // Second pass: Distribute remaining subgroups across reduction dimensions + // the reduction to scalar case is handled only by this loop int64_t remainOrder = consumerSgLayout.size(); for (int i = 0; i < srcRank; i++) { if (llvm::is_contained(reductionDims, i)) { @@ -525,19 +526,20 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout( } else if (layoutKind == xegpu::LayoutKind::InstData) { SmallVector instData(srcRank, 1); - instData[srcRank - 2] = - std::min(maxReduceVectorSize, srcShape[srcRank - 2]); + if (srcRank >= 2) + instData[srcRank - 2] = + std::min(maxReduceVectorSize, srcShape[srcRank - 2]); instData[srcRank - 1] = std::min(static_cast(subgroupSize), srcShape[srcRank - 1]); srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData)); - } else if (layoutKind == xegpu::LayoutKind::Lane) { SmallVector laneLayout(srcRank, 1), laneData(srcRank, 1); laneLayout[srcRank - 1] = std::min(static_cast(subgroupSize), srcShape[srcRank - 1]); - laneData[srcRank - 2] = - std::min(maxReduceVectorSize, srcShape[srcRank - 2]); + if (srcRank >= 2) + laneData[srcRank - 2] = + std::min(maxReduceVectorSize, srcShape[srcRank - 2]); srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout), toInt32Attr(laneData)); } @@ -546,6 +548,38 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout( DenseI64ArrayAttr::get(context, reductionDims)); } +/// Sets up layout for Reduction operations by creating a SliceAttr for the +/// result. 
+xegpu::SliceAttr
+xegpu::setupReductionResultLayout(xegpu::LayoutKind layoutKind,
+                                  VectorType srcVecTy,
+                                  const xegpu::uArch::uArch *uArch) {
+  // Only lane-level assignment is implemented: assert in debug builds, and
+  // otherwise return null instead of slicing a null parent layout.
+  if (layoutKind != xegpu::LayoutKind::Lane) {
+    assert(layoutKind != xegpu::LayoutKind::Subgroup &&
+           "subgroup layout assignment not supported for reduction (op "
+           "is not expected at this level).");
+    assert(layoutKind != xegpu::LayoutKind::InstData &&
+           "instData layout assignment not supported for reduction (op "
+           "is not expected at this level).");
+    return nullptr;
+  }
+
+  auto context = srcVecTy.getContext();
+  // One element per lane, using no more lanes than the source has elements.
+  const int64_t numLanes =
+      std::min<int64_t>(uArch->getSubgroupSize(), srcVecTy.getShape()[0]);
+  SmallVector<int32_t> laneLayout = {static_cast<int32_t>(numLanes)};
+  SmallVector<int32_t> laneData = {1};
+  auto srcLayout = xegpu::LayoutAttr::get(
+      context, DenseI32ArrayAttr::get(context, laneLayout),
+      DenseI32ArrayAttr::get(context, laneData));
+  // The single source dimension is reduced, so slice the layout along dim 0.
+  return xegpu::SliceAttr::get(context, srcLayout,
+                               DenseI64ArrayAttr::get(context, 0));
+}
+
 /// Sets up the result layout for a bitcast operation.
/// When casting to a smaller bitwidth, adjusts the layout dimensions (sgData, /// instData, or laneData) by multiplying by the bitwidth ratio to ensure the @@ -656,7 +690,6 @@ xegpu::DistributeLayoutAttr xegpu::setupInsertStridedSliceResultLayout( "srcShape must be divisible by laneLayout for all dimensions"); laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim], consumerLaneData[dim]); - requiredResLayout = requiredResLayout.setDimData(dim, -1, -1, laneDataValue); } @@ -930,8 +963,8 @@ std::optional< xegpu::DistributeLayoutAttr>> xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy, VectorType bTy, VectorType cdTy, - xegpu::DistributeLayoutAttr consumerLayout, - const xegpu::uArch::uArch *uArch, int numSg) { + xegpu::DistributeLayoutAttr consumerLayout, int numSg, + const xegpu::uArch::uArch *uArch) { auto context = aTy.getContext(); const auto *uArchInstruction = dyn_cast(uArch->getInstruction( @@ -1079,7 +1112,7 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) { Operation *op = operand.getOwner(); unsigned idx = operand.getOperandNumber(); xegpu::DistributeLayoutAttr resLayout; - if (op->getNumResults() == 1 && isa(op->getResult(0).getType())) + if (op->getNumResults() == 1) resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0)); // For vector::BroadcastOp, infer the source layout from the result layout. @@ -1108,6 +1141,12 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) { return resLayout; } + if (auto reduction = dyn_cast(op)) { + if (!resLayout) + return xegpu::DistributeLayoutAttr(); + return xegpu::inferReductionSourceLayout(resLayout); + } + // For vector::BitCastOp, infer source layout from result layout using // element type bitwidths. 
if (auto bitcast = dyn_cast(op)) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 8675fe8b5cce..4c30dacae885 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -365,6 +365,10 @@ private: ArrayRef operands, ArrayRef results); + void visitVectorReductionOp(vector::ReductionOp reduction, + ArrayRef operands, + ArrayRef results); + void visitVectorBroadCastOp(vector::BroadcastOp broadcast, ArrayRef operands, ArrayRef results); @@ -461,6 +465,9 @@ LogicalResult LayoutInfoPropagation::visitOperation( .Case([&](vector::MultiDimReductionOp reductionOp) { visitVectorMultiReductionOp(reductionOp, operands, results); }) + .Case([&](vector::ReductionOp reductionOp) { + visitVectorReductionOp(reductionOp, operands, results); + }) .Case([&](vector::BroadcastOp broadcastOp) { visitVectorBroadCastOp(broadcastOp, operands, results); }) @@ -625,10 +632,17 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( vector::MultiDimReductionOp reduction, ArrayRef operands, ArrayRef results) { + Type resultTy = reduction.getDestType(); // The layout of the result must be present. 
LayoutInfo resLayoutInfo = results[0]->getValue(); - if (!resLayoutInfo.isAssigned()) - return; + + xegpu::DistributeLayoutAttr consumerLayoutAttr; + if (!resultTy.isIntOrFloat()) { + if (!resLayoutInfo.isAssigned()) + return; + consumerLayoutAttr = + dyn_cast(resLayoutInfo.get()); + } VectorType sourceTy = reduction.getSourceVectorType(); SmallVector reductionDims(reduction.getReductionDims()); @@ -636,8 +650,12 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or("")); if (!uArch) return; - auto consumerLayoutAttr = - dyn_cast(resLayoutInfo.get()); + int numSg = 0; + if (layoutKind == xegpu::LayoutKind::Subgroup) { + auto numSgOrErr = getNumSg(reduction, uArch->getSubgroupSize()); + if (succeeded(numSgOrErr)) + numSg = numSgOrErr.value(); + } // The result layout represents the layout requirements of the operation. // it is recorded to anchor layout or temporary layout. @@ -645,7 +663,7 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( // propagated from consumer op, the conflict is resolved in later phase by // converting the required result layout to the consumer layout auto requiredResLayoutAttr = xegpu::setupMultiReductionResultLayout( - layoutKind, sourceTy, consumerLayoutAttr, reductionDims, uArch); + layoutKind, sourceTy, consumerLayoutAttr, reductionDims, numSg, uArch); xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr); @@ -659,6 +677,26 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp( operands[1]->meet(LayoutInfo(requiredResLayoutAttr))); } +void LayoutInfoPropagation::visitVectorReductionOp( + vector::ReductionOp reduction, ArrayRef operands, + ArrayRef results) { + + VectorType sourceTy = reduction.getSourceVectorType(); + const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or("")); + if (!uArch) + return; + + auto requiredResLayoutAttr = + xegpu::setupReductionResultLayout(layoutKind, sourceTy, uArch); + 
xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr); + + auto srcLayoutAttr = xegpu::inferReductionSourceLayout(requiredResLayoutAttr); + propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr))); + if (reduction.getAcc()) + propagateIfChanged(operands[1], + operands[1]->meet(LayoutInfo(requiredResLayoutAttr))); +} + void LayoutInfoPropagation::visitVectorBroadCastOp( vector::BroadcastOp broadcast, ArrayRef operands, ArrayRef results) { @@ -765,7 +803,7 @@ void LayoutInfoPropagation::visitDpasOp( numSg = numSgOrErr.value(); } auto layouts = xegpu::setupDpasLayout(layoutKind, aTy, bTy, cdTy, - consumerLayoutAttr, uArch, numSg); + consumerLayoutAttr, numSg, uArch); if (!layouts.has_value()) { dpas.emitWarning( "Failed to determine required layouts for DPAS operands."); @@ -1286,6 +1324,7 @@ private: OpBuilder builder; LogicalResult resolveTensorDescConsumer(OpOperand &operand); LogicalResult resolveVectorConsumer(OpOperand &operand); + LogicalResult assignResultLayout(OpResult &result); }; } // namespace @@ -1294,6 +1333,21 @@ LogicalResult ResolveLayoutConflicts::run() { // Scan all operations in the parent op and resolve layout conflicts at // tensor descriptor and vector use points. auto r = parentOp->walk([&](Operation *op) -> WalkResult { + // if the operation inputs vector and output scalar, like multi-reduction we + // need to check if the result has layout and add a convert_layout to serve + // as anchor op for the reduction op's layout. + if (isa(op) || isa(op)) { + for (OpResult result : op->getResults()) { + if (result.getType().isIntOrFloat()) { + auto res = assignResultLayout(result); + if (failed(res)) { + DBGS() << "Failed to resolve vector consumer for multi-reduction " + << *op << "\n"; + return WalkResult::interrupt(); + } + } + } + } for (OpOperand &operand : op->getOpOperands()) { // Handle conflicts in tensor descriptor operands. 
Type operandType = operand.get().getType();
@@ -1321,6 +1375,25 @@ LogicalResult ResolveLayoutConflicts::run() {
   return r.wasInterrupted() ? failure() : success();
 }
 
+/// Anchors the layout of `result` by inserting an xegpu.convert_layout whose
+/// input and target layouts are both that layout. A result without a layout
+/// is left untouched.
+LogicalResult ResolveLayoutConflicts::assignResultLayout(OpResult &result) {
+  Operation *producerOp = result.getDefiningOp();
+  auto producerLayout = xegpu::getDistributeLayoutAttr(result);
+  // Nothing to anchor; convert_layout's layout attributes are required.
+  if (!producerLayout)
+    return success();
+  // Insert a convert_layout op to assign the layout.
+  builder.setInsertionPointAfterValue(result);
+  auto convertOp = xegpu::ConvertLayoutOp::create(
+      builder, producerOp->getLoc(), result.getType(), result, producerLayout,
+      producerLayout);
+  // Route every other user through the new anchor op.
+  result.replaceAllUsesExcept(convertOp.getResult(), convertOp);
+  return success();
+}
+
 LogicalResult
 ResolveLayoutConflicts::resolveVectorConsumer(OpOperand &operand) {
   Value vectorValue = operand.get();
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
index e4e6d61b92fd..bb387b4cfb09 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@@ -128,7 +128,7 @@ gpu.module @test {
 gpu.module @test {
 // CHECK-LABEL: vector_row_reduction
 // CHECK: %[[REDUCE:.*]] = vector.multi_reduction , %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>}
-  gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
+  gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array} {
     %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
     %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
     %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
@@ -140,10 +140,23 @@ gpu.module @test {
   }
 }
 
+// -----
+gpu.module @test {
+// CHECK-LABEL: vector_row_reduction_scalar
+// CHECK: %[[REDUCE:.*]] = vector.multi_reduction , %{{.*}}, %{{.*}} {layout_result_0 = 
#xegpu.slice<#xegpu.layout, dims = [0, 1]>} + gpu.func @vector_row_reduction_scalar(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array} { + %cst = arith.constant 0.000000e+00 : f32 + %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32> + %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32> + %reduce = vector.multi_reduction , %load, %cst [0, 1] : vector<32x64xf32> to f32 + gpu.return + } +} + // ----- gpu.module @test { // CHECK-LABEL: vector_nest_reduction - gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) { + gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array} { %cst = arith.constant dense<0.000000e+00> : vector<32xf32> %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32> %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32> @@ -181,7 +194,7 @@ gpu.module @test { // CHECK: xegpu.store %[[REDUCE2]], %{{.*}}[%[[OFFSET]]], %[[MASK]] // CHECK-SAME: <{layout = #xegpu.slice<#xegpu.slice<#xegpu.layout, dims = [0]>, dims = [1]>}> // CHECK-SAME: : vector<32xf32>, memref<32xf32>, vector<32xindex>, vector<32xi1> - gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) { + gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array} { %cst = arith.constant dense<0.000000e+00> : vector<32xf32> %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32> %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32> diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 221e963ed9ac..26936dab2fb3 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ 
b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -746,6 +746,54 @@ func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024x } } +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @vector_2d_reduction_scalar( +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<1xi1> +// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout} : vector<1xindex> +// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout}> : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> +// CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout} : vector<1xf16> to vector<1x1x1xf16> +// CHECK: %[[ACC:.*]] = arith.constant 0.000000e+00 : f16 +// CHECK: %[[RED:.*]] = vector.multi_reduction , %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0, 1, 2]>} [0, 1, 2] : vector<1x1x1xf16> to f16 +// CHECK: %[[MASK:.*]] = arith.constant true +// CHECK: %[[OFF:.*]] = arith.constant 1 : index +// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1 +func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) { + %cst = arith.constant dense : vector<1xi1> + %0 = vector.step : vector<1xindex> + %1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16> + %2 = vector.shape_cast %1 : vector<1xf16> to vector<1x1x1xf16> + %cst_0 = arith.constant 0.000000e+00 : f16 + %4 = vector.multi_reduction , %2, %cst_0 [0, 1, 2] : vector<1x1x1xf16> to f16 + %cst_2 = arith.constant true + %cst_3 = arith.constant 1 : index + xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1 + return + } +} + +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @vector_reduction_scalar( +// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout} dense : vector<16xi1> +// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout} : 
vector<16xindex> +// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout}> : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK: %[[RED:.*]] = vector.reduction , %[[LOAD]] {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : vector<16xf16> into f16 +// CHECK: %[[MASK:.*]] = arith.constant true +// CHECK: %[[OFF:.*]] = arith.constant 1 : index +// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1 +func.func @vector_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) { + %cst = arith.constant dense : vector<16xi1> + %0 = vector.step : vector<16xindex> + %1 = xegpu.load %arg0[%0], %cst : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + %4 = vector.reduction , %1: vector<16xf16> into f16 + %cst_2 = arith.constant true + %cst_3 = arith.constant 1 : index + xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1 + return + } +} + // ----- gpu.module @test { // CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(