[MLIR][XeGPU] Add Layout Propagation support for multi-reduction/reduction op with scalar result (#189133)

This PR adds layout propagation support for multi-reduction/reduction ops with a scalar result: 1) Enhance setupMultiReductionResultLayout() and LayoutInfoPropagation::visitVectorMultiReductionOp() to support a scalar result. 2) Add propagation support for the vector.reduction op at the lane level, since the op is only introduced at the lane level.
2026-04-01 13:01:34 -07:00 · 2026-04-01 13:01:34 -07:00 · 401ba6df84
commit 401ba6df84
parent 75c6f4791c
7 changed files with 227 additions and 53 deletions
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@ -974,7 +974,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>, AnchorLayou
      OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint,
      OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint,
      OptionalAttr<DistributeLayoutAttr>:$layout);
-  let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value);
+  let results = (outs XeGPU_ValueOrScalarType:$value);

  let extraClassDeclaration = extraBaseClassDeclaration # [{

@ -1134,7 +1134,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>, AnchorL

  }];

-  let arguments = (ins AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value,
+  let arguments = (ins XeGPU_ValueOrScalarType:$value,
      XeGPU_GatherScatterSourceType:$dest,
      Optional<AnyTypeOf<[XeGPU_OffsetType, Index]>>:$offsets,
      AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size,
@ -1521,10 +1521,10 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou
            : vector<128x128xf16>
        ```
    }];
-    let arguments = (ins XeGPU_ConvertLayoutType: $source,
+    let arguments = (ins XeGPU_VectorOrScalarType: $source,
                         DistributeLayoutAttr: $input_layout,
                         DistributeLayoutAttr: $target_layout);
-    let results = (outs XeGPU_ConvertLayoutType: $result);
+    let results = (outs XeGPU_VectorOrScalarType: $result);
    let assemblyFormat = [{
        $source prop-dict attr-dict `:` type($source)
    }];
@ -1584,7 +1584,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
    OptionalAttr<UnitAttr>:$subgroup_block_io,
    OptionalAttr<DistributeLayoutAttr>:$layout
  );
-  let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$res);
+  let results = (outs XeGPU_ValueOrScalarType:$res);
  let assemblyFormat = [{
    $mem_desc `` custom<DynamicIndexList>($offsets, $const_offsets)
    prop-dict attr-dict `` `:` type(operands) `->` type(results)
@ -1652,7 +1652,7 @@ def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>,
 def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>,
                              AllElementTypesMatch<["mem_desc", "data"]>, AnchorLayoutInterface]> {
  let arguments = (ins
-    AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$data,
+    XeGPU_ValueOrScalarType:$data,
    XeGPU_MemDesc:$mem_desc,
    Variadic<Index>: $offsets,
    DenseI64ArrayAttr: $const_offsets,
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@ -25,11 +25,9 @@ def XeGPU_DpasOprType: FixedVectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
 def XeGPU_DpasResType: FixedVectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
 def XeGPU_OffsetType: FixedVectorOfNonZeroRankOf<[Index]>;
 def XeGPU_MaskType: FixedVectorOfNonZeroRankOf<[I1]>;
-def XeGPU_ValueType: FixedVectorOfNonZeroRankOf<[XeGPU_ScalarType]>;
-def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
-def XeGPU_VectorOrOffsetVectorType
-    : VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>;
-def XeGPU_ConvertLayoutType
+// Vector of rank 1 through 6 whose element type is XeGPU_ScalarType.
+def XeGPU_ValueType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
+// Either an XeGPU_ValueType vector or a bare XeGPU_ScalarType value.
+def XeGPU_ValueOrScalarType : AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>;
+// Either a rank 1 through 6 vector with XeGPU_ScalarType or Index elements, or
+// a bare XeGPU_ScalarType value.
+def XeGPU_VectorOrScalarType
    : AnyTypeOf<[VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType, Index]>, XeGPU_ScalarType]>;
 def XeGPU_GatherScatterBaseAddrType
    : AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1]>, XeGPU_PointerType]>;
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h
@ -82,6 +82,10 @@ DistributeLayoutAttr
 inferMultiReductionSourceLayout(DistributeLayoutAttr resLayout,
                                SmallVector<int64_t> reduceDims);

+/// Infers the source layout attribute for a reduction operation given the
+/// result layout attribute. No reduced dims are passed: dim 0 is always used.
+DistributeLayoutAttr inferReductionSourceLayout(DistributeLayoutAttr resLayout);
+
 /// Infers the source layout attribute for a transpose operation given the
 /// result layout attribute and permutation.
 DistributeLayoutAttr inferTransposeSourceLayout(DistributeLayoutAttr resLayout,
@ -108,8 +112,8 @@ inferInsertStridedSliceSourceLayout(DistributeLayoutAttr resLayout,
                                    ArrayRef<int64_t> resShape,
                                    ArrayRef<int64_t> srcShape);

-/// Sets up layout for reduction operations by creating a SliceAttr for the
-/// result.
+/// Sets up layout for Multi-Reduction operations by creating a SliceAttr for
+/// the result.
 ///
 /// This function first attempts to construct a source layout that, when
 /// sliced along reduction dimensions, produces a result layout compatible
@ -120,7 +124,13 @@ SliceAttr setupMultiReductionResultLayout(LayoutKind layoutKind,
                                          VectorType srcVectorTy,
                                          DistributeLayoutAttr consumerLayout,
                                          SmallVector<int64_t> reductionDims,
-                                          const uArch::uArch *uArch);
+                                          int numSg, const uArch::uArch *uArch);
+
+/// Sets up layout for Reduction operations by creating a SliceAttr for the
+/// result.
+///
+/// Only LayoutKind::Lane builds a parent layout (a 1-D lane layout derived
+/// from the uArch subgroup size and the first dim of `srcVectorTy`); Subgroup
+/// and InstData are not supported because the op is not expected at those
+/// levels.
+SliceAttr setupReductionResultLayout(LayoutKind layoutKind,
+                                     VectorType srcVectorTy,
+                                     const uArch::uArch *uArch);

 /// Setup the result layout attribute for a bitcast operation based on element
 /// type bitwidths. This ensures the source layout can always be derived from
@ -170,8 +180,8 @@ DistributeLayoutAttr setupStoreMatrixAnchorLayout(LayoutKind layoutKind,
 std::optional<std::tuple<DistributeLayoutAttr, DistributeLayoutAttr,
                         DistributeLayoutAttr>>
 setupDpasLayout(LayoutKind layoutKind, VectorType aTy, VectorType bTy,
-                VectorType cdTy, DistributeLayoutAttr consumerLayout,
-                const uArch::uArch *uArch, int numSg);
+                VectorType cdTy, DistributeLayoutAttr consumerLayout, int numSg,
+                const uArch::uArch *uArch);

 /// Gets the expected layout for a given consumer operand. This will check if
 /// the owning operation of the consumer operand is one of the special layout
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPULayoutImpl.cpp
@ -183,6 +183,11 @@ xegpu::inferMultiReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout,
  return sliceLayout.getParent();
 }

+/// Infers the source layout of a reduction op from its result layout by
+/// treating it as a multi-reduction whose only reduced dimension is dim 0.
+xegpu::DistributeLayoutAttr
+xegpu::inferReductionSourceLayout(xegpu::DistributeLayoutAttr resLayout) {
+  // Dim 0 is the single reduced dimension handed to the multi-reduction
+  // inference.
+  SmallVector<int64_t> reducedDims = {0};
+  return xegpu::inferMultiReductionSourceLayout(resLayout, reducedDims);
+}
+
 /// Infers the source layout attribute for a transpose operation given the
 /// result layout attribute and permutation.
 xegpu::DistributeLayoutAttr
@ -399,7 +404,7 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
 /// Examples:
 ///   1. Subgroup layout - Row reduction on 2D tensor:
 ///      srcShape=[32, 128], reductionDims=[1], resShape=[32], subgroupSize=16,
-///      workgroupSize=32
+///      NumSg=32
 ///      * Consumer Layout:
 ///        #xegpu.slice<#xegpu.layout<sg_layout=[4, 8], sg_data=[8, 8]>, dims =
 ///        [1]>}
@ -440,15 +445,11 @@ xegpu::inferShapeCastSourceLayout(xegpu::DistributeLayoutAttr resLayout,
 xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
    xegpu::LayoutKind layoutKind, VectorType srcVecTy,
    DistributeLayoutAttr consumerLayout, SmallVector<int64_t> reductionDims,
-    const xegpu::uArch::uArch *uArch) {
+    int numSg, const xegpu::uArch::uArch *uArch) {

  auto srcShape = srcVecTy.getShape();
  int srcRank = srcShape.size();
-  auto context = consumerLayout.getContext();
-
-  // Reduction layout requires at least 2D tensors
-  if (srcRank < 2)
-    return nullptr;
+  auto context = srcVecTy.getContext();

  // Helper lambda to convert int64 vectors to int32 DenseArrayAttr
  auto toInt32Attr = [&](ArrayRef<int64_t> vec) {
@ -456,21 +457,12 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
    return DenseI32ArrayAttr::get(context, vec32);
  };

-  const int workgroupSize = consumerLayout.getNumSubgroups();
  const int subgroupSize = uArch->getSubgroupSize();
  int64_t maxReduceVectorSize = 1; // could extend to spirv vector Size
-
-  SmallVector<int64_t> consumerSgLayout =
-      consumerLayout.getEffectiveSgLayoutAsInt();
-  SmallVector<int64_t> consumerLaneLayout =
-      consumerLayout.getEffectiveLaneLayoutAsInt();
-  SmallVector<int64_t> consumerOrder = consumerLayout.getEffectiveOrderAsInt();
-  DenseI32ArrayAttr orderAttr = consumerLayout.getOrder();
-
  xegpu::DistributeLayoutAttr srcLayout;
  if (layoutKind == xegpu::LayoutKind::Subgroup) {
    xegpu::SliceAttr consumerSliceLayout =
-        dyn_cast<xegpu::SliceAttr>(consumerLayout);
+        dyn_cast_if_present<xegpu::SliceAttr>(consumerLayout);
    if (consumerSliceLayout &&
        consumerSliceLayout.getDims().asArrayRef().equals(reductionDims)) {
      srcLayout = consumerSliceLayout.getParent();
@ -482,9 +474,17 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
          srcLayout = srcLayout.setDimData(dim, srcSgData.value()[dim], -1, -1);
        }
    } else {
-
+      SmallVector<int64_t> consumerSgLayout =
+          consumerLayout ? consumerLayout.getEffectiveSgLayoutAsInt()
+                         : SmallVector<int64_t>();
+      SmallVector<int64_t> consumerOrder =
+          consumerLayout ? consumerLayout.getEffectiveOrderAsInt()
+                         : SmallVector<int64_t>();
+      DenseI32ArrayAttr orderAttr =
+          consumerLayout ? consumerLayout.getOrder() : nullptr;
      SmallVector<int64_t> sgLayout(srcRank), sgData(srcRank), order(srcRank);
-      int remainingSgCount = workgroupSize;
+      int remainingSgCount =
+          consumerLayout ? consumerLayout.getNumSubgroups() : numSg;
      int consumerIdx = 0;

      // First pass: Match consumer's layout on non-reduction dimensions
@ -502,6 +502,7 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
      }

      // Second pass: Distribute remaining subgroups across reduction dimensions
+      // the reduction to scalar case is handled only by this loop
      int64_t remainOrder = consumerSgLayout.size();
      for (int i = 0; i < srcRank; i++) {
        if (llvm::is_contained(reductionDims, i)) {
@ -525,19 +526,20 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
  } else if (layoutKind == xegpu::LayoutKind::InstData) {

    SmallVector<int64_t> instData(srcRank, 1);
-    instData[srcRank - 2] =
-        std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
+    if (srcRank >= 2)
+      instData[srcRank - 2] =
+          std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
    instData[srcRank - 1] =
        std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
    srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(instData));
-
  } else if (layoutKind == xegpu::LayoutKind::Lane) {

    SmallVector<int64_t> laneLayout(srcRank, 1), laneData(srcRank, 1);
    laneLayout[srcRank - 1] =
        std::min(static_cast<int64_t>(subgroupSize), srcShape[srcRank - 1]);
-    laneData[srcRank - 2] =
-        std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
+    if (srcRank >= 2)
+      laneData[srcRank - 2] =
+          std::min(maxReduceVectorSize, srcShape[srcRank - 2]);
    srcLayout = xegpu::LayoutAttr::get(context, toInt32Attr(laneLayout),
                                       toInt32Attr(laneData));
  }
@ -546,6 +548,38 @@ xegpu::SliceAttr xegpu::setupMultiReductionResultLayout(
                               DenseI64ArrayAttr::get(context, reductionDims));
 }

+/// Sets up layout for Reduction operations by creating a SliceAttr for the
+/// result.
+///
+/// Only LayoutKind::Lane is supported. For it, the parent layout is 1-D with
+/// lane_layout = [min(subgroupSize, srcShape[0])] and lane_data = [1]; the
+/// returned layout is that parent sliced along dim 0.
+///
+/// Subgroup and InstData kinds trip an assertion, because the op is not
+/// expected at those levels.
+xegpu::SliceAttr
+xegpu::setupReductionResultLayout(xegpu::LayoutKind layoutKind,
+                                  VectorType srcVecTy,
+                                  const xegpu::uArch::uArch *uArch) {
+
+  auto srcShape = srcVecTy.getShape();
+  auto context = srcVecTy.getContext();
+  auto subgroupSize = uArch->getSubgroupSize();
+  xegpu::LayoutAttr srcLayout;
+
+  // The unsupported kinds assert on `false` so the diagnostic actually fires;
+  // `assert(true && "...")` can never fail. In builds without assertions
+  // `srcLayout` is left unset for these kinds.
+  if (layoutKind == xegpu::LayoutKind::Subgroup) {
+    assert(false &&
+           "subgroup layout assignment not supported for reduction (op "
+           "is not expected at this level).");
+  } else if (layoutKind == xegpu::LayoutKind::InstData) {
+    assert(false &&
+           "instData layout assignment not supported for reduction (op "
+           "is not expected at this level).");
+  } else if (layoutKind == xegpu::LayoutKind::Lane) {
+    SmallVector<int32_t> laneLayout(1), laneData(1);
+    // Clamp in 64 bits before narrowing so a large dimension is never
+    // truncated ahead of the comparison; the result is bounded by
+    // subgroupSize.
+    laneLayout[0] = static_cast<int32_t>(
+        std::min(static_cast<int64_t>(subgroupSize), srcShape[0]));
+    laneData[0] = 1;
+    srcLayout = xegpu::LayoutAttr::get(
+        context, DenseI32ArrayAttr::get(context, laneLayout),
+        DenseI32ArrayAttr::get(context, laneData));
+  }
+
+  // The result layout is the source layout with dim 0 sliced away.
+  auto result = xegpu::SliceAttr::get(context, srcLayout,
+                                      DenseI64ArrayAttr::get(context, 0));
+  return result;
+}
+
 /// Sets up the result layout for a bitcast operation.
 /// When casting to a smaller bitwidth, adjusts the layout dimensions (sgData,
 /// instData, or laneData) by multiplying by the bitwidth ratio to ensure the
@ -656,7 +690,6 @@ xegpu::DistributeLayoutAttr xegpu::setupInsertStridedSliceResultLayout(
             "srcShape must be divisible by laneLayout for all dimensions");
      laneDataValue = std::min(srcShape[dim] / consumerLaneLayout[dim],
                               consumerLaneData[dim]);
-
      requiredResLayout =
          requiredResLayout.setDimData(dim, -1, -1, laneDataValue);
    }
@ -930,8 +963,8 @@ std::optional<
               xegpu::DistributeLayoutAttr>>
 xegpu::setupDpasLayout(xegpu::LayoutKind layoutKind, VectorType aTy,
                       VectorType bTy, VectorType cdTy,
-                       xegpu::DistributeLayoutAttr consumerLayout,
-                       const xegpu::uArch::uArch *uArch, int numSg) {
+                       xegpu::DistributeLayoutAttr consumerLayout, int numSg,
+                       const xegpu::uArch::uArch *uArch) {
  auto context = aTy.getContext();
  const auto *uArchInstruction =
      dyn_cast<xegpu::uArch::SubgroupMatrixMultiplyAcc>(uArch->getInstruction(
@ -1079,7 +1112,7 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
  Operation *op = operand.getOwner();
  unsigned idx = operand.getOperandNumber();
  xegpu::DistributeLayoutAttr resLayout;
-  if (op->getNumResults() == 1 && isa<VectorType>(op->getResult(0).getType()))
+  if (op->getNumResults() == 1)
    resLayout = xegpu::getDistributeLayoutAttr(op->getResult(0));

  // For vector::BroadcastOp, infer the source layout from the result layout.
@ -1108,6 +1141,12 @@ xegpu::DistributeLayoutAttr xegpu::getConsumerLayoutAt(OpOperand &operand) {
      return resLayout;
  }

+  if (auto reduction = dyn_cast<vector::ReductionOp>(op)) {
+    if (!resLayout)
+      return xegpu::DistributeLayoutAttr();
+    return xegpu::inferReductionSourceLayout(resLayout);
+  }
+
  // For vector::BitCastOp, infer source layout from result layout using
  // element type bitwidths.
  if (auto bitcast = dyn_cast<vector::BitCastOp>(op)) {
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@ -365,6 +365,10 @@ private:
                                   ArrayRef<LayoutInfoLattice *> operands,
                                   ArrayRef<const LayoutInfoLattice *> results);

+  void visitVectorReductionOp(vector::ReductionOp reduction,
+                              ArrayRef<LayoutInfoLattice *> operands,
+                              ArrayRef<const LayoutInfoLattice *> results);
+
  void visitVectorBroadCastOp(vector::BroadcastOp broadcast,
                              ArrayRef<LayoutInfoLattice *> operands,
                              ArrayRef<const LayoutInfoLattice *> results);
@ -461,6 +465,9 @@ LogicalResult LayoutInfoPropagation::visitOperation(
      .Case([&](vector::MultiDimReductionOp reductionOp) {
        visitVectorMultiReductionOp(reductionOp, operands, results);
      })
+      .Case([&](vector::ReductionOp reductionOp) {
+        visitVectorReductionOp(reductionOp, operands, results);
+      })
      .Case([&](vector::BroadcastOp broadcastOp) {
        visitVectorBroadCastOp(broadcastOp, operands, results);
      })
@ -625,10 +632,17 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
    vector::MultiDimReductionOp reduction,
    ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
+  Type resultTy = reduction.getDestType();
  // The layout of the result must be present.
  LayoutInfo resLayoutInfo = results[0]->getValue();
-  if (!resLayoutInfo.isAssigned())
-    return;
+
+  xegpu::DistributeLayoutAttr consumerLayoutAttr;
+  if (!resultTy.isIntOrFloat()) {
+    if (!resLayoutInfo.isAssigned())
+      return;
+    consumerLayoutAttr =
+        dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+  }

  VectorType sourceTy = reduction.getSourceVectorType();
  SmallVector<int64_t> reductionDims(reduction.getReductionDims());
@ -636,8 +650,12 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
  const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
  if (!uArch)
    return;
-  auto consumerLayoutAttr =
-      dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
+  int numSg = 0;
+  if (layoutKind == xegpu::LayoutKind::Subgroup) {
+    auto numSgOrErr = getNumSg(reduction, uArch->getSubgroupSize());
+    if (succeeded(numSgOrErr))
+      numSg = numSgOrErr.value();
+  }

  // The result layout represents the layout requirements of the operation.
  // it is recorded to anchor layout or temporary layout.
@ -645,7 +663,7 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
  // propagated from consumer op, the conflict is resolved in later phase by
  // converting the required result layout to the consumer layout
  auto requiredResLayoutAttr = xegpu::setupMultiReductionResultLayout(
-      layoutKind, sourceTy, consumerLayoutAttr, reductionDims, uArch);
+      layoutKind, sourceTy, consumerLayoutAttr, reductionDims, numSg, uArch);

  xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);

@ -659,6 +677,26 @@ void LayoutInfoPropagation::visitVectorMultiReductionOp(
                     operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
 }

+/// Propagates layouts through a vector.reduction op: computes the layout the
+/// op requires for its result, records it as a temporary layout on the result,
+/// and pushes the matching layouts to the source operand and, when present,
+/// the accumulator operand.
+void LayoutInfoPropagation::visitVectorReductionOp(
+    vector::ReductionOp reduction, ArrayRef<LayoutInfoLattice *> operands,
+    ArrayRef<const LayoutInfoLattice *> results) {
+  // Nothing can be derived without a known target uArch.
+  const uArch *targetUArch =
+      getUArch(xegpu::getChipStr(reduction).value_or(""));
+  if (!targetUArch)
+    return;
+
+  // Layout required on the result, derived from the source vector type.
+  xegpu::SliceAttr resultLayout = xegpu::setupReductionResultLayout(
+      layoutKind, reduction.getSourceVectorType(), targetUArch);
+  xegpu::setTemporaryLayout(reduction->getResult(0), resultLayout);
+
+  // The source operand receives the layout inferred from the result layout.
+  xegpu::DistributeLayoutAttr sourceLayout =
+      xegpu::inferReductionSourceLayout(resultLayout);
+  propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sourceLayout)));
+
+  // The optional accumulator receives the result layout itself.
+  if (!reduction.getAcc())
+    return;
+  propagateIfChanged(operands[1],
+                     operands[1]->meet(LayoutInfo(resultLayout)));
+}
+
+
 void LayoutInfoPropagation::visitVectorBroadCastOp(
    vector::BroadcastOp broadcast, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
@ -765,7 +803,7 @@ void LayoutInfoPropagation::visitDpasOp(
      numSg = numSgOrErr.value();
    }
    auto layouts = xegpu::setupDpasLayout(layoutKind, aTy, bTy, cdTy,
-                                          consumerLayoutAttr, uArch, numSg);
+                                          consumerLayoutAttr, numSg, uArch);
    if (!layouts.has_value()) {
      dpas.emitWarning(
          "Failed to determine required layouts for DPAS operands.");
@ -1286,6 +1324,7 @@ private:
  OpBuilder builder;
  LogicalResult resolveTensorDescConsumer(OpOperand &operand);
  LogicalResult resolveVectorConsumer(OpOperand &operand);
+  LogicalResult assignResultLayout(OpResult &result);
 };

 } // namespace
@ -1294,6 +1333,21 @@ LogicalResult ResolveLayoutConflicts::run() {
  // Scan all operations in the parent op and resolve layout conflicts at
  // tensor descriptor and vector use points.
  auto r = parentOp->walk([&](Operation *op) -> WalkResult {
+    // If the operation takes a vector input and produces a scalar output
+    // (e.g. multi-reduction), we need to check whether the result has a layout
+    // and add a convert_layout to serve as the anchor op for the reduction
+    // op's layout.
+    if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
+      for (OpResult result : op->getResults()) {
+        if (result.getType().isIntOrFloat()) {
+          auto res = assignResultLayout(result);
+          if (failed(res)) {
+            DBGS() << "Failed to resolve vector consumer for multi-reduction "
+                   << *op << "\n";
+            return WalkResult::interrupt();
+          }
+        }
+      }
+    }
    for (OpOperand &operand : op->getOpOperands()) {
      // Handle conflicts in tensor descriptor operands.
      Type operandType = operand.get().getType();
@ -1321,6 +1375,18 @@ LogicalResult ResolveLayoutConflicts::run() {
  return r.wasInterrupted() ? failure() : success();
 }

+/// Anchors the layout of `result` by inserting an xegpu.convert_layout right
+/// after it whose input and target layouts are both the layout currently
+/// attached to `result`, then reroutes every other use of `result` through the
+/// new op.
+///
+/// Returns success() without touching the IR when `result` carries no layout,
+/// since there is nothing to anchor in that case.
+LogicalResult ResolveLayoutConflicts::assignResultLayout(OpResult &result) {
+  Operation *producerOp = result.getDefiningOp();
+  auto producerLayout = xegpu::getDistributeLayoutAttr(result);
+  // convert_layout declares both layout attributes as non-optional, so
+  // building it from a missing layout would leave an invalid op behind.
+  if (!producerLayout)
+    return success();
+  // Insert a convert_layout op to assign the layout.
+  builder.setInsertionPointAfterValue(result);
+  auto convertOp = xegpu::ConvertLayoutOp::create(
+      builder, producerOp->getLoc(), result.getType(), result, producerLayout,
+      producerLayout);
+  // Keep the new op's own operand pointing at the original result.
+  result.replaceAllUsesExcept(convertOp.getResult(), convertOp);
+  return success();
+}
+
+
 LogicalResult
 ResolveLayoutConflicts::resolveVectorConsumer(OpOperand &operand) {
  Value vectorValue = operand.get();
--- a/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout-subgroup.mlir
@ -128,7 +128,7 @@ gpu.module @test {
 gpu.module @test {
 // CHECK-LABEL: vector_row_reduction
 // CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [1]>}
-  gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) {
+  gpu.func @vector_row_reduction(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
    %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
    %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
    %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
@ -140,10 +140,23 @@ gpu.module @test {
  }
 }

+// -----
+gpu.module @test {
+// Reduces a 32x64 vector over both dims [0, 1] to an f32 scalar and expects
+// the multi_reduction result to carry a slice layout over dims [0, 1].
+// CHECK-LABEL: vector_row_reduction_scalar
+// CHECK: %[[REDUCE:.*]] = vector.multi_reduction <add>, %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [1, 64]>, dims = [0, 1]>}
+  gpu.func @vector_row_reduction_scalar(%src: memref<32x64xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
+    %cst = arith.constant 0.000000e+00 : f32
+    %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32>
+    %load = xegpu.load_nd %tdesc_src : !xegpu.tensor_desc<32x64xf32> -> vector<32x64xf32>
+    %reduce = vector.multi_reduction <add>, %load, %cst [0, 1] : vector<32x64xf32> to f32
+    gpu.return
+  }
+}
+
 // -----
 gpu.module @test {
 // CHECK-LABEL: vector_nest_reduction
-  gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
+  gpu.func @vector_nest_reduction(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
    %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
    %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
    %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
@ -181,7 +194,7 @@ gpu.module @test {
 // CHECK: xegpu.store %[[REDUCE2]], %{{.*}}[%[[OFFSET]]], %[[MASK]]
 // CHECK-SAME: <{layout = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [1, 4, 8], sg_data = [4, 8, 32]>, dims = [0]>, dims = [1]>}>
 // CHECK-SAME: : vector<32xf32>, memref<32xf32>, vector<32xindex>, vector<32xi1>
-  gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) {
+  gpu.func @vector_nest_reduction_with_nest_slice_layout(%src: memref<32x128xf32>, %dst: memref<32xf32>) kernel attributes {known_block_size = array<i32: 1, 32, 16>} {
    %cst = arith.constant dense<0.000000e+00> : vector<32xf32>
    %cst1 = arith.constant dense<0.000000e+00> : vector<32x128xf32>
    %tdesc_src = xegpu.create_nd_tdesc %src : memref<32x128xf32> -> !xegpu.tensor_desc<32x128xf32>
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@ -746,6 +746,54 @@ func.func @vector_2d_reduction_with_fractional_subgroup_size(%arg0: memref<1024x
  }
 }

+// -----
+gpu.module @test {
+// Reduces a vector<1x1x1xf16> over dims [0, 1, 2] to an f16 scalar that feeds
+// a scalar xegpu.store; the expected lane layouts are spelled out below.
+// CHECK-LABEL: func.func @vector_2d_reduction_scalar(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} dense<true> : vector<1xi1>
+// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [1], lane_data = [1]>} : vector<1xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [1], lane_data = [1]>}> : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+// CHECK: %[[SC:.*]] = vector.shape_cast %[[LOAD]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>} : vector<1xf16> to vector<1x1x1xf16>
+// CHECK: %[[ACC:.*]] = arith.constant 0.000000e+00 : f16
+// CHECK: %[[RED:.*]] = vector.multi_reduction <add>, %[[SC]], %[[ACC]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 1, 1], lane_data = [1, 1, 1]>, dims = [0, 1, 2]>} [0, 1, 2] : vector<1x1x1xf16> to f16
+// CHECK: %[[MASK:.*]] = arith.constant true
+// CHECK: %[[OFF:.*]] = arith.constant 1 : index
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1
+func.func @vector_2d_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+    %cst = arith.constant dense<true> : vector<1xi1>
+    %0 = vector.step : vector<1xindex>
+    %1 = xegpu.load %arg0[%0], %cst  : memref<1024xf16>, vector<1xindex>, vector<1xi1> -> vector<1xf16>
+    %2 = vector.shape_cast %1 : vector<1xf16> to vector<1x1x1xf16>
+    %cst_0 = arith.constant 0.000000e+00 : f16
+    %4 = vector.multi_reduction <add>, %2, %cst_0 [0, 1, 2] : vector<1x1x1xf16> to f16
+    %cst_2 = arith.constant true
+    %cst_3 = arith.constant 1 : index
+    xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
+    return
+  }
+}
+
+// -----
+gpu.module @test {
+// Reduces a vector<16xf16> to an f16 scalar with vector.reduction and expects
+// the result to carry a slice of the 16-lane layout over dim 0, with that
+// 16-lane layout propagated back to the load and its operands.
+// CHECK-LABEL: func.func @vector_reduction_scalar(
+// CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1>
+// CHECK: %[[IDX:.*]] = vector.step {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : vector<16xindex>
+// CHECK: %[[LOAD:.*]] = xegpu.load %arg0[%[[IDX]]], %[[CST]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+// CHECK: %[[RED:.*]] = vector.reduction <add>, %[[LOAD]] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [16], lane_data = [1]>, dims = [0]>} : vector<16xf16> into f16
+// CHECK: %[[MASK:.*]] = arith.constant true
+// CHECK: %[[OFF:.*]] = arith.constant 1 : index
+// CHECK: xegpu.store %[[RED]], %arg1[%[[OFF]]], %[[MASK]] : f16, memref<16xf16>, index, i1
+func.func @vector_reduction_scalar(%arg0: memref<1024xf16>, %arg1: memref<16xf16>) {
+    %cst = arith.constant dense<true> : vector<16xi1>
+    %0 = vector.step : vector<16xindex>
+    %1 = xegpu.load %arg0[%0], %cst  : memref<1024xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16>
+    %4 = vector.reduction <add>, %1: vector<16xf16> into f16
+    %cst_2 = arith.constant true
+    %cst_3 = arith.constant 1 : index
+    xegpu.store %4, %arg1[%cst_3], %cst_2 : f16, memref<16xf16>, index, i1
+    return
+  }
+}
+
 // -----
 gpu.module @test {
 // CHECK-LABEL: func.func @vector_2d_reduction_with_fractional_subgroup_size_1x4(