llvm-project/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp

//===- XeGPUSubgroupDistribute.cpp - XeGPU Subgroup Distribute Pass -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeRange.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/InliningUtils.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallVectorExtras.h"

namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir

#define DEBUG_TYPE "xegpu-subgroup-distribute"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

using namespace mlir;

// Name of the unit attribute attached to the UnrealizedConversionCastOps that
// this file inserts to resolve SIMT type mismatches, so that those casts can
// be identified later.
static const char *const resolveSIMTTypeMismatch =
    "resolve_simt_type_mismatch";

namespace {

//===----------------------------------------------------------------------===//
// SIMT Distribution Patterns
//===----------------------------------------------------------------------===//

/// In certain cases, we may need to favor XeGPU specific distribution patterns
/// over generic vector distribution patterns. In such cases, we can assign
/// priorities to patterns.
/// `AboveRegular` has a larger numeric value than `Regular`.
/// NOTE(review): presumably these values are passed as the pattern benefit at
/// registration time (higher wins) -- confirm at the registration site.
enum PatternHierarchy : unsigned { Regular = 1, AboveRegular = 2 };

/// Reconciles the type of `orig` (a value distributed out of
/// gpu.warp_execute_on_lane0) with the type `expected` required by the XeGPU
/// SIMT op that consumes it. Returns `orig` unchanged when the types already
/// match; otherwise returns the result of a newly created cast op.
///
/// Example 1 (vector values are reconciled with vector.shape_cast):
///   distributed type: vector<8x1xf32>
///   expected type: vector<8xf32>
///   resolved using,
///   %0 = vector.shape_cast %1 : vector<8x1xf32> to vector<8xf32>
///
/// Example 2 (tensor descriptors are reconciled with an
/// unrealized_conversion_cast tagged with `resolveSIMTTypeMismatch`):
///   distributed type: xegpu.tensor_desc<8x16xf32, #xegpu.layout<...>>
///   expected type: xegpu.tensor_desc<8x16xf32>
///   resolved using,
///   %0 = unrealized_conversion_cast %1 :
///      xegpu.tensor_desc<8x16xf32, #xegpu.layout<..>> ->
///      xegpu.tensor_desc<8x16xf32>
///
/// Any other kind of value is a programming error and hits llvm_unreachable.
template <typename T>
static Value resolveDistributedTy(Value orig, T expected,
                                  PatternRewriter &rewriter) {
  Type currentTy = orig.getType();
  // Nothing to reconcile when both types already agree.
  if (currentTy == expected)
    return orig;

  Location loc = orig.getLoc();

  // Vector values: a shape cast bridges the two vector types.
  if (isa<VectorType>(currentTy))
    return vector::ShapeCastOp::create(rewriter, loc, expected, orig)
        .getResult();

  // Tensor descriptors: bridge with an unrealized conversion cast and tag it
  // so that it can be recognized as a SIMT type-mismatch cast later.
  if (isa<xegpu::TensorDescType>(currentTy)) {
    auto bridgeCast =
        UnrealizedConversionCastOp::create(rewriter, loc, expected, orig);
    bridgeCast->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
    return bridgeCast.getResult(0);
  }

  llvm_unreachable("Unsupported type for reconciliation");
  return orig;
}

/// Returns, in increasing order, the indices of the dimensions whose size
/// differs between `originalType` (the sequential vector type) and
/// `distributedType` (its distributed counterpart). Both types must have the
/// same rank.
static SmallVector<int64_t> getDistributedDims(VectorType originalType,
                                               VectorType distributedType) {
  assert(originalType.getRank() == distributedType.getRank() &&
         "sequential and distributed vector types must have the same rank");
  ArrayRef<int64_t> sequentialShape = originalType.getShape();
  ArrayRef<int64_t> perLaneShape = distributedType.getShape();
  const int64_t rank = originalType.getRank();

  SmallVector<int64_t> changedDims;
  for (int64_t dim = 0; dim != rank; ++dim)
    if (sequentialShape[dim] != perLaneShape[dim])
      changedDims.push_back(dim);
  return changedDims;
}

/// Wraps the entire body of a GPUFuncOp in a WarpExecuteOnLane0Op. A new
/// GPUFuncOp with the same name, function type, attributions and attributes
/// is created; the original body is moved into the warp op region, the
/// original return becomes the warp op's yield, and a new return forwards the
/// warp op results.
///
/// The pattern does not apply when no uArch can be resolved from the chip
/// string (the subgroup size comes from it), when the body consists solely of
/// operand-less returns, or when the body already holds a
/// WarpExecuteOnLane0Op.
///
/// Example:
///
/// ```
///   gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
///     ...
///     ...
///     gpu.return %result: vector<8x16xf32>
///   }
/// ```
/// To
/// ```
///   gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
///     %laneid = gpu.lane_id : index
///     %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
///       ...
///       ...
///       gpu.yield %result: vector<8x16xf32>
///     }
///     gpu.return %0
///   }
/// ```
struct MoveFuncBodyToWarpOp : public OpRewritePattern<gpu::GPUFuncOp> {
  using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
                                PatternRewriter &rewriter) const override {
    // The subgroup size is taken from the uArch, so one must be resolvable.
    auto uArch = getUArch(xegpu::getChipStr(gpuFuncOp).value_or(""));
    if (!uArch)
      return rewriter.notifyMatchFailure(
          gpuFuncOp, "Subgroup distribution requires target attribute attached "
                     "to set the warp size");

    // Skip functions whose body is nothing but operand-less returns.
    auto isVoidReturn = [](Operation &op) {
      return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
    };
    if (llvm::all_of(gpuFuncOp.getBody().getOps(), isVoidReturn))
      return failure();

    // Skip functions that have already been wrapped.
    auto isWarpOp = [](Operation &op) {
      return isa<gpu::WarpExecuteOnLane0Op>(op);
    };
    if (llvm::any_of(gpuFuncOp.getBody().getOps(), isWarpOp))
      return failure();

    // Build the replacement function: same signature, attributions and
    // attributes as the original.
    auto typeOfArg = [](BlockArgument arg) { return arg.getType(); };
    SmallVector<Type> workgroupTypes =
        llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(), typeOfArg);
    SmallVector<Type> privateTypes =
        llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(), typeOfArg);
    auto wrappedFunc = gpu::GPUFuncOp::create(
        rewriter, gpuFuncOp.getLoc(), gpuFuncOp.getName(),
        gpuFuncOp.getFunctionType(), workgroupTypes, privateTypes);
    wrappedFunc->setAttrs(gpuFuncOp->getAttrs());

    // Emit the lane id and the warp op into the entry block of the new
    // function. The warp op takes the new function's arguments and produces
    // the original function's result types.
    rewriter.setInsertionPointToEnd(&wrappedFunc.getFunctionBody().front());
    auto laneId = gpu::LaneIdOp::create(
        rewriter, wrappedFunc.getLoc(), rewriter.getIndexType(),
        /** upperBound = **/ mlir::IntegerAttr());
    ArrayRef<Type> resultTypes = gpuFuncOp.getFunctionType().getResults();
    auto warpOp = gpu::WarpExecuteOnLane0Op::create(
        rewriter, laneId.getLoc(), resultTypes, laneId,
        uArch->getSubgroupSize(), wrappedFunc.getArguments(),
        wrappedFunc.getArgumentTypes());
    // Remember the block the warp op was created with; it is erased once the
    // original body has been moved in front of it.
    Block &placeholderBlock = warpOp.getBodyRegion().front();

    // Turn the terminator of the original function's last block into a yield
    // carrying the same operands.
    auto origReturnOp =
        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
    rewriter.setInsertionPointAfter(origReturnOp);
    gpu::YieldOp::create(rewriter, origReturnOp.getLoc(),
                         origReturnOp.getOperands());
    rewriter.eraseOp(origReturnOp);

    // Move the original body into the warp op region and drop the
    // placeholder block.
    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
                                warpOp.getBodyRegion().begin());
    rewriter.eraseBlock(&placeholderBlock);

    // Return the warp op results from the new function and swap it in.
    rewriter.setInsertionPointAfter(warpOp);
    gpu::ReturnOp::create(rewriter, wrappedFunc.getLoc(), warpOp.getResults());
    rewriter.replaceOp(gpuFuncOp, wrappedFunc);
    return success();
  }
};

/// Distribute a create_nd_tdesc feeding into the gpu.yield op of the enclosing
/// `gpu.warp_execute_on_lane_0` region. After the sinking, the warp op still
/// contains the original op, which is no longer used by the yield op (and
/// should be cleaned up later); the warp op instead additionally yields the
/// create_nd_tdesc's operands. The tensor descriptor shape is not distributed
/// because it is a uniform value across all work items within the subgroup.
/// However, the layout information is dropped in the new tensor descriptor
/// type, and a cast back to the warp op's result type is inserted when the two
/// types differ.
///
/// The pattern only applies when the descriptor type carries a layout and the
/// create_nd_tdesc has no offsets.
///
/// Example:
///
/// ```
///   #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
///                   (!xegpu.tensor_desc<4x8xf32, #layout0>) {
///     ...
///     %td = xegpu.create_nd_tdesc %arg0
///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
///     gpu.yield %td
///   }
/// ```
/// To
/// ```
///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) -> (...) {
///     ...
///     %dead = xegpu.create_nd_tdesc %arg0
///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32, #layout0>
///     gpu.yield %dead, %arg0
///   }
///   %td = xegpu.create_nd_tdesc %r#1: memref<4x8xf32>
///                                 -> !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Find a yielded value that is produced by a create_nd_tdesc.
    OpOperand *yieldedDesc =
        getWarpResult(warpOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
    if (!yieldedDesc)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a xegpu::CreateNdDesc op");
    auto descOp = yieldedDesc->get().getDefiningOp<xegpu::CreateNdDescOp>();
    const unsigned resultIdx = yieldedDesc->getOperandNumber();

    xegpu::LayoutAttr descLayout = descOp.getType().getLayoutAttr();
    if (!descLayout)
      return rewriter.notifyMatchFailure(
          descOp, "the tensor descriptor lacks layout attribute");
    // CreateNdOp must not have offsets.
    if (!descOp.getMixedOffsets().empty())
      return rewriter.notifyMatchFailure(
          descOp, "xegpu::CreateNdDescOp must not have offsets");

    // Rebuild the warp op so that it also yields every operand of the
    // original create_nd_tdesc.
    SmallVector<size_t> appendedIndices;
    rewriter.setInsertionPoint(warpOp);
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, /* new yielded values = */ descOp->getOperands(),
        /* new yielded types = */ descOp.getOperandTypes(), appendedIndices);

    SmallVector<Value> forwardedOperands;
    forwardedOperands.reserve(appendedIndices.size());
    for (size_t idx : appendedIndices)
      forwardedOperands.push_back(newWarpOp.getResult(idx));

    // Recreate the descriptor after the warp op. The distributed tensor
    // descriptor type does not contain layout info.
    rewriter.setInsertionPointAfter(newWarpOp);
    xegpu::TensorDescType layoutFreeDescTy = descOp.getType().dropLayouts();
    Value sunkDesc = xegpu::CreateNdDescOp::create(
        rewriter, newWarpOp.getLoc(), layoutFreeDescTy, forwardedOperands,
        descOp->getAttrs());

    // Cast back to the type the warp op result has, then redirect its users.
    Value warpResult = newWarpOp.getResult(resultIdx);
    sunkDesc = resolveDistributedTy(sunkDesc, warpResult.getType(), rewriter);
    rewriter.replaceAllUsesWith(warpResult, sunkDesc);
    return success();
  }
};

/// Distribute a store_nd op at the end of enclosing
/// `gpu.warp_execute_on_lane_0`. In case arguments for the store are passed
/// through the warp op interface they would be propagated as returned values.
/// Source vector is distributed based on lane layout. Appropriate cast ops are
/// inserted if the distributed types do not match the expected xegpu SIMT
/// types.
///
/// The pattern only applies when the store_nd is the op immediately preceding
/// the warp op terminator, has offsets, and its tensor descriptor carries a
/// layout. All checks that can fail are performed before any IR is created so
/// that a match failure leaves the IR untouched.
///
/// Example:
///
/// ```
///   #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
///   gpu.warp_execute_on_lane_0(%laneid) -> () {
///     ...
///     xegpu.store_nd %arg0, %arg1 [%x, %y]: vector<4x8xf32>,
///                                 !xegpu.tensor_desc<4x8xf32, #layout0>
///   }
/// ```
/// To
/// ```
///   %r:4 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
///   !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
///     ...
///     gpu.yield %arg0, %arg1, %x, %y: vector<4x8xf32>,
///     !xegpu.tensor_desc<4x8xf32, #layout0>, index, index
///   }
///   %0 = vector.shape_cast %r#0: vector<4x1xf32> to vector<4xf32>
///   %1 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
///   #layout0>
///     -> !xegpu.tensor_desc<4x8xf32>
///   xegpu.store_nd %0, %1 [%r#2, %r#3]: vector<4xf32>,
///     !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // The store must be the last op before the terminator.
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
    if (!storeOp)
      return failure();

    //===------------------------------------------------------------------===//
    // Match phase: nothing below may create or modify IR until every check
    // that can fail has passed.
    //===------------------------------------------------------------------===//
    SmallVector<OpFoldResult> offsets = storeOp.getMixedOffsets();
    // Expecting offsets to be present.
    if (offsets.empty())
      return rewriter.notifyMatchFailure(storeOp,
                                         "the store op must have offsets");

    xegpu::TensorDescType tensorDescTy = storeOp.getTensorDescType();
    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          storeOp, "the source tensor descriptor lacks layout attribute");

    // Type the warp op will use for the distributed stored value.
    FailureOr<VectorType> distributedTypeByWarpOpOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
    if (failed(distributedTypeByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(storeOp,
                                         "Failed to distribute the type");
    VectorType distributedTypeByWarpOp =
        distributedTypeByWarpOpOrFailure.value();

    // (xegpu-specific) distributed value type supported by the store op. It
    // only depends on the tensor descriptor type, so it is computed up front.
    FailureOr<VectorType> storeNdDistributedValueTyOrFailure =
        xegpu::getDistributedVectorType(tensorDescTy);
    if (failed(storeNdDistributedValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          storeOp, "Failed to get distributed vector type for the store op");

    //===------------------------------------------------------------------===//
    // Rewrite phase.
    //===------------------------------------------------------------------===//
    // Materialize the offsets as values (this may create new ops for
    // attribute offsets).
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, storeOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });

    // Yield the stored value, the tensor descriptor and the offsets from a
    // new warp op.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> newYieldedValues = {storeOp.getValue(),
                                           storeOp.getTensorDesc()};
    SmallVector<Type> newYieldedTypes = {distributedTypeByWarpOp, tensorDescTy};
    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);

    // Create a new store op outside the warp op with the distributed vector
    // type. Tensor descriptor is not distributed.
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newStoreOperands;

    // For the value operand, there can be a mismatch between the vector type
    // distributed by the warp op and (xegpu-specific) distributed type
    // supported by the store op. Type mismatch must be resolved using
    // appropriate cast op.
    newStoreOperands.push_back(resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]),
        storeNdDistributedValueTyOrFailure.value(), rewriter));
    // For the tensor descriptor operand, the layout attribute is dropped after
    // distribution. Types need to be resolved in this case also.
    xegpu::TensorDescType distributedTensorDescTy = tensorDescTy.dropLayouts();
    newStoreOperands.push_back(
        resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
                             distributedTensorDescTy, rewriter));
    // Collect offsets (everything after the value and the descriptor).
    for (size_t i = 2; i < newRetIndices.size(); ++i)
      newStoreOperands.push_back(newWarpOp.getResult(newRetIndices[i]));

    auto newStoreOp =
        xegpu::StoreNdOp::create(rewriter, newWarpOp.getLoc(), TypeRange{},
                                 newStoreOperands, storeOp->getAttrs());
    xegpu::removeLayoutAttrs(newStoreOp);
    rewriter.eraseOp(storeOp);
    return success();
  }
};

/// Distribute a load_nd op feeding into the gpu.yield op of the enclosing
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
/// The warp op will still contain the original op that will not be used by
/// the yield op (and should be cleaned up later). The warp op additionally
/// yields the load's arguments (tensor descriptor and offsets). Only the
/// loaded vector is distributed according to lane layout; the tensor
/// descriptor type is not distributed. Appropriate cast ops are inserted if
/// the distributed types do not match the expected xegpu SIMT types.
///
/// The pattern only applies when the load_nd is the op immediately preceding
/// the warp op terminator, a uArch can be resolved for it, it has offsets, and
/// its tensor descriptor carries a layout. All checks that can fail are
/// performed before any IR is created so that a match failure leaves the IR
/// untouched.
///
/// Example:
///
/// ```
///   #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
///                   (vector<4x1xf32>) {
///     ...
///     %ld = xegpu.load_nd %arg0 [%x, %y]
///       : !xegpu.tensor_desc<4x8xf32, #layout0> -> vector<4x8xf32>
///     gpu.yield %ld
///   }
/// ```
/// To
/// ```
///   %r:4 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4x1xf32>,
///   !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
///     ...
///     %dead = xegpu.load_nd %arg0 [%x, %y]
///       : !xegpu.tensor_desc<4x8xf32, #layout0> -> vector<4x8xf32>
///     gpu.yield %dead, %arg0, %x, %y
///   }
///   %0 = unrealized_conversion_cast %r#1: !xegpu.tensor_desc<4x8xf32,
///        #layout0> -> !xegpu.tensor_desc<4x8xf32>
///   %1 = xegpu.load_nd %0 [%r#2, %r#3]
///     : !xegpu.tensor_desc<4x8xf32> -> vector<4xf32>
///   %2 = vector.shape_cast %1: vector<4xf32> to vector<4x1xf32>
///
/// ```
/// Uses of `%r#0` are replaced with `%2`.
struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
      if (!isa<xegpu::LoadNdOp>(op))
        return false;
      // Make sure the same load op is the last operation in the warp op body.
      // This ensures that the load op is not sunk past other ops, which could
      // violate barrier synchronizations.
      gpu::YieldOp yield = warpOp.getTerminator();
      return yield->getPrevNode() == op;
    });

    if (!operand)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a xegpu::LoadNd op");

    //===------------------------------------------------------------------===//
    // Match phase: nothing below may create or modify IR until every check
    // that can fail has passed.
    //===------------------------------------------------------------------===//
    auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
    // Chip information is required to decide if the layout requires transpose
    // effect.
    auto uArch = getUArch(xegpu::getChipStr(loadOp).value_or(""));
    if (!uArch)
      return rewriter.notifyMatchFailure(
          loadOp, "xegpu::LoadNdOp require target attribute attached to "
                  "determine transpose "
                  "requirement");
    // Expecting offsets to be present.
    SmallVector<OpFoldResult> offsets = loadOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(loadOp,
                                         "the load op must have offsets");

    xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          loadOp, "the source tensor descriptor lacks layout attribute");

    // (xegpu-specific) distributed result type of the new load. It only
    // depends on the tensor descriptor type, so it is computed up front.
    FailureOr<VectorType> loadNdDistValueTyOrFailure =
        xegpu::getDistributedVectorType(tensorDescTy);
    if (failed(loadNdDistValueTyOrFailure))
      return rewriter.notifyMatchFailure(
          loadOp, "Failed to get distributed vector type for the load op");

    unsigned operandIdx = operand->getOperandNumber();
    VectorType distributedTypeByWarpOp =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());

    //===------------------------------------------------------------------===//
    // Rewrite phase.
    //===------------------------------------------------------------------===//
    // Materialize the offsets as values (this may create new ops for
    // attribute offsets).
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, loadOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });

    // Yield the tensor descriptor and the offsets from a new warp op.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> newYieldedValues = {loadOp.getTensorDesc()};
    SmallVector<Type> newYieldedTypes = {tensorDescTy};
    newYieldedValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldedTypes.append(offsetTypes.begin(), offsetTypes.end());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldedValues, newYieldedTypes, newRetIndices);

    // Create a new load op outside the warp op with the distributed vector
    // type.
    rewriter.setInsertionPointAfter(newWarpOp);
    // Distributed tensor descriptor type does not contain layout info.
    xegpu::TensorDescType distributedTensorDescTy = tensorDescTy.dropLayouts();
    SmallVector<Value> newLoadOperands{
        resolveDistributedTy(newWarpOp.getResult(newRetIndices[0]),
                             distributedTensorDescTy, rewriter)};
    // Collect offsets (everything after the descriptor).
    for (size_t i = 1; i < newRetIndices.size(); ++i)
      newLoadOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
    auto newLoadOp = xegpu::LoadNdOp::create(
        rewriter, newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
        newLoadOperands, loadOp->getAttrs());
    xegpu::removeLayoutAttrs(newLoadOp);
    // Set the packed attribute if the layout requires it.
    newLoadOp.setPacked(xegpu::requirePacked(layout));
    // Set the transpose attribute if the layout requires it.
    if (xegpu::requireTranspose(layout, uArch))
      newLoadOp.setTranspose(
          DenseI64ArrayAttr::get(rewriter.getContext(), {1, 0}));
    Value distributedVal = newWarpOp.getResult(operandIdx);
    // There can be a conflict between the vector type distributed by the
    // warp op and (xegpu-specific) distributed type supported by the load
    // op. Resolve these mismatches by inserting a cast.
    Value tyResolvedVal = resolveDistributedTy(
        newLoadOp.getResult(), distributedTypeByWarpOp, rewriter);
    rewriter.replaceAllUsesWith(distributedVal, tyResolvedVal);
    return success();
  }
};

/// Distribute a dpas op feeding into the gpu.yield op of the enclosing
/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
/// The warp op will still contain the original op that will not be used by
/// the yield op (and should be cleaned up later). The warp op additionally
/// yields the dpas's arguments (lhs, rhs and, when present, the accumulator).
/// Appropriate cast ops are inserted if the distributed types do not match
/// the expected xegpu SIMT types.
///
/// The pattern only applies when the dpas carries `xegpu::LayoutAttr` layouts
/// for A, B and the output. All checks that can fail are performed before any
/// IR is created so that a match failure leaves the IR untouched.
///
/// Example:
/// ```
///   #lo_a = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
///   #lo_b = #xegpu.layout<wi_layout = [1, 16], wi_data = [2, 1]>
///   #lo_c = #xegpu.layout<wi_layout = [1, 16], wi_data = [1, 1]>
///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
///                   (vector<8x1xf32>) {
///     ...
///     %dpas = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16> ->
///       vector<8x16xf32>
///     gpu.yield %dpas
///   }
/// ```
/// To
/// ```
///   %r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (vector<8x1xf32>,
///   vector<8x1xf16>, vector<16x1xf16>) {
///     ...
///     %dead = xegpu.dpas %arg0, %arg1: vector<8x16xf16>, vector<16x16xf16>
///       -> vector<8x16xf32>
///     gpu.yield %dead, %arg0, %arg1
///   }
///   %0 = vector.shape_cast %r#1: vector<8x1xf16> to vector<8xf16>
///   %1 = vector.shape_cast %r#2: vector<16x1xf16> to vector<16xf16>
///   %2 = xegpu.dpas %0, %1: vector<8xf16>, vector<16xf16> ->
///     vector<8xf32>
///   %dpas = vector.shape_cast %2: vector<8xf32> to vector<8x1xf32>
/// ```
struct DpasDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred<xegpu::DpasOp>);
    if (!operand)
      return rewriter.notifyMatchFailure(warpOp,
                                         "warp result is not a xegpu::Dpas op");

    //===------------------------------------------------------------------===//
    // Match phase: nothing below may create or modify IR until every check
    // that can fail has passed.
    //===------------------------------------------------------------------===//
    auto dpasOp = operand->get().getDefiningOp<xegpu::DpasOp>();
    unsigned operandIdx = operand->getOperandNumber();

    // The layout attributes may be absent; a plain dyn_cast must not be used
    // on a null attribute, so use the null-tolerant variant and report a match
    // failure below instead.
    xegpu::LayoutAttr layoutA =
        dyn_cast_or_null<xegpu::LayoutAttr>(dpasOp.getLayoutAAttr());
    xegpu::LayoutAttr layoutB =
        dyn_cast_or_null<xegpu::LayoutAttr>(dpasOp.getLayoutBAttr());
    xegpu::LayoutAttr layoutOut =
        dyn_cast_or_null<xegpu::LayoutAttr>(dpasOp.getLayoutCdAttr());

    if (!layoutA || !layoutB || !layoutOut)
      return rewriter.notifyMatchFailure(
          dpasOp,
          "the xegpu::Dpas op lacks layout attribute for A, B or output");

    // Types the warp op will use for the distributed operands/result.
    FailureOr<VectorType> distLhsTypeByWarpOpOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layoutA, dpasOp.getLhsType());
    FailureOr<VectorType> distRhsTypeByWarpOpOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layoutB, dpasOp.getRhsType());
    FailureOr<VectorType> distResultTypeByWarpOpOrFailure =
        xegpu::getDistVecTypeBasedOnLaneLayout(layoutOut,
                                               dpasOp.getResultType());

    if (failed(distLhsTypeByWarpOpOrFailure) ||
        failed(distRhsTypeByWarpOpOrFailure) ||
        failed(distResultTypeByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          dpasOp,
          "Failed to distribute the A, B or output types in xegpu::Dpas op");

    // (xegpu-specific) distributed types expected by the new dpas op. They
    // only depend on the original types and layouts, so they are computed up
    // front.
    FailureOr<VectorType> expectedDistLhsTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA);
    FailureOr<VectorType> expectedDistRhsTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getRhsType(), layoutB);
    FailureOr<VectorType> expectedDistResultTyOrFailure =
        xegpu::getDistributedVectorType(dpasOp.getResultType(), layoutOut);

    if (failed(expectedDistLhsTyOrFailure) ||
        failed(expectedDistRhsTyOrFailure) ||
        failed(expectedDistResultTyOrFailure))
      return rewriter.notifyMatchFailure(
          dpasOp,
          "Failed to get distributed vector type for the dpas operands.");

    //===------------------------------------------------------------------===//
    // Rewrite phase.
    //===------------------------------------------------------------------===//
    llvm::SmallVector<Value, 3> newYieldValues{dpasOp.getLhs(),
                                               dpasOp.getRhs()};
    llvm::SmallVector<Type, 3> newYieldTypes{
        distLhsTypeByWarpOpOrFailure.value(),
        distRhsTypeByWarpOpOrFailure.value()};
    // Dpas acc operand is optional.
    if (dpasOp.getAcc()) {
      newYieldValues.push_back(dpasOp.getAcc());
      newYieldTypes.push_back(distResultTypeByWarpOpOrFailure.value());
    }
    // Create a new warp op that additionally yields the dpas operands.
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);

    // Create a new dpas op outside the warp op.
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newDpasOperands;
    SmallVector<VectorType> newDpasOperandExpectedTypes;

    // Expected operand types, in the same order as the yielded operands.
    newDpasOperandExpectedTypes.push_back(expectedDistLhsTyOrFailure.value());
    newDpasOperandExpectedTypes.push_back(expectedDistRhsTyOrFailure.value());
    VectorType distributedResultTy = expectedDistResultTyOrFailure.value();
    if (dpasOp.getAcc())
      newDpasOperandExpectedTypes.push_back(distributedResultTy);

    // Resolve each warp-distributed operand type to the expected type.
    for (size_t i = 0; i < newRetIndices.size(); ++i) {
      newDpasOperands.push_back(
          resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]),
                               newDpasOperandExpectedTypes[i], rewriter));
    }
    auto newDpasOp = xegpu::DpasOp::create(rewriter, newWarpOp->getLoc(),
                                           distributedResultTy, newDpasOperands,
                                           dpasOp->getAttrs());
    xegpu::removeLayoutAttrs(newDpasOp);
    Value distributedVal = newWarpOp.getResult(operandIdx);
    // Resolve the output type back to the type distributed by the warp op.
    Value typeResolved =
        resolveDistributedTy(newDpasOp.getResult(),
                             distResultTypeByWarpOpOrFailure.value(), rewriter);
    rewriter.replaceAllUsesWith(distributedVal, typeResolved);
    return success();
  }
};

/// Distribute a prefetch_nd op at the end of enclosing
/// `gpu.warp_execute_on_lane_0`. In case arguments for the prefetch are passed
/// through the warp op interface they would be propagated as returned values.
/// Tensor descriptor shape is not distributed because it is a uniform value
/// across all work items within the subgroup. Appropriate cast ops are inserted
/// if the distributed types do not match expected xegpu SIMT types.
///
/// The pattern only matches when the prefetch is the last op before the
/// terminator, has offsets, and its tensor descriptor carries a layout.
///
/// Example:
///
/// ```
///   #layout0 = #xegpu.layout<wi_layout = [1, 8], wi_data = [1, 1]>
///   gpu.warp_execute_on_lane_0(%laneid) -> () {
///     ...
///     xegpu.prefetch_nd %arg0 [%x, %y] : !xegpu.tensor_desc<4x8xf32, #layout0>
///   }
/// ```
/// To
/// ```
///   %r:3 = gpu.warp_execute_on_lane_0(%laneid) -> (
///    !xegpu.tensor_desc<4x8xf32, #layout0>, index, index) {
///     gpu.yield %arg0, %x, %y: !xegpu.tensor_desc<4x8xf32, #layout0>, index,
///     index
///   }
///   %1 = unrealized_conversion_cast %r#0: !xegpu.tensor_desc<4x8xf32,
///     #layout0> -> !xegpu.tensor_desc<4x8xf32>
///   xegpu.prefetch_nd %1 [%r#1, %r#2] : !xegpu.tensor_desc<4x8xf32>
///
/// ```
struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only a prefetch that immediately precedes the terminator is handled.
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto prefetchOp = dyn_cast_or_null<xegpu::PrefetchNdOp>(lastNode);
    if (!prefetchOp)
      return failure();

    SmallVector<OpFoldResult> offsets = prefetchOp.getMixedOffsets();
    // PrefetchNdOp must have offsets.
    if (offsets.empty())
      return rewriter.notifyMatchFailure(prefetchOp,
                                         "the prefetch op must have offsets");

    xegpu::LayoutAttr layout = prefetchOp.getTensorDescType().getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          prefetchOp, "the source tensor descriptor lacks layout attribute");

    // All match-failure checks are done; only now is the IR touched.
    // Materializing the offsets is deliberately deferred to this point because
    // vector::getAsValues may create constant ops for static offsets, and a
    // pattern must leave the IR unchanged when it reports failure.
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, prefetchOp.getLoc(), offsets);
    SmallVector<Type> offsetTypes = llvm::map_to_vector(
        offsetsAsValues, [](Value v) { return v.getType(); });

    // Yield the tensor descriptor followed by the offsets from the warp op.
    SmallVector<Value> newYieldValues = {prefetchOp.getTensorDesc()};
    SmallVector<Type> newYieldTypes = {prefetchOp.getTensorDescType()};
    newYieldValues.append(offsetsAsValues.begin(), offsetsAsValues.end());
    newYieldTypes.append(offsetTypes.begin(), offsetTypes.end());
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices);
    // Create a new prefetch op outside the warp op with updated tensor
    // descriptor type. Source tensor descriptor require type resolution.
    xegpu::TensorDescType newTensorDescTy =
        prefetchOp.getTensorDescType().dropLayouts();
    rewriter.setInsertionPointAfter(newWarpOp);
    SmallVector<Value> newPrefetchOperands = {resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)};
    // Collect offsets. Index 0 is the tensor descriptor, the rest are offsets.
    for (size_t i = 1; i < newRetIndices.size(); ++i)
      newPrefetchOperands.push_back(newWarpOp.getResult(newRetIndices[i]));
    Operation *newPrefetchOp = xegpu::PrefetchNdOp::create(
        rewriter, newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands,
        prefetchOp->getAttrs());
    // The attributes were copied verbatim from the original op; strip the
    // layout attributes from the new op.
    xegpu::removeLayoutAttrs(newPrefetchOp);
    rewriter.eraseOp(prefetchOp);
    return success();
  }
};

/// Sink a gpu::BarrierOp that is the last op before the terminator of the
/// enclosing `gpu.warp_execute_on_lane_0` region. An equivalent barrier is
/// created right after the warp op and the one inside the region is erased.
struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Bail out unless the op right before the terminator is a barrier.
    Operation *preYieldOp = warpOp.getTerminator()->getPrevNode();
    auto barrier = dyn_cast_or_null<gpu::BarrierOp>(preYieldOp);
    if (!barrier)
      return failure();

    // Re-create the barrier after the warp op, carrying over its result types,
    // operands and attributes, then drop the original from the region.
    Location barrierLoc = barrier.getLoc();
    rewriter.setInsertionPointAfter(warpOp);
    gpu::BarrierOp::create(rewriter, barrierLoc, barrier->getResultTypes(),
                           barrier->getOperands(), barrier->getAttrs());
    rewriter.eraseOp(barrier);
    return success();
  }
};

/// Distribute a scattered store op. The offsets argument is required.
/// Both offset and mask vectors must be 1D and have #subgroup_size elements.
/// The layouts are fixed and implicit: one offset/mask per lane.
/// The pass changes the offset/mask vector shapes to a
/// single-element vector, **it is assumed that their producer will also be
/// distributed**. The payload vector also has a fixed distribution:
///   no chunk size -> vector of one element.
///   chunk size    -> vector of the innermost dimension of the SG-payload.
/// Example 1 (no chunk size):
///    %mask = producer_op : vector<16xi1>
///    %offset = producer_op : vector<16xindex>
///    xegpu.store %payload, %src[%offset], %mask : vector<16xf16>,
///     memref<256xf16>, vector<16xindex>, vector<16xi1>
/// To
///    %mask = producer_op : vector<1xi1>
///    %offset = producer_op : vector<1xindex>
///    xegpu.store %payload, %src[%offset], %mask : vector<1xf16>,
///     memref<256xf16>, vector<1xindex>, vector<1xi1>
/// Example 2 (chunk size, same mask and offsets):
///    xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
///     vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
/// To
///    xegpu.store %payload, %src[%offset], %mask <{chunk_size=8}> :
///     vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
///
/// Note that the store distribution pattern also handles leading unit
/// dimensions in the payload, mask and offsets vectors. In this case the store
/// distribution will only change the dimensions corresponding to the SG
/// distribution and keep the leading unit dimensions unchanged.
/// For example, a store with payload vector<1x16xf16> with lane layout [1, 16 ]
/// will be distributed as vector<1x1xf16>. Shapecast ops are inserted for the
/// offset/mask/payload when necessary so that the distributed store is working
/// on 1D shape vector to match the HW capability.
struct StoreDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Only a scattered store that immediately precedes the terminator is
    // handled.
    Operation *lastNode = warpOp.getTerminator()->getPrevNode();
    auto storeScatterOp = dyn_cast_or_null<xegpu::StoreScatterOp>(lastNode);
    if (!storeScatterOp)
      return failure();
    auto offsets = storeScatterOp.getOffsets();
    if (!offsets || !isa<VectorType>(offsets.getType()))
      return rewriter.notifyMatchFailure(
          storeScatterOp, "Store op must have a vector of offsets argument");
    // The mask and the payload are distributed as vectors below, so reject
    // non-vector forms gracefully instead of asserting in a cast (the
    // scattered load pattern performs the same check on its mask).
    auto maskTy = dyn_cast<VectorType>(storeScatterOp.getMask().getType());
    auto storeVecTy =
        dyn_cast_or_null<VectorType>(storeScatterOp.getValueType());
    if (!maskTy || !storeVecTy)
      return rewriter.notifyMatchFailure(
          storeScatterOp,
          "Store op must have vector payload and mask arguments");
    VectorType offsetsTy = cast<VectorType>(offsets.getType());

    // Add handling for leading unit dimensions support
    int chunkSize = storeScatterOp.getChunkSize().value_or(1);
    int effectiveVecRank = (chunkSize == 1) ? 1 : 2;

    // Check that all leading dimensions are unit dimensions
    for (int i = 0; i < storeVecTy.getRank() - effectiveVecRank; i++) {
      if (storeVecTy.getShape()[i] != 1) {
        return rewriter.notifyMatchFailure(
            storeScatterOp, "Only unit dimensions allowed for the leading "
                            "dimensions of the store vector!");
      }
    }

    // Operand order of the store: 0 = payload, 1 = destination, 2 = offsets,
    // 3 = mask.
    auto layoutPayload =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(0));
    auto layoutOffsets =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(2));
    auto layoutMask =
        xegpu::getTemporaryLayout(storeScatterOp->getOpOperand(3));

    FailureOr<VectorType> distStoreVecByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutPayload, storeVecTy);
    FailureOr<VectorType> distOffsetsByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutOffsets, offsetsTy);
    FailureOr<VectorType> distMaskByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layoutMask, maskTy);
    if (failed(distStoreVecByWarpOpOrFailure) ||
        failed(distOffsetsByWarpOpOrFailure) ||
        failed(distMaskByWarpOpOrFailure)) {
      return rewriter.notifyMatchFailure(
          storeScatterOp,
          "Some vector operands have no layouts, using defaults instead.");
    }

    VectorType distPayloadTy = distStoreVecByWarpOpOrFailure.value();
    VectorType distOffsetsTy = distOffsetsByWarpOpOrFailure.value();
    VectorType distMaskTy = distMaskByWarpOpOrFailure.value();

    // Yield all store operands from the warp op; the destination keeps its
    // type, the vector operands are yielded with their distributed types.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands = storeScatterOp->getOperands();
    SmallVector<Type> operandTypesToYield = {
        distPayloadTy, operands[1].getType(), distOffsetsTy, distMaskTy};

    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypesToYield, newRetIndices);

    rewriter.setInsertionPointAfter(newWarpOp);

    // Distributed store payload type is always 1D without leading unit dims
    VectorType payloadTy1D = VectorType::get({distPayloadTy.getNumElements()},
                                             distPayloadTy.getElementType());

    VectorType distOffsetsTy1D = VectorType::get(
        {distOffsetsTy.getNumElements()}, distOffsetsTy.getElementType());
    VectorType distMaskTy1D = VectorType::get({distMaskTy.getNumElements()},
                                              distMaskTy.getElementType());

    // Resolve distributed types to 1D for SIMT execution
    Value distPayloadVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[0]), payloadTy1D, rewriter);
    Value distOffsetVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[2]), distOffsetsTy1D, rewriter);
    Value distMaskVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[3]), distMaskTy1D, rewriter);

    SmallVector<Value> newStoreScatterOpOperands = {
        distPayloadVal, newWarpOp.getResult(newRetIndices[1]), distOffsetVal,
        distMaskVal};

    // The attributes are copied verbatim from the original op and the layout
    // attributes are stripped from the new op afterwards.
    xegpu::StoreScatterOp newOp = xegpu::StoreScatterOp::create(
        rewriter, newWarpOp.getLoc(), TypeRange{}, newStoreScatterOpOperands,
        storeScatterOp->getAttrs());
    xegpu::removeLayoutAttrs(newOp);
    rewriter.eraseOp(storeScatterOp);
    return success();
  }
};

/// Compute the coordinates used by lane `laneId` to access a matrix payload of
/// shape `payloadShape` that is distributed according to `layout`. The
/// coordinates produced by the layout are combined with `origOffsets` through
/// `xegpu::addWithRightAligned`. Returns an empty vector when the layout fails
/// to compute the distributed coordinates.
static SmallVector<Value> computeDistributedCoordinatesForMatrixOp(
    PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout,
    Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) {
  auto laneCoordSets =
      layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
  if (failed(laneCoordSets))
    return {};
  // Exactly one coordinate set is expected per lane.
  assert(laneCoordSets.value().size() == 1 &&
         "Expected one set of distributed offsets");
  SmallVector<OpFoldResult> combinedCoords = xegpu::addWithRightAligned(
      rewriter, loc, getAsOpFoldResult(laneCoordSets.value()[0]),
      getAsOpFoldResult(origOffsets));
  // Every entry is converted to a Value through llvm::CastTo<Value>.
  SmallVector<Value> coordValues =
      llvm::map_to_vector(combinedCoords, llvm::CastTo<Value>);
  return coordValues;
}

/// Pattern for distributing xegpu::LoadMatrixOp.
/// The load must be the last op before the terminator and its result must be
/// yielded by the warp op. The mem_desc and the offsets are yielded from a new
/// warp op and a new load producing the lane-distributed payload is created
/// after it, with all offsets passed as dynamic values. Unless the op carries
/// the subgroup block IO attribute, per-lane coordinates are derived from the
/// layout and the lane id.
struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode);
    if (!matrixOp)
      return failure();

    OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
      return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op;
    });
    if (!producedByLastLoad)
      return rewriter.notifyMatchFailure(
          warpOp, "The last op is not xegpu::LoadMatrixOp");
    const int operandIdx = producedByLastLoad->getOperandNumber();

    // Validate the payload type before casting the matching warp result, so
    // that a non-vector payload is reported as a match failure.
    VectorType sgPayloadTy =
        dyn_cast<VectorType>(matrixOp.getResult().getType());
    if (!sgPayloadTy)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix op payload must be a vector type");
    VectorType warpResultTy =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());

    auto loc = matrixOp.getLoc();
    auto offsets = matrixOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(matrixOp,
                                         "the load op must have offsets");

    auto layout = matrixOp.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix operation lacks layout attribute");

    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
    if (failed(distPayloadByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          matrixOp, "Failed to distribute matrix op payload based on layout.");

    // All match-failure checks are done; only now is the IR touched.
    // vector::getAsValues may create constant ops for static offsets, and a
    // pattern must leave the IR unchanged when it reports failure.
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, loc, offsets);

    // Yield the mem_desc followed by the offsets from the warp op.
    SmallVector<Value> operands = {matrixOp.getMemDesc()};
    const unsigned offsetsStartIdx = operands.size();
    operands.append(offsetsAsValues);

    SmallVector<Type> operandTypes =
        llvm::map_to_vector(operands, [](Value v) { return v.getType(); });

    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);
    SmallVector<Value> newOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    // Every offset of the new op is passed as an SSA value, so all constant
    // offset slots are marked dynamic.
    SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
                                         ShapedType::kDynamic);
    DenseI64ArrayAttr newConstOffsetsAttr =
        rewriter.getDenseI64ArrayAttr(newConstOffsets);
    ValueRange currentOffsets =
        ValueRange(newOperands).drop_front(offsetsStartIdx);

    SmallVector<Value> newCoords = currentOffsets;
    rewriter.setInsertionPointAfter(newWarpOp);

    // Without the subgroup block IO attribute each lane gets its own
    // coordinates. NOTE(review): the helper returns an empty vector on
    // failure; that case is not handled here.
    if (!matrixOp.getSubgroupBlockIoAttr()) {
      newCoords = computeDistributedCoordinatesForMatrixOp(
          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
          currentOffsets);
    }
    xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create(
        rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure,
        newOperands[0], ValueRange(newCoords), newConstOffsetsAttr,
        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
    // Resolve the output type and replace all uses.
    rewriter.replaceAllUsesWith(
        newWarpOp.getResult(operandIdx),
        resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter));
    return success();
  }
};

/// Pattern for distributing xegpu::StoreMatrixOp.
/// The store must be the last op before the terminator. The payload (with its
/// lane-distributed type), the mem_desc and the offsets are yielded from a new
/// warp op and a new store is created after it, with all offsets passed as
/// dynamic values. Unless the op carries the subgroup block IO attribute,
/// per-lane coordinates are derived from the layout and the lane id.
struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    gpu::YieldOp yield = warpOp.getTerminator();
    Operation *lastNode = yield->getPrevNode();
    auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode);
    if (!matrixOp)
      return failure();

    VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType());
    if (!sgPayloadTy)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix op payload must be a vector type");

    auto loc = matrixOp.getLoc();
    auto offsets = matrixOp.getMixedOffsets();
    if (offsets.empty())
      return rewriter.notifyMatchFailure(matrixOp,
                                         "the store op must have offsets");

    auto layout = matrixOp.getLayoutAttr();
    if (!layout)
      return rewriter.notifyMatchFailure(
          matrixOp, "the matrix operation lacks layout attribute");

    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
    if (failed(distPayloadByWarpOpOrFailure))
      return rewriter.notifyMatchFailure(
          matrixOp, "Failed to distribute matrix op payload based on layout.");

    // All match-failure checks are done; only now is the IR touched.
    // vector::getAsValues may create constant ops for static offsets, and a
    // pattern must leave the IR unchanged when it reports failure.
    SmallVector<Value> offsetsAsValues =
        vector::getAsValues(rewriter, loc, offsets);

    // Yield the payload and the mem_desc followed by the offsets.
    SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()};
    const unsigned offsetsStartIdx = operands.size();
    operands.append(offsetsAsValues);

    // All operands keep their type except the payload, which is yielded with
    // its lane-distributed type.
    SmallVector<Type> operandTypes =
        llvm::map_to_vector(operands, [](Value v) { return v.getType(); });
    operandTypes[0] = *distPayloadByWarpOpOrFailure;

    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);
    SmallVector<Value> newOperands = llvm::map_to_vector(
        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });

    // Every offset of the new op is passed as an SSA value, so all constant
    // offset slots are marked dynamic.
    SmallVector<int64_t> newConstOffsets(matrixOp.getConstOffsets().size(),
                                         ShapedType::kDynamic);
    DenseI64ArrayAttr newConstOffsetsAttr =
        rewriter.getDenseI64ArrayAttr(newConstOffsets);
    ValueRange currentOffsets =
        ValueRange(newOperands).drop_front(offsetsStartIdx);

    SmallVector<Value> newCoords = currentOffsets;
    rewriter.setInsertionPointAfter(newWarpOp);

    // Without the subgroup block IO attribute each lane gets its own
    // coordinates. NOTE(review): the helper returns an empty vector on
    // failure; that case is not handled here.
    if (!matrixOp.getSubgroupBlockIoAttr()) {
      newCoords = computeDistributedCoordinatesForMatrixOp(
          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
          currentOffsets);
    }

    xegpu::StoreMatrixOp::create(
        rewriter, loc, TypeRange{}, newOperands[0], newOperands[1],
        ValueRange(newCoords), newConstOffsetsAttr,
        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
    rewriter.eraseOp(matrixOp);
    return success();
  }
};

/// Distribute a scattered load op. The logic and requirements are the same as
/// for the scattered store distribution. The warpOp's payload vector is
/// expected to be distributed by the load's result consumer.
/// Example 1 (no chunk size):
///    %mask = producer_op : vector<16xi1>
///    %offset = producer_op : vector<16xindex>
///    %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
///    vector<16xindex>, vector<16xi1> -> vector<16xf16>
/// To
///    %mask = producer_op : vector<1xi1>
///    %offset = producer_op : vector<1xindex>
///    %0 = xegpu.load %src[%offset], %mask : memref<256xf16>,
///     vector<1xindex>, vector<1xi1> -> vector<1xf16>
/// Example 2 (chunk size, same mask and offsets):
///    %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
///     memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
/// To
///    %0 = xegpu.load %src[%offset], %mask <{chunk_size=8}> :
///     memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
///
/// Note that the load distribution pattern also handles leading unit dimensions
/// in the payload, mask, and offsets vector. The load distribution will only
/// change the dimensions corresponding to the SG distribution and keep the
/// leading unit dimensions unchanged. For example, a load with result type
/// vector<1x16xf16> with lane layout [1, 16 ] will be distributed
/// as result type vector<1x1xf16>. Shapecast ops are inserted for the
/// offset/mask/payload when necessary so that the distributed load is working
/// on 1D shape vector to match the HW capability.
struct LoadDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Look for a yield operand produced by a scattered load that is also the
    // *last* op before the terminator, so the load is never sunk across a
    // barrier (maintains memory order).
    OpOperand *yieldedLoad = getWarpResult(warpOp, [&](Operation *candidate) {
      if (!isa<xegpu::LoadGatherOp>(candidate))
        return false;
      return warpOp.getTerminator()->getPrevNode() == candidate;
    });
    if (!yieldedLoad)
      return rewriter.notifyMatchFailure(
          warpOp, "The last op is not xegpu::LoadGatherOp");

    auto gatherOp = yieldedLoad->get().getDefiningOp<xegpu::LoadGatherOp>();
    auto offsets = gatherOp.getOffsets();
    // Offsets must be present, and both offsets and mask must be vectors.
    if (!(offsets && isa<VectorType>(offsets.getType()) &&
          isa<VectorType>(gatherOp.getMask().getType())))
      return rewriter.notifyMatchFailure(
          gatherOp,
          "Load op must have a vector arguments for offsets and mask");

    auto offsetsTy = cast<VectorType>(offsets.getType());
    auto maskTy = cast<VectorType>(gatherOp.getMask().getType());
    auto resultVecTy = cast<VectorType>(gatherOp.getResult().getType());

    // Leading unit dimensions support: everything in front of the trailing
    // one (no chunk) or two (chunked) dimensions must be a unit dimension.
    int chunkSize = gatherOp.getChunkSize().value_or(1);
    int effectiveVecRank = (chunkSize == 1) ? 1 : 2;
    for (int dim = 0; dim < resultVecTy.getRank() - effectiveVecRank; ++dim) {
      if (resultVecTy.getShape()[dim] != 1)
        return rewriter.notifyMatchFailure(
            gatherOp, "Only unit dimensions allowed for the leading "
                      "dimensions of the load vector!");
    }

    // Operand order of the load: 0 = source, 1 = offsets, 2 = mask.
    auto offsetsLayout = xegpu::getTemporaryLayout(gatherOp->getOpOperand(1));
    auto maskLayout = xegpu::getTemporaryLayout(gatherOp->getOpOperand(2));

    FailureOr<VectorType> maybeDistOffsetsTy =
        getDistVecTypeBasedOnLaneLayout(offsetsLayout, offsetsTy);
    FailureOr<VectorType> maybeDistMaskTy =
        getDistVecTypeBasedOnLaneLayout(maskLayout, maskTy);
    if (failed(maybeDistOffsetsTy) || failed(maybeDistMaskTy)) {
      return rewriter.notifyMatchFailure(
          gatherOp,
          "Some vector operands have no layouts, using defaults instead.");
    }
    VectorType distOffsetsTy = maybeDistOffsetsTy.value();
    VectorType distMaskTy = maybeDistMaskTy.value();

    // The distributed result type is dictated by the warp op result that the
    // load feeds.
    const unsigned operandIdx = yieldedLoad->getOperandNumber();
    auto distResultTy =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());

    // Yield the load operands from the warp op; the source keeps its type,
    // offsets and mask are yielded with their distributed types.
    SmallVector<Value> operands = gatherOp->getOperands();
    SmallVector<Type> operandTypesToYield = {operands[0].getType(),
                                             distOffsetsTy, distMaskTy};
    SmallVector<size_t> newRetIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypesToYield, newRetIndices);

    rewriter.setInsertionPointAfter(newWarpOp);

    // The distributed load always works on 1D vectors: flatten each
    // distributed type to a single dimension with the same element count.
    auto flattenTo1D = [](VectorType ty) {
      return VectorType::get({ty.getNumElements()}, ty.getElementType());
    };
    VectorType loadVecTy1D = flattenTo1D(distResultTy);
    VectorType distOffsetsTy1D = flattenTo1D(distOffsetsTy);
    VectorType distMaskTy1D = flattenTo1D(distMaskTy);

    Value distOffsetVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[1]), distOffsetsTy1D, rewriter);
    Value distMaskVal = resolveDistributedTy(
        newWarpOp.getResult(newRetIndices[2]), distMaskTy1D, rewriter);

    SmallVector<Value> newLoadGatherOperands = {
        newWarpOp.getResult(newRetIndices[0]), distOffsetVal, distMaskVal};

    // The attributes are copied verbatim from the original op and the layout
    // attributes are stripped from the new op afterwards.
    xegpu::LoadGatherOp newOp = xegpu::LoadGatherOp::create(
        rewriter, newWarpOp.getLoc(), loadVecTy1D, newLoadGatherOperands,
        gatherOp->getAttrs());
    xegpu::removeLayoutAttrs(newOp);

    // Resolve the output type and replace all uses.
    Value distributedVal = newWarpOp.getResult(operandIdx);
    Value resolvedResult =
        resolveDistributedTy(newOp.getResult(), distResultTy, rewriter);
    rewriter.replaceAllUsesWith(distributedVal, resolvedResult);
    return success();
  }
};

// Sink SG-uniform ops. An op is uniform if none
// of its operands/results has a distribution layout attribute.
// Non-uniform vectors are handled by dedicated patterns.
// This pattern must have a higher priority than vector dialect distribution
// patterns, because a distributable shape may be logically intended as
// uniform (i.e., no layout), so we want to omit its distribution.
//
// Only the op immediately preceding the terminator is considered, and ops
// with nested regions are rejected. The op's operands are yielded from a new
// warp op and the op is cloned after it. If the op has results, the warp
// result fed by it is replaced with the matching result of the clone;
// otherwise the original op is erased.
struct SinkUniformOps final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    // Take the last op
    Operation *warpRegionPreYieldOp = warpOp.getTerminator()->getPrevNode();
    // Any ops with nested regions must be handled carefully in dedicated
    // patterns.
    if (!warpRegionPreYieldOp || warpRegionPreYieldOp->getNumRegions())
      return failure();
    int operandIdx = -1;
    // Index of the op result that feeds the matched yield operand. It is not
    // necessarily 0 for ops with several results.
    unsigned resultIdx = 0;
    if (warpRegionPreYieldOp->getNumResults()) {
      OpOperand *operand = getWarpResult(
          warpOp, [&](Operation *op) { return warpRegionPreYieldOp == op; });
      if (!operand)
        return failure();
      operandIdx = operand->getOperandNumber();
      // The matched yield operand is defined by the pre-yield op, so it is
      // one of its results.
      auto yieldedResult = cast<OpResult>(operand->get());
      resultIdx = yieldedResult.getResultNumber();
      // A uniform value has the same type inside and outside the warp op.
      if (yieldedResult.getType() != warpOp.getResult(operandIdx).getType())
        return rewriter.notifyMatchFailure(warpOp,
                                           "The op result is not uniform.");
    }

    // The op must have no layout-based operands or results.
    bool uniformValuesOnly =
        llvm::all_of(warpRegionPreYieldOp->getResults(), [](Value v) {
          return !xegpu::getDistributeLayoutAttr(v);
        });
    uniformValuesOnly &=
        llvm::all_of(warpRegionPreYieldOp->getOpOperands(), [](OpOperand &opr) {
          return !xegpu::getDistributeLayoutAttr(opr);
        });
    if (!uniformValuesOnly)
      return rewriter.notifyMatchFailure(warpOp,
                                         "Some values are not uniform.");
    // Yield every operand of the op, with unchanged types, from the warp op.
    SmallVector<size_t> newRetIndices;
    SmallVector<Value> operands =
        llvm::to_vector_of<Value>(warpRegionPreYieldOp->getOperands());
    SmallVector<Type> operandTypes =
        llvm::to_vector_of<Type>(warpRegionPreYieldOp->getOperandTypes());
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, operands, operandTypes, newRetIndices);

    // Clone the op after the warp op, remapping each operand to the warp
    // result that now carries it.
    rewriter.setInsertionPointAfter(newWarpOp);
    IRMapping operandMapper;
    for (auto [oldOperandIdx, newOperandIdx] : llvm::enumerate(newRetIndices))
      operandMapper.map(warpRegionPreYieldOp->getOperand(oldOperandIdx),
                        newWarpOp->getResult(newOperandIdx));
    Operation *clonedOp = rewriter.clone(*warpRegionPreYieldOp, operandMapper);
    if (!clonedOp->getNumResults())
      rewriter.eraseOp(warpRegionPreYieldOp);
    else {
      assert(operandIdx != -1 && "Expected a warp result for the operation");
      // Replace the warp result with the clone's result that corresponds to
      // the yielded value.
      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx),
                                  clonedOp->getResult(resultIdx));
    }
    return success();
  }
};

/// Helper to rewrite a 2D VectorMultiReductionOp into a sequence of 1D
/// VectorReductionOps. We also insert layouts for the newly created ops.
///
/// `src` is the 2D source vector, `acc` the accumulator vector, `kind` the
/// combining kind and `reductionDim` the dimension (0 or 1) being reduced.
/// One slice of `src` is reduced per element of the result: the slice is
/// extracted, shape-cast to 1D, reduced together with the matching `acc`
/// element and inserted into a result vector of the accumulator's type.
/// Returns the final result vector.
///
/// NOTE(review): the layouts of `src` and `acc` are looked up through
/// `dyn_cast<OpResult>`; confirm callers never pass block arguments here.
static Value lowerToVectorReductions(TypedValue<VectorType> src,
                                     TypedValue<VectorType> acc,
                                     vector::CombiningKind kind,
                                     int64_t reductionDim, Location loc,
                                     PatternRewriter &rewriter) {
  // Expecting a 2D source vector.
  assert(src.getType().getRank() == 2 && "expected a 2D source vector");
  VectorType sourceType = src.getType();
  int64_t sourceH = sourceType.getShape()[0];
  int64_t sourceW = sourceType.getShape()[1];
  // One slice per element along the non-reduced dimension. Kept as int64_t to
  // match the vector shape type.
  int64_t nSlices = (reductionDim == 0) ? sourceW : sourceH;
  // The layouts of the source and the accumulator do not change while the
  // slices are processed, so look them up once.
  auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
  auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
  // Create a constant vector to hold the result of the reduction.
  TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
  Value reductionResult = arith::ConstantOp::create(
      rewriter, loc, acc.getType(),
      DenseElementsAttr::get(acc.getType(), zeroAttr));
  // Reduction result should have the same layout as the accumulator.
  xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
  // For each slice of the source, extract the slice vector, do a reduction
  // and, insert the reduced value back to the result vector.
  for (int64_t i = 0; i < nSlices; ++i) {
    SmallVector<int64_t, 2> sliceOffsets, sliceSizes;
    if (reductionDim == 1) {
      // Reducing along dim 1: slice `i` is row `i`.
      sliceOffsets = {i, 0};
      sliceSizes = {1, sourceW};
    } else {
      // Reducing along dim 0: slice `i` is column `i`.
      sliceOffsets = {0, i};
      sliceSizes = {sourceH, 1};
    }
    vector::ExtractStridedSliceOp extractOp =
        vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                              sliceSizes, {1, 1});

    int64_t nSliceElements = extractOp.getResult().getType().getNumElements();

    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
        rewriter, loc,
        VectorType::get({nSliceElements}, sourceType.getElementType()),
        extractOp.getResult());

    // Shape cast is currently handled in xegpu side. So layouts must be
    // retained during lowering. Shape cast output has the same layout as the
    // accumulator. Shape cast source has the same layout as the original
    // reduction source.
    // TODO: other ops generated here may also need layout attributes.
    xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
    xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
    // Extract and reduction results in scalars, so no result layout is needed.
    Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, i);
    Value reduction = vector::ReductionOp::create(
        rewriter, loc, kind, slice.getResult(), accExtract);
    reductionResult =
        vector::InsertOp::create(rewriter, loc, reduction, reductionResult, i);
  }
  return reductionResult;
}

/// This pattern distributes the `vector.multi_reduction` operation across
/// lanes in a warp. Currently only 2D to 1D reductions are supported. Given
/// layouts for the source and accumulator vectors,
/// * If the reduction dimension is distributed across lanes, the reduction is
///   non-lane-local and the reduction is done using warp shuffles. Here we
///   simply rewrite the MultiDimReductionOp to a sequence of ReductionOps in
///   the warp op body.
/// * If the reduction dimension is not distributed across lanes, the reduction
///   is lane-local. In this case, we yield the source and accumulator vectors
///   from the warp op and perform the lane-local reduction outside the warp op
///   using a sequence of ReductionOps.
/// Example 1 (Reduction is lane-local):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
///   %0 = "some_def"() : () -> (vector<16x32xf32>)
///   %acc = "some_def"() : () -> (vector<32xf32>)
///   %1 = vector.multi_reduction <add>, %0, %acc [0] : vector<16x32xf32> to
///   vector<32xf32>
///   gpu.yield %1 : vector<32xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<16x1xf32>,
/// vector<1xf32>) {
///   %0 = "some_def"() : () -> (vector<16x32xf32>)
///   %acc = "some_def"() : () -> (vector<32xf32>)
///   gpu.yield %0, %acc : vector<16x32xf32>, vector<32xf32>
/// }
/// %c = arith.constant dense<0.0> : vector<1xf32>
/// %1 = vector.shape_cast %r#0 : vector<16x1xf32> to vector<16xf32>
/// %2 = vector.reduction <add>, %1, %r#1 : vector<16xf32> to f32
/// %3 = vector.insert %2, %c[0] : f32 into vector<1xf32>
/// ```
/// Example 2 (Reduction is non-lane-local):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
///   %0 = "some_def"() : () -> (vector<2x32xf32>)
///   %acc = "some_def"() : () -> (vector<2xf32>)
///   %1 = vector.multi_reduction <add>, %0, %acc [1] : vector<2x32xf32> to
///   vector<2xf32>
///   gpu.yield %1 : vector<2xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
///   %0 = "some_def"() : () -> (vector<2x32xf32>)
///   %acc = "some_def"() : () -> (vector<2xf32>)
///   %1 = arith.constant dense<0.0> : vector<2xf32>
///   %2 = vector.extract %0[0] : vector<32xf32> from <vector<2x32xf32>>
///   %3 = ("warp.reduction %2") : f32
///   %4 = vector.insert %3, %1[0] : f32 into vector<2xf32>
///   ... repeat for row 1
///   gpu.yield %1 : vector<2xf32>
/// }
struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yieldOperand =
        getWarpResult(warpOp, llvm::IsaPred<vector::MultiDimReductionOp>);
    if (!yieldOperand)
      return failure();
    auto reductionOp =
        cast<vector::MultiDimReductionOp>(yieldOperand->get().getDefiningOp());
    unsigned operandIdx = yieldOperand->getOperandNumber();
    VectorType sourceType = reductionOp.getSourceVectorType();
    // Only 2D vectors are supported.
    if (sourceType.getRank() != 2)
      return rewriter.notifyMatchFailure(warpOp,
                                         "Only 2D reductions are supported.");
    ArrayRef<int64_t> reductionDims = reductionOp.getReductionDims();
    // Only 1 reduction dimension supported. This also ensures that the result
    // is vector type.
    if (reductionDims.size() != 1)
      return rewriter.notifyMatchFailure(
          warpOp, "Only 1 reduction dimension is supported.");
    int64_t reductionDim = reductionDims[0];
    VectorType distributedResultType =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    VectorType resultType = cast<VectorType>(reductionOp.getType());
    xegpu::DistributeLayoutAttr sourceLayout =
        xegpu::getTemporaryLayout(reductionOp->getOpOperand(0));

    FailureOr<VectorType> sourceDistTypeOrFailure =
        getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
    if (failed(sourceDistTypeOrFailure))
      return rewriter.notifyMatchFailure(
          warpOp, "Failed to distribute the source vector type.");
    VectorType sourceDistType = sourceDistTypeOrFailure.value();
    // Only single dimension distribution is supported.
    bool dim0Distributed =
        sourceDistType.getShape()[0] != sourceType.getShape()[0];
    bool dim1Distributed =
        sourceDistType.getShape()[1] != sourceType.getShape()[1];
    if (dim0Distributed && dim1Distributed)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting source to be distributed in a single dimension.");
    int64_t sourceDistDim = dim0Distributed ? 0 : (dim1Distributed ? 1 : -1);
    if (sourceDistDim == -1)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting a distributed source vector.");
    bool resultDistributed =
        distributedResultType.getNumElements() < resultType.getNumElements();
    // If the lane owns all the data required for reduction (i.e. reduction is
    // fully parallel accross lanes), then each lane owns part of the result
    // (i.e. result is distributed). If the reduction require cross-lane
    // shuffling, then the result is shared among all lanes (broadcasted).
    // Therefore we expect following cases:
    //
    // | Source vector        | Reduction dim  | Result vector  |
    // |----------------------|----------------|----------------|
    // |  dim-0 distributed   |       0        | broadcasted    |
    // |  dim-0 distributed   |       1        | distributed    |
    // |  dim-1 distributed   |       0        | distributed    |
    // |  dim-1 distributed   |       1        | broadcasted    |

    bool isReductionLaneLocal = (sourceDistDim == 0 && reductionDim == 1) ||
                                (sourceDistDim == 1 && reductionDim == 0);
    if (isReductionLaneLocal && !resultDistributed)
      return rewriter.notifyMatchFailure(
          warpOp, "Expecting a distributed result for lane-local reduction.");

    if (!isReductionLaneLocal && resultDistributed)
      return rewriter.notifyMatchFailure(
          warpOp,
          "Expecting a broadcasted result for non-lane-local reduction.");

    // Handle lane-local reduction case. In this case we fully distribute the
    // reduction result.
    if (isReductionLaneLocal) {
      // Yield the source and acc vectors from the WarpOp.
      SmallVector<size_t> newRetIndices;
      auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
          rewriter, warpOp, {reductionOp.getSource(), reductionOp.getAcc()},
          {sourceDistType, distributedResultType}, newRetIndices);
      rewriter.setInsertionPointAfter(newWarpOp);
      Value result = lowerToVectorReductions(
          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[0])),
          cast<TypedValue<VectorType>>(newWarpOp->getResult(newRetIndices[1])),
          reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
      // Replace the warp op result with the final result.
      rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), result);
      return success();
    }
    // For non-lane-local case, we simply rewrite the MultiReductionOp in terms
    // of multiple ReductionOps. Actual distribution is done by the
    // WarpOpReduction pattern.
    rewriter.setInsertionPointAfter(reductionOp);
    Value result = lowerToVectorReductions(
        cast<TypedValue<VectorType>>(reductionOp.getSource()),
        cast<TypedValue<VectorType>>(reductionOp.getAcc()),
        reductionOp.getKind(), reductionDim, reductionOp.getLoc(), rewriter);
    // Replace the warp op result with the final result.
    rewriter.replaceAllUsesWith(reductionOp.getResult(), result);
    return success();
  }
};

/// This pattern distributes the `vector.broadcast` operation across lanes in a
/// warp. The pattern supports three use cases:
///
/// 1) Broadcast a low-rank vector to high-rank vector: The low-rank input
///    vector must have a slice layout of the result. If the distributed source
///    and target vector types are identical, this lowers to a no-op;
///    otherwise, it remains a broadcast but operates on distributed vectors.
///
/// 2) Broadcast a same-rank vector with identical layouts for source and
///    target: The source vector must have unit dimensions, and lane_data must
///    be unit size for those unit dims. This always lowers to a no-op.
///
/// 3) Broadcast a scalar with no layout: This always lowers to a broadcast from
///    scalar to distributed result type.
///
/// A vector source (cases 1 and 2) requires layouts on both the source operand
/// and the result; the pattern fails to match otherwise.
///
/// Example 1 (lowering to a broadcast with distributed types):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
///   %0 = "some_def"() {layout_result_0 =
///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
///     dims = [0]> } : () -> (vector<32xf32>)
///   %2 = vector.broadcast %0 {layout_result_0 =
///     #xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>}
///     : vector<32xf32> to vector<8x32xf32>
///   gpu.yield %2 : vector<8x32xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
///   %0 = "some_def"() {layout_result_0 =
///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
///     dims = [0]> } : () -> (vector<32xf32>)
///   gpu.yield %0 : vector<32xf32>
/// }
/// %2 = vector.broadcast %r#0 : vector<1xf32> to vector<8x1xf32>
/// ```
///
/// Example 2 (no-op):
/// ```
/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
///   %0 = "some_def"() {layout_result_0 =
///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
///     dims = [1]> } : () -> (vector<8xf32>)
///   %1 = vector.shape_cast %0
///     {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
///      1]>}: vector<8xf32> to vector<8x1xf32>
///   %2 = vector.broadcast %1
///     {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
///     1]>}: vector<8x1xf32> to vector<8x32xf32>
///   gpu.yield %2 : vector<8x32xf32>
/// }
/// ```
/// is lowered to:
/// ```
/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
///   %0 = "some_def"() {layout_result_0 =
///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
///     dims = [1]> } : () -> (vector<8xf32>)
///   %1 = vector.shape_cast %0
///     {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
///     1]>}: vector<8xf32> to vector<8x1xf32>
///   gpu.yield %1 : vector<8x1xf32>
/// }
/// // The broadcast is implicit through layout transformation (no-op)
///  "some_use"(%r#0)
/// ```
struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yieldOperand =
        getWarpResult(warpOp, llvm::IsaPred<vector::BroadcastOp>);
    if (!yieldOperand)
      return failure();
    auto broadcastOp =
        cast<vector::BroadcastOp>(yieldOperand->get().getDefiningOp());
    unsigned operandIdx = yieldOperand->getOperandNumber();

    // `sourceType` is null when the broadcast source is a scalar.
    VectorType sourceType = dyn_cast<VectorType>(broadcastOp.getSourceType());
    VectorType destType =
        dyn_cast<VectorType>(broadcastOp.getResult().getType());

    xegpu::DistributeLayoutAttr sourceLayout =
        xegpu::getTemporaryLayout(broadcastOp->getOpOperand(0));
    xegpu::DistributeLayoutAttr resultLayout =
        xegpu::getTemporaryLayout(dyn_cast<OpResult>(broadcastOp.getResult()));

    // Type of the value yielded from the new warp op: the distributed source
    // vector type (cases 1 and 2) or the scalar source type (case 3).
    Type sourceElemOrDistType;
    if (sourceType) {
      // Case 1 and 2: source is a vector type. The layout queries below are
      // member calls on both layouts, so bail out instead of dereferencing a
      // null attribute when either one is missing.
      if (!sourceLayout || !resultLayout)
        return rewriter.notifyMatchFailure(
            warpOp,
            "the source or result of broadcast op lacks distribution layout");

      int64_t rankDiff = destType.getRank() - sourceType.getRank();
      if (rankDiff > 0) {
        // Case 1: source is lower-rank than result.
        bool isSliceOf = sourceLayout.isSliceOf(resultLayout);
        if (!isSliceOf)
          return rewriter.notifyMatchFailure(
              warpOp,
              "Broadcast input layout must be a slice of result layout.");
      }
      // Case 2: source and result have same rank.
      if (rankDiff == 0) {
        auto broadcastUnitDimsSet = broadcastOp.computeBroadcastedUnitDims();
        SmallVector<int64_t> broadcastUnitDims(broadcastUnitDimsSet.begin(),
                                               broadcastUnitDimsSet.end());
        bool isEqualTo = sourceLayout.isEqualTo(resultLayout);
        if (!isEqualTo)
          return rewriter.notifyMatchFailure(
              warpOp, "For same-rank broadcast, source must be identical to "
                      "adjusted result layouts with unit dims.");
        // Adjust both layouts on the broadcasted unit dims before computing
        // the distributed types.
        resultLayout = resultLayout.setUnitDimData(broadcastUnitDims);
        sourceLayout = sourceLayout.setUnitDimLayout(broadcastUnitDims);
      }

      FailureOr<VectorType> sourceDistType =
          getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
      if (failed(sourceDistType)) {
        return rewriter.notifyMatchFailure(
            warpOp, "Failed to distribute the source vector type.");
      }
      sourceElemOrDistType = sourceDistType.value();

    } else {
      // Case 3: source is a scalar type.
      if (sourceLayout) {
        return rewriter.notifyMatchFailure(
            warpOp, "Broadcast from scalar must not have a layout attribute.");
      }
      sourceElemOrDistType = broadcastOp.getSourceType();
    }
    FailureOr<VectorType> destDistType =
        getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
    if (failed(destDistType)) {
      return rewriter.notifyMatchFailure(
          warpOp, "Failed to distribute the dest vector type.");
    }

    // Yield the broadcast source from a new warp op.
    SmallVector<size_t> newRetIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {broadcastOp.getSource()}, sourceElemOrDistType,
        newRetIndices);

    Value distributedSource = newWarpOp.getResult(newRetIndices[0]);

    // When the distributed source and dest types already match, the yielded
    // source is used directly (no-op); otherwise a broadcast on the
    // distributed types is created after the warp op.
    Value newBroadcast = distributedSource;

    if (sourceElemOrDistType != destDistType.value()) {
      rewriter.setInsertionPointAfter(newWarpOp);
      newBroadcast =
          vector::BroadcastOp::create(rewriter, newWarpOp.getLoc(),
                                      destDistType.value(), distributedSource);
    }

    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newBroadcast);
    return success();
  }
};

/// Distributes a `vector.shape_cast` op that feeds the yield of an enclosing
/// `gpu.warp_execute_on_lane_0` region. The shape_cast source is yielded from
/// a new warp op with its distributed type (computed from the source layout),
/// and an equivalent shape_cast producing the distributed result type is
/// created after the warp op. Both the source operand and the result must
/// carry a layout.
struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yielded =
        getWarpResult(warpOp, llvm::IsaPred<vector::ShapeCastOp>);
    if (!yielded)
      return failure();
    auto castOp = cast<vector::ShapeCastOp>(yielded->get().getDefiningOp());
    unsigned resultIdx = yielded->getOperandNumber();
    // The distributed result type is dictated by the existing warp op result.
    auto distResultType =
        cast<VectorType>(warpOp.getResult(resultIdx).getType());

    xegpu::DistributeLayoutAttr srcLayout =
        xegpu::getTemporaryLayout(castOp->getOpOperand(0));
    xegpu::DistributeLayoutAttr dstLayout =
        xegpu::getTemporaryLayout(dyn_cast<OpResult>(castOp.getResult()));
    if (!(srcLayout && dstLayout))
      return rewriter.notifyMatchFailure(
          warpOp,
          "the source or result of shape_cast op lacks distribution layout");

    FailureOr<VectorType> maybeSrcDistType = getDistVecTypeBasedOnLaneLayout(
        srcLayout, castOp.getSourceVectorType());
    if (failed(maybeSrcDistType))
      return rewriter.notifyMatchFailure(
          warpOp, "failed to get distributed vector type for source");
    VectorType srcDistType = maybeSrcDistType.value();

    // Yield the shape_cast source from a new warp op.
    SmallVector<size_t> appendedIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {castOp.getSource()}, {srcDistType},
        appendedIndices);
    rewriter.setInsertionPointAfter(newWarpOp);
    Value distSource = newWarpOp.getResult(appendedIndices[0]);
    // Recreate the shape_cast on the distributed types after the warp op and
    // redirect the users of the old result to it.
    Value distShapeCast = vector::ShapeCastOp::create(
        rewriter, castOp.getLoc(), distResultType, distSource);
    rewriter.replaceAllUsesWith(newWarpOp.getResult(resultIdx), distShapeCast);
    return success();
  }
};

/// Distribute a `vector.extract_strided_slice` op feeding into yield op of an
/// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers
/// advanced cases where the distributed dimension is partially extracted and
/// currently not supported by the generic vector distribution patterns.
///
/// The source of the extract op is yielded from a new warp op and an
/// equivalent extract op is created after it. If the result is distributed
/// (in exactly one dimension), the source type, the size and the offset along
/// that dimension are rewritten in terms of per-lane data; this requires unit
/// lane data in the source layout and a source size and offset along that
/// dimension that are multiples of the subgroup size.
struct VectorExtractStridedSliceDistribution
    : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand =
        getWarpResult(warpOp, llvm::IsaPred<vector::ExtractStridedSliceOp>);
    if (!operand)
      return failure();
    auto extractOp =
        cast<vector::ExtractStridedSliceOp>(operand->get().getDefiningOp());
    unsigned operandIdx = operand->getOperandNumber();
    auto distributedType =
        cast<VectorType>(warpOp.getResult(operandIdx).getType());
    // Find the distributed dimensions.
    auto extractResultType = cast<VectorType>(operand->get().getType());
    auto distributedDims =
        getDistributedDims(extractResultType, distributedType);
    // Collect updated source type, sizes and offsets. They may be adjusted
    // later if the data is distributed to lanes (as opposed to being owned by
    // all lanes uniformly).
    VectorType sourceType = extractOp.getSourceVectorType();
    VectorType updatedSourceType = sourceType;
    SmallVector<Attribute> updatedSizes = llvm::map_to_vector(
        extractOp.getSizes(), [](Attribute attr) { return attr; });
    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
        extractOp.getOffsets(), [](Attribute attr) { return attr; });
    SmallVector<Attribute> updatedStrides = llvm::map_to_vector(
        extractOp.getStrides(), [](Attribute attr) { return attr; });
    // If the provided sizes, offsets, strides are less than the rank, pad them
    // with full sizes, zero offsets, and unit strides. This makes it easier to
    // adjust them later.
    int64_t sourceRank = sourceType.getRank();
    for (int64_t i = extractOp.getSizes().size(); i < sourceRank; ++i) {
      updatedSizes.push_back(
          rewriter.getI64IntegerAttr(sourceType.getDimSize(i)));
      updatedOffsets.push_back(rewriter.getI64IntegerAttr(0));
      updatedStrides.push_back(
          rewriter.getI64IntegerAttr(1)); // stride is always 1.
    }
    // If the result is distributed, it must be distributed in exactly one
    // dimension. In this case, we adjust the source type, sizes and offsets
    // accordingly.
    if (!distributedDims.empty()) {
      if (distributedDims.size() != 1)
        return rewriter.notifyMatchFailure(
            warpOp, "Source can not be distributed in multiple dimensions.");
      int64_t distributedDim = distributedDims[0];
      int64_t sourceDistrDimSize = sourceType.getDimSize(distributedDim);
      auto sourceLayout = xegpu::getTemporaryLayout(extractOp->getOpOperand(0));
      if (!sourceLayout || sourceLayout.getEffectiveLaneLayoutAsInt().empty())
        return rewriter.notifyMatchFailure(
            warpOp, "the source of extract_strided_slice op lacks distribution "
                    "layout");
      auto sourceLaneLayout = sourceLayout.getEffectiveLaneLayoutAsInt();
      // Because only single dimension distribution is supported, lane layout
      // size at the distributed dim must be the subgroup size.
      int64_t subgroupSize = sourceLaneLayout[distributedDim];
      // Check if the source size in the distributed dimension is a multiple of
      // subgroup size.
      if (sourceDistrDimSize % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Source size along distributed dimension is not a multiple of "
            "subgroup size.");
      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
      // We expect lane data to be all ones in this case.
      if (!llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
        return rewriter.notifyMatchFailure(
            warpOp, "Expecting unit lane data in source layout");
      // The offsets in the distributed dimension must be a multiple of
      // subgroup size.
      int64_t distrDimOffset =
          cast<IntegerAttr>(updatedOffsets[distributedDim]).getInt();
      if (distrDimOffset % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp, "Offset along distributed dimension "
                    "is not a multiple of subgroup size.");
      // Computing the distributed source type can fail; report a match
      // failure instead of unwrapping an empty result.
      FailureOr<VectorType> sourceDistTypeOrFailure =
          getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
      if (failed(sourceDistTypeOrFailure))
        return rewriter.notifyMatchFailure(
            warpOp, "failed to get distributed vector type for source");
      updatedSourceType = sourceDistTypeOrFailure.value();
      // Update the distributed sizes to match the distributed type.
      updatedSizes[distributedDim] = rewriter.getI64IntegerAttr(
          distributedType.getDimSize(distributedDim));
      // Update the distributed offsets to match round robin distribution (i.e.
      // each lane owns data at `subgroupSize` stride given unit lane data).
      updatedOffsets[distributedDim] =
          rewriter.getI64IntegerAttr(distrDimOffset / subgroupSize);
    }
    // Do the distribution by yielding the source of the extract op from
    // the warp op and creating a new extract op outside the warp op.
    SmallVector<size_t> newRetIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {extractOp.getSource()}, {updatedSourceType},
        newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);
    Value source = newWarpOp.getResult(newRetIndices[0]);
    // Create a new extract op outside the warp op.
    Value newExtractOp = vector::ExtractStridedSliceOp::create(
        rewriter, extractOp.getLoc(), distributedType, source,
        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
        ArrayAttr::get(rewriter.getContext(), updatedSizes),
        ArrayAttr::get(rewriter.getContext(), updatedStrides));
    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newExtractOp);
    return success();
  }
};

/// Distribute a `vector.insert_strided_slice` op feeding into yield op of an
/// enclosing `gpu.warp_execute_on_lane_0` region. This pattern covers
/// advanced cases where the distributed dimension is partially inserted and
/// currently not supported by the generic vector distribution patterns.
///
/// The pattern only matches when the insert op is the op immediately before
/// the warp op terminator. The value to store and the dest vector are yielded
/// from a new warp op and an equivalent insert op is created after it. If the
/// dest is distributed (in exactly one dimension), the source and dest types
/// and the offset along that dimension are rewritten in terms of per-lane
/// data; this requires unit lane data in both layouts and a source size and
/// dest offset along that dimension that are multiples of the subgroup size.
struct VectorInsertStridedSliceDistribution
    : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) {
      // Check if the InsertStridedSliceOp is the last op before yield op
      return llvm::IsaPred<vector::InsertStridedSliceOp>(op) &&
             warpOp.getTerminator()->getPrevNode() == op;
    });
    if (!operand)
      return failure();
    unsigned int operandNumber = operand->getOperandNumber();
    auto insertOp =
        operand->get().getDefiningOp<vector::InsertStridedSliceOp>();
    auto distributedType =
        cast<VectorType>(warpOp.getResult(operandNumber).getType());
    // Find the distributed dimensions of the dest vector.
    auto insertResultType = cast<VectorType>(operand->get().getType());
    auto destDistributedDims =
        getDistributedDims(insertResultType, distributedType);
    // Collect updated offsets, source type and dest type. They may be adjusted
    // later if the data is distributed to lanes (as opposed to being owned by
    // all lanes uniformly).
    SmallVector<Attribute> updatedOffsets = llvm::map_to_vector(
        insertOp.getOffsets(), [](Attribute attr) { return attr; });
    VectorType updatedSourceType = insertOp.getSourceVectorType();
    VectorType updatedDestType = insertOp.getDestVectorType();
    if (!destDistributedDims.empty()) {
      // Only single dimension distribution is supported.
      if (destDistributedDims.size() != 1)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Expecting source to be distributed in a single dimension.");
      int64_t destDistributedDim = destDistributedDims[0];

      VectorType srcType = insertOp.getSourceVectorType();
      VectorType destType = insertOp.getDestVectorType();
      // Currently we require that both source (kD) and dest (nD) vectors are
      // distributed. This requires that distributedDim (d) is contained in the
      // last k dims of the dest vector (d >= n - k).
      int64_t sourceDistributedDim =
          destDistributedDim - (destType.getRank() - srcType.getRank());
      if (sourceDistributedDim < 0)
        return rewriter.notifyMatchFailure(
            insertOp,
            "distributed dimension must be in the last k (i.e. source "
            "rank) dims of dest vector");
      int64_t srcDistrDimSize = srcType.getDimSize(sourceDistributedDim);
      // Obtain the source and dest layouts.
      auto destLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(1));
      auto sourceLayout = xegpu::getTemporaryLayout(insertOp->getOpOperand(0));
      if (!destLayout || !sourceLayout ||
          destLayout.getEffectiveLaneLayoutAsInt().empty() ||
          sourceLayout.getEffectiveLaneLayoutAsInt().empty())
        return rewriter.notifyMatchFailure(
            warpOp, "the source or dest of insert_strided_slice op lacks "
                    "distribution layout");
      // Because only single dimension distribution is supported, lane layout
      // size at the distributed dim must be the subgroup size.
      int64_t subgroupSize =
          destLayout.getEffectiveLaneLayoutAsInt()[destDistributedDim];
      // We require that source and dest lane data are all ones to ensure
      // uniform round robin distribution.
      auto destLaneData = destLayout.getEffectiveLaneDataAsInt();
      auto sourceLaneData = sourceLayout.getEffectiveLaneDataAsInt();
      if (!llvm::all_of(destLaneData, [](int64_t v) { return v == 1; }) ||
          !llvm::all_of(sourceLaneData, [](int64_t v) { return v == 1; }))
        return rewriter.notifyMatchFailure(
            warpOp, "Expecting unit lane data in source and dest layouts");
      // Source distributed dim size must be multiples of subgroup size.
      if (srcDistrDimSize % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp, "Distributed dimension size in source is not a multiple of "
                    "subgroup size.");
      // Offsets in the distributed dimension must be multiples of subgroup
      // size.
      int64_t destDistrDimOffset =
          cast<IntegerAttr>(insertOp.getOffsets()[destDistributedDim]).getInt();
      if (destDistrDimOffset % subgroupSize != 0)
        return rewriter.notifyMatchFailure(
            warpOp,
            "Offset along distributed dimension in dest is not a multiple of "
            "subgroup size.");
      // Update the source and dest types based on their layouts. Computing a
      // distributed type can fail; report a match failure instead of
      // unwrapping an empty result.
      FailureOr<VectorType> sourceDistTypeOrFailure =
          getDistVecTypeBasedOnLaneLayout(sourceLayout, srcType);
      if (failed(sourceDistTypeOrFailure))
        return rewriter.notifyMatchFailure(
            warpOp, "failed to get distributed vector type for source");
      FailureOr<VectorType> destDistTypeOrFailure =
          getDistVecTypeBasedOnLaneLayout(destLayout, destType);
      if (failed(destDistTypeOrFailure))
        return rewriter.notifyMatchFailure(
            warpOp, "failed to get distributed vector type for dest");
      updatedSourceType = sourceDistTypeOrFailure.value();
      updatedDestType = destDistTypeOrFailure.value();
      // Update the distributed offsets to match round robin distribution (i.e.
      // each lane owns data at `subgroupSize` stride given unit lane data).
      updatedOffsets[destDistributedDim] =
          rewriter.getI64IntegerAttr(destDistrDimOffset / subgroupSize);
    }
    // Do the distribution by yielding the source and dest of the insert op
    // from the warp op and creating a new insert op outside the warp op.
    SmallVector<size_t> newRetIndices;
    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, {insertOp.getValueToStore(), insertOp.getDest()},
        {updatedSourceType, updatedDestType}, newRetIndices);
    rewriter.setInsertionPointAfter(newWarpOp);

    Value valueToStore = newWarpOp.getResult(newRetIndices[0]);
    Value dest = newWarpOp.getResult(newRetIndices[1]);
    // Create a new insert op outside the warp op.
    Value newInsertOp = vector::InsertStridedSliceOp::create(
        rewriter, insertOp.getLoc(), updatedDestType, valueToStore, dest,
        ArrayAttr::get(rewriter.getContext(), updatedOffsets),
        insertOp.getStrides());
    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandNumber),
                                newInsertOp);
    return success();
  }
};

/// Sinks a memref::ExtractAlignedPointerAsIndexOp that feeds the yield of an
/// enclosing `gpu.warp_execute_on_lane_0` region. The op's memref source is
/// yielded from a new warp op with its type unchanged, and the op is recreated
/// after the warp op on that yielded value.
struct MemrefExtractAlignedPointerAsIndexDistribution final
    : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yielded = getWarpResult(
        warpOp, llvm::IsaPred<memref::ExtractAlignedPointerAsIndexOp>);
    if (!yielded)
      return rewriter.notifyMatchFailure(
          warpOp,
          "warp result is not a memref::MemrefExtractAlignedPointerAsIndex op");
    auto pointerOp =
        yielded->get().getDefiningOp<memref::ExtractAlignedPointerAsIndexOp>();
    unsigned resultIdx = yielded->getOperandNumber();

    // Yield the memref itself; its type is not changed by distribution.
    SmallVector<size_t> appendedIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, pointerOp.getSource(),
        TypeRange{pointerOp.getSource().getType()}, appendedIndices);

    // Recreate the op after the warp op and redirect the users of the old
    // warp result to it.
    rewriter.setInsertionPointAfter(newWarpOp);
    Value yieldedMemref = newWarpOp.getResult(appendedIndices[0]);
    auto sunkPointerOp = memref::ExtractAlignedPointerAsIndexOp::create(
        rewriter, newWarpOp.getLoc(), pointerOp.getType(), yieldedMemref);
    rewriter.replaceAllUsesWith(newWarpOp.getResult(resultIdx),
                                sunkPointerOp.getResult());
    return success();
  }
};

/// Distributes a vector::BitCastOp that feeds the yield of an enclosing
/// `gpu.warp_execute_on_lane_0` region. Bitcast only impacts the innermost
/// dimension of the source/result vectors. The bitcast source is yielded from
/// a new warp op with its distributed type (computed using the assigned
/// layout), and an equivalent vector::BitCastOp producing the distributed
/// result type is created after the warp op.
struct VectorBitcastDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yielded =
        getWarpResult(warpOp, llvm::IsaPred<vector::BitCastOp>);
    if (!yielded)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a vector::BitCast op");
    auto castOp = yielded->get().getDefiningOp<vector::BitCastOp>();
    unsigned resultIdx = yielded->getOperandNumber();

    // A failed distribution is mapped to a null type and rejected below.
    xegpu::DistributeLayoutAttr srcLayout =
        xegpu::getTemporaryLayout(castOp->getOpOperand(0));
    VectorType srcDistType =
        getDistVecTypeBasedOnLaneLayout(srcLayout, castOp.getSourceVectorType())
            .value_or(VectorType());
    if (!srcDistType)
      return rewriter.notifyMatchFailure(
          castOp, "Failed to distribute the source vector type in "
                  "vector::BitCast op");
    // The distributed result type is dictated by the existing warp op result.
    auto resultDistType =
        cast<VectorType>(warpOp.getResult(resultIdx).getType());

    // Yield the bitcast source from a new warp op.
    SmallVector<size_t> appendedIndices;
    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
        rewriter, warpOp, castOp.getSource(), TypeRange{srcDistType},
        appendedIndices);

    // Recreate the bitcast on the distributed types after the warp op and
    // redirect the users of the old warp result to it.
    rewriter.setInsertionPointAfter(newWarpOp);
    Value distSource = newWarpOp.getResult(appendedIndices[0]);
    auto distBitcast = vector::BitCastOp::create(rewriter, newWarpOp.getLoc(),
                                                 resultDistType, distSource);
    rewriter.replaceAllUsesWith(newWarpOp.getResult(resultIdx),
                                distBitcast.getResult());
    return success();
  }
};

/// Sinks a vector::TransposeOp that feeds the yield op of an enclosing
/// `gpu.warp_execute_on_lane_0` region out of that region. Only 2D transposes
/// are handled for now. Most of the time the layouts absorb the transpose
/// completely (e.g. 16x1 -> 1x16), so it is a no-op. When a lane still owns
/// several slices of data after distribution (e.g. 16x2 -> 2x16), a
/// lane-local transpose (i.e. shuffle) is required. The pattern therefore
/// yields the transpose source from the warp op with its distributed vector
/// type (derived from the assigned layout) and re-creates an equivalent
/// vector::TransposeOp right after the warp op.
struct VectorTransposeDistribution final : public gpu::WarpDistributionPattern {
  using gpu::WarpDistributionPattern::WarpDistributionPattern;
  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                PatternRewriter &rewriter) const override {
    OpOperand *yielded =
        getWarpResult(warpOp, llvm::IsaPred<vector::TransposeOp>);
    if (!yielded)
      return rewriter.notifyMatchFailure(
          warpOp, "warp result is not a vector::Transpose op");
    auto transpose = yielded->get().getDefiningOp<vector::TransposeOp>();
    VectorType srcVecType = transpose.getSourceVectorType();

    // Both the transpose input and its output must carry a layout.
    xegpu::DistributeLayoutAttr srcLayout =
        xegpu::getTemporaryLayout(transpose->getOpOperand(0));
    xegpu::DistributeLayoutAttr dstLayout =
        xegpu::getTemporaryLayout(transpose->getOpResult(0));
    if (!(srcLayout && dstLayout))
      return rewriter.notifyMatchFailure(
          transpose,
          "the source or result vector of the transpose op lacks layout "
          "attribute");

    // Only 2D transposes are supported for now.
    // TODO: Support nD transposes.
    const bool is2D = srcVecType.getRank() == 2 &&
                      transpose.getResultVectorType().getRank() == 2;
    if (!is2D)
      return rewriter.notifyMatchFailure(
          transpose, "the source or result vector of the transpose op "
                     "does not have 2D layout");

    // The result layout has to be the source layout with `perm` applied.
    ArrayRef<int64_t> perm = transpose.getPermutation();
    if (!dstLayout.isTransposeOf(srcLayout, perm))
      return rewriter.notifyMatchFailure(
          transpose,
          "the source or result vector layouts must be 2D transposes of each "
          "other");

    FailureOr<VectorType> distSrcType =
        getDistVecTypeBasedOnLaneLayout(srcLayout, srcVecType);
    if (failed(distSrcType))
      return rewriter.notifyMatchFailure(
          transpose, "Failed to distribute the source vector type in "
                     "vector::Transpose op");

    // Read the yield operand index before the warp op gets rebuilt.
    unsigned resultIdx = yielded->getOperandNumber();

    // Yield the (distributed) transpose source from a new warp op.
    SmallVector<size_t> appendedIndices;
    gpu::WarpExecuteOnLane0Op rebuiltWarpOp =
        moveRegionToNewWarpOpAndAppendReturns(
            rewriter, warpOp, transpose.getVector(), TypeRange{*distSrcType},
            appendedIndices);

    // Re-create the transpose on the distributed value after the warp op and
    // redirect the users of the original warp result to it.
    rewriter.setInsertionPointAfter(rebuiltWarpOp);
    Value distributedSrc = rebuiltWarpOp.getResult(appendedIndices[0]);
    auto laneTranspose = vector::TransposeOp::create(
        rewriter, rebuiltWarpOp.getLoc(), distributedSrc, perm);
    rewriter.replaceAllUsesWith(rebuiltWarpOp.getResult(resultIdx),
                                laneTranspose.getResult());
    return success();
  }
};

} // namespace

namespace {
/// Pass driver for XeGPU subgroup distribution. Derives from the
/// tablegen-generated base class (see `GEN_PASS_DEF_XEGPUSUBGROUPDISTRIBUTE`)
/// and only overrides `runOnOperation`, which is defined out of line.
struct XeGPUSubgroupDistributePass final
    : public xegpu::impl::XeGPUSubgroupDistributeBase<
          XeGPUSubgroupDistributePass> {
  void runOnOperation() override;
};
} // namespace

/// Adds the XeGPU subgroup distribution patterns to `patterns`. Two benefit
/// levels are used: `PatternHierarchy::Regular` for most patterns and
/// `PatternHierarchy::AboveRegular` for the ones that have to win over the
/// regular vector distribution patterns.
void xegpu::populateXeGPUSubgroupDistributePatterns(
    RewritePatternSet &patterns) {
  MLIRContext *ctx = patterns.getContext();

  // Patterns registered with the regular benefit.
  patterns.add<CreateNdDescDistribution, StoreNdDistribution,
               LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
               GpuBarrierDistribution, VectorMultiReductionDistribution,
               LoadDistribution, StoreDistribution, VectorTransposeDistribution,
               VectorBitcastDistribution, LoadMatrixDistribution,
               StoreMatrixDistribution,
               MemrefExtractAlignedPointerAsIndexDistribution>(
      ctx, /*pattern benefit=*/PatternHierarchy::Regular);

  // The patterns below must override the regular vector distribution
  // patterns, hence the higher benefit.
  patterns
      .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution,
           VectorInsertStridedSliceDistribution, VectorBroadcastDistribution,
           SinkUniformOps>(ctx,
                           /*pattern benefit=*/PatternHierarchy::AboveRegular);
}

/// Adds the `MoveFuncBodyToWarpOp` pattern to `patterns`, using the context
/// the pattern set was created with.
void xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(
    RewritePatternSet &patterns) {
  MLIRContext *ctx = patterns.getContext();
  patterns.add<MoveFuncBodyToWarpOp>(ctx);
}

void XeGPUSubgroupDistributePass::runOnOperation() {
  // Step 1: Attach layouts to op operands.
  // TODO: Following assumptions are made:
  // 1) It is assumed that there are no layout conflicts.
  // 2) Any existing layout attributes attached to the operands are ignored.
  Operation *op = getOperation();
  if (!xegpu::recoverTemporaryLayouts(op)) {
    signalPassFailure();
    return;
  }

  // Step 2: Move all operations of a GPU function inside
  // gpu.warp_execute_on_lane_0 operation.
  {
    RewritePatternSet patterns(&getContext());
    xegpu::populateXeGPUMoveFuncBodyToWarpOpPatterns(patterns);

    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
      signalPassFailure();
      return;
    }
    // At this point, we have moved the entire function body inside the
    // warpOp. Now move any scalar uniform code outside of the warpOp (like
    // GPU index ops, scalar constants, etc.). This will simplify the
    // later lowering and avoid custom patterns for these ops.
    getOperation()->walk([&](Operation *op) {
      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op))
        vector::moveScalarUniformCode(warpOp);
    });
  }
  // Step 3: Apply subgroup to workitem distribution patterns.
  RewritePatternSet patterns(&getContext());
  xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
  // distributionFn is used by vector distribution patterns to determine the
  // distributed vector type for a given vector value. In XeGPU subgroup
  // distribution context, we compute this based on lane layout.
  auto distributionFn = [](Value val) {
    VectorType vecType = dyn_cast<VectorType>(val.getType());
    int64_t vecRank = vecType ? vecType.getRank() : 0;
    if (vecRank == 0)
      return AffineMap::get(val.getContext());
    // Get the layout of the vector type.
    xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(val);
    // If no layout is specified, assume uniform case (no distribution).
    if (!layout)
      return AffineMap::get(val.getContext());
    // Expecting vector and layout rank to match.
    assert(layout.getRank() == vecRank &&
           "Expecting vector and layout rank to match");
    // A dimension is distributed only if layout suggests there are
    // multiple lanes assigned for this dimension and the shape can be evenly
    // distributed to those lanes.
    SmallVector<unsigned int> distributedDims;
    for (auto [i, v] : llvm::enumerate(layout.getEffectiveLaneLayoutAsInt())) {
      if (v > 1 && vecType.getShape()[i] % v == 0)
        distributedDims.push_back(i);
    }
    return AffineMap::getMultiDimMapWithTargets(vecRank, distributedDims,
                                                val.getContext());
  };
  // TODO: shuffleFn is not used.
  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
                      int64_t warpSz) { return Value(); };

  auto warpReduction = [](Location loc, OpBuilder &builder, Value input,
                          vector::CombiningKind kind, uint32_t size) {
    // First reduce on a single thread to get per lane reduction value.
    Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
    // Parallel reduction using butterfly shuffles.
    for (uint64_t i = 1; i < size; i <<= 1) {
      Value shuffled = gpu::ShuffleOp::create(builder, loc, laneVal, i,
                                              /*width=*/size,
                                              /*mode=*/gpu::ShuffleMode::XOR)
                           .getShuffleResult();
      laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
    }
    return laneVal;
  };

  vector::populateDistributeReduction(
      patterns, warpReduction,
      /*pattern benefit=*/PatternHierarchy::Regular);

  vector::populatePropagateWarpVectorDistributionPatterns(
      patterns, distributionFn, shuffleFn,
      /*pattern benefit=*/PatternHierarchy::Regular);
  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
    signalPassFailure();
    return;
  }

  // Step 4: Finally, clean up UnrealizedConversionCastOps that were inserted
  // due to tensor desc type mismatches created by using upstream distribution
  // patterns (scf.for). This cleanup should only be done if all the ops are
  // distributed successfully, if some ops are still not distributed and remains
  // inside any WarpExecuteOnLane0Op we avoid this simplication step to avoid
  // breaking the IR.
  bool foundWarpOp = false;
  getOperation()->walk([&](gpu::WarpExecuteOnLane0Op warpOp) {
    // Look for WarpOps that are not trivially dead.
    if (isOpTriviallyDead(warpOp))
      return WalkResult::advance();
    foundWarpOp = true;
    return WalkResult::interrupt();
  });
  if (foundWarpOp)
    return;

  getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
    // We are only interested in UnrealizedConversionCastOps there were added
    // for resolving SIMT type mismatches.
    if (!op->getAttr(resolveSIMTTypeMismatch))
      return WalkResult::skip();

    Value input = op.getOperand(0);
    Value output = op.getResult(0);

    // Both input and output must have tensor descriptor types.
    xegpu::TensorDescType inputDescType =
        mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
    xegpu::TensorDescType outputDescType =
        mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
    assert(inputDescType && outputDescType &&
           "Unrealized conversion cast must have tensor descriptor types");

    // tensor_desc<shape, layout> -> tensor_desc<shape> Type of conversions.
    // This occurs inside scf.for body to resolve the block argument type to
    // SIMT type.
    if (inputDescType.getLayout()) {
      auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
      if (argument) {
        argument.setType(output.getType());
        output.replaceAllUsesWith(argument);
        if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
                argument.getOwner()->getParentOp())) {
          auto result = loopOp.getTiedLoopResult(argument);
          result.setType(output.getType());
        }
      }
    }

    // tensor_desc<shape> -> tensor_desc<shape, layout> Type of
    // conversions. This occurs at the yield op of scf.for body to go back
    // from SIMT type to original type.
    if (outputDescType.getLayout())
      output.replaceAllUsesWith(input);

    if (op->use_empty())
      op->erase();
    return WalkResult::advance();
  });
}