//===- XeGPUPropagateLayout.cpp - XeGPU Layout Propagation ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
#include "mlir/Analysis/DataFlow/Utils.h"
#include "mlir/Analysis/DataFlowFramework.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/Value.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Interfaces/ControlFlowInterfaces.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Support/LLVM.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"

namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUPROPAGATELAYOUT
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir

#define DEBUG_TYPE "xegpu-propagate-layout"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")

using namespace mlir;
using namespace mlir::dataflow;

namespace {

//===----------------------------------------------------------------------===//
// LayoutInfo
//===----------------------------------------------------------------------===//

/// Helper class for tracking the analysis state of an mlir value. For layout
/// propagation, the analysis state is simply the distribution layout of
/// each value. The distribution layout information is encapsulated using
/// xegpu::DistributeLayoutAttr class which can hold information about any type
/// of distribution layout that XeGPU dialect supports. Purpose of this analysis
/// to propagate some unique distribution layout for each value in the program
/// starting from a set of anchor operations (like DPAS, StoreNd, etc.). Note
/// that analysis will reach a fixed point when all values are reached some
/// layout and, analysis does not try to modify any already assigned layouts.
///
/// Given this, LayoutInfo  satisifies the following properties:
///  1) A LayoutInfo value can be in one of two states - `assigned` or `not
///  assigned`.
///  2) Two LayoutInfo values are equal if they are both assigned or
///  both not assigned. The concrete value of assigned state does not matter.
///  3) The meet operator works as follows:
///     - If current state is assigned, return the current state. (already
///     a unique layout is assigned. don't change it)
///     - Otherwise, return the other state.

struct LayoutInfo {
private:
  xegpu::DistributeLayoutAttr storage = nullptr;

public:
  LayoutInfo() = default;
  LayoutInfo(const xegpu::DistributeLayoutAttr &layout) : storage(layout) {}

  // Two lattice values are equal if they have `some` layout. The actual
  // content of the layout does not matter.
  bool operator==(const LayoutInfo &other) const {
    return this->isAssigned() == other.isAssigned();
  }

  static LayoutInfo meet(const LayoutInfo &lhs, const LayoutInfo &rhs);

  static LayoutInfo join(const LayoutInfo &lhs, const LayoutInfo &rhs);

  void print(raw_ostream &os) const;

  bool isAssigned() const { return storage != nullptr; }

  LayoutInfo transpose(ArrayRef<int64_t> permutation) const;

  SmallVector<int> getLaneLayout() const;

  SmallVector<int> getLaneData() const;

  SmallVector<int> getInstData() const;

  SmallVector<int> getSgLayout() const;

  SmallVector<int> getSgData() const;

  SmallVector<int> getOrder() const;

  bool isSliceLayout() const {
    if (!isAssigned())
      return false;
    return isa<xegpu::SliceAttr>(storage);
  }

  int64_t getRank() const {
    if (!isAssigned())
      return -1;
    return storage.getRank();
  }

  Attribute get() { return storage; }
  void set(const xegpu::DistributeLayoutAttr &layout) { storage = layout; }
};

SmallVector<int> LayoutInfo::getLaneLayout() const {
  if (!isAssigned())
    return {};
  return llvm::map_to_vector(storage.getEffectiveLaneLayoutAsInt(),
                             [](int64_t val) { return static_cast<int>(val); });
}

SmallVector<int> LayoutInfo::getLaneData() const {
  if (!isAssigned())
    return {};
  return llvm::map_to_vector(storage.getEffectiveLaneDataAsInt(),
                             [](int64_t val) { return static_cast<int>(val); });
}

SmallVector<int> LayoutInfo::getInstData() const {
  if (!isAssigned())
    return {};
  return llvm::map_to_vector(storage.getEffectiveInstDataAsInt(),
                             [](int64_t val) { return static_cast<int>(val); });
}

SmallVector<int> LayoutInfo::getSgLayout() const {
  if (!isAssigned())
    return {};
  return llvm::map_to_vector(storage.getEffectiveSgLayoutAsInt(),
                             [](int64_t val) { return static_cast<int>(val); });
}

SmallVector<int> LayoutInfo::getSgData() const {
  if (!isAssigned())
    return {};
  return llvm::map_to_vector(storage.getEffectiveSgDataAsInt(),
                             [](int64_t val) { return static_cast<int>(val); });
}

SmallVector<int> LayoutInfo::getOrder() const {
  if (!isAssigned() || !storage.getOrder())
    return {};
  return llvm::map_to_vector(storage.getOrder().asArrayRef(),
                             [](int64_t val) { return static_cast<int>(val); });
}

void LayoutInfo::print(raw_ostream &os) const {
  if (isAssigned()) {
    os << storage;
  } else {
    os << "Not assigned.";
  }
}

LayoutInfo LayoutInfo::meet(const LayoutInfo &lhs, const LayoutInfo &rhs) {
  if (!lhs.isAssigned())
    return rhs;
  return lhs;
}

/// Since this is a backward analysis, join method is not used.
LayoutInfo LayoutInfo::join(const LayoutInfo &lhs, const LayoutInfo &rhs) {
  llvm_unreachable("Join should not be triggered by layout propagation.");
}

/// Construct a new layout with the transposed inst_data or lane_layout,
/// lane_data.
LayoutInfo LayoutInfo::transpose(ArrayRef<int64_t> permutation) const {
  if (!isAssigned())
    return {};
  // Check if the permutation is valid.
  llvm::SmallSet<int64_t, 4> seen(permutation.begin(), permutation.end());
  bool hasDuplicates = seen.size() != permutation.size();
  bool withinRange = llvm::all_of(permutation, [&](int64_t idx) {
    return idx >= 0 && idx < static_cast<int64_t>(permutation.size());
  });

  if (!withinRange || hasDuplicates) {
    assert(false && "Invalid permutation for transpose.");
    return {};
  }

  SmallVector<int32_t> laneLayout;
  SmallVector<int32_t> laneData;
  SmallVector<int32_t> instData;
  SmallVector<int32_t> sgLayout;
  SmallVector<int32_t> sgData;
  SmallVector<int32_t> order;

  for (int64_t idx : permutation) {
    if (getLaneLayout().size()) {
      laneLayout.push_back(static_cast<int32_t>(getLaneLayout()[idx]));
      laneData.push_back(static_cast<int32_t>(getLaneData()[idx]));
    }
    if (getInstData().size())
      instData.push_back(static_cast<int32_t>(getInstData()[idx]));
    if (getSgData().size()) {
      sgLayout.push_back(static_cast<int32_t>(getSgLayout()[idx]));
      sgData.push_back(static_cast<int32_t>(getSgData()[idx]));
    }
    if (getOrder().size()) {
      order.push_back(static_cast<int32_t>(getOrder()[idx]));
    }
  }
  auto orderAttr = order.size()
                       ? DenseI32ArrayAttr::get(storage.getContext(), order)
                       : nullptr;
  xegpu::LayoutAttr layoutAttr;
  if (getLaneLayout().size())
    layoutAttr =
        xegpu::LayoutAttr::get(storage.getContext(), laneLayout, laneData);
  if (getInstData().size())
    layoutAttr = xegpu::LayoutAttr::get(storage.getContext(), instData);
  if (getSgData().size())
    layoutAttr = xegpu::LayoutAttr::get(
        storage.getContext(),
        DenseI32ArrayAttr::get(storage.getContext(), sgLayout),
        DenseI32ArrayAttr::get(storage.getContext(), sgData),
        /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
        /*lane_data =*/nullptr, orderAttr);
  return LayoutInfo(layoutAttr);
}

//===----------------------------------------------------------------------===//
// LayoutInfoLattice
//===----------------------------------------------------------------------===//

/// Lattice holding the LayoutInfo for each value.
struct LayoutInfoLattice : public Lattice<LayoutInfo> {
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LayoutInfoLattice)
  using Lattice::Lattice;
};

/// Helper Functions to get default layouts. A `default layout` is a layout that
/// is assigned to a value when the layout is not fixed by some anchor operation
/// (like DPAS).

/// Helper Function to get the default layout for uniform values like constants.
/// For 1D vector, lane_layout is [subgroupSize] and lane_data is [1].
/// For 2D vector, lane_layout is [1, subgroupSize] and lane_data is [1, 1].
static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
                                           unsigned rank,
                                           const xegpu::uArch::uArch *uArch) {
  assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
  if (rank == 1) {
    return LayoutInfo(
        xegpu::LayoutAttr::get(ctx, {uArch->getSubgroupSize()}, {1}));
  }
  return LayoutInfo(
      xegpu::LayoutAttr::get(ctx, {1, uArch->getSubgroupSize()}, {1, 1}));
}

static LayoutInfo getDefaultSIMTLayoutInfo(mlir::MLIRContext *ctx,
                                           unsigned rank, int subgroupSize) {
  assert((rank == 1 || rank == 2) && "Expected 1D or 2D vector.");
  if (rank == 1) {
    return LayoutInfo(xegpu::LayoutAttr::get(ctx, {subgroupSize}, {1}));
  }
  return LayoutInfo(xegpu::LayoutAttr::get(ctx, {1, subgroupSize}, {1, 1}));
}

/// Helper to get the default layout for 2D block operations.
template <typename Ty>
static LayoutInfo getSIMTLayoutInfoBlockIO(Ty ty,
                                           const xegpu::uArch::uArch *uArch,
                                           unsigned packingSize) {
  // Expecting a 1D or 2D vector.
  assert((ty.getRank() == 1 || ty.getRank() == 2) &&
         "Expected 1D or 2D vector.");
  // Expecting int or float element type.
  assert(ty.getElementType().isIntOrFloat() &&
         "Expected int or float element type.");
  // If the rank is 1, then return default layout for 1D vector.
  if (ty.getRank() == 1)
    return getDefaultSIMTLayoutInfo(ty.getContext(), 1, uArch);
  // Packing factor is determined by the element type bitwidth.
  unsigned bitwidth = ty.getElementType().getIntOrFloatBitWidth();
  int packingFactor = bitwidth < packingSize ? packingSize / bitwidth : 1;
  return LayoutInfo(xegpu::LayoutAttr::get(
      ty.getContext(), {1, uArch->getSubgroupSize()}, {1, packingFactor}));
}

//===----------------------------------------------------------------------===//
// LayoutInfoPropagation
//===----------------------------------------------------------------------===//

/// Backward data flow analysis to propagate the lane_layout and lane_data of
/// each value in the program. Currently, the layouts for operands DPAS,
/// StoreNd, and StoreScatter are fixed (known before propagation). Purpose of
/// this analysis is to propagate those known layouts to all their producers and
/// (other) consumers.
class LayoutInfoPropagation
    : public SparseBackwardDataFlowAnalysis<LayoutInfoLattice> {
private:
  xegpu::LayoutKind layoutKind;
  unsigned indexBitWidth;
  void visitDpasOp(xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
                   ArrayRef<const LayoutInfoLattice *> results);

  void visitStoreNdOp(xegpu::StoreNdOp store,
                      ArrayRef<LayoutInfoLattice *> operands,
                      ArrayRef<const LayoutInfoLattice *> results);

  void visitStoreScatterOp(xegpu::StoreScatterOp storeScatter,
                           ArrayRef<LayoutInfoLattice *> operands,
                           ArrayRef<const LayoutInfoLattice *> results);

  void visitLoadNdOp(xegpu::LoadNdOp load,
                     ArrayRef<LayoutInfoLattice *> operands,
                     ArrayRef<const LayoutInfoLattice *> results);

  void visitLoadGatherOp(xegpu::LoadGatherOp load,
                         ArrayRef<LayoutInfoLattice *> operands,
                         ArrayRef<const LayoutInfoLattice *> results);

  void visitTransposeOp(vector::TransposeOp transpose,
                        ArrayRef<LayoutInfoLattice *> operands,
                        ArrayRef<const LayoutInfoLattice *> results);

  void visitVectorBitcastOp(vector::BitCastOp bitcast,
                            ArrayRef<LayoutInfoLattice *> operands,
                            ArrayRef<const LayoutInfoLattice *> results);

  void visitCreateDescOp(xegpu::CreateDescOp createDesc,
                         ArrayRef<LayoutInfoLattice *> operands,
                         ArrayRef<const LayoutInfoLattice *> results);

  void visitUpdateNdOffsetOp(xegpu::UpdateNdOffsetOp updateNdOffset,
                             ArrayRef<LayoutInfoLattice *> operands,
                             ArrayRef<const LayoutInfoLattice *> results);

  void visitPrefetchNdOp(xegpu::PrefetchNdOp prefetch,
                         ArrayRef<LayoutInfoLattice *> operands,
                         ArrayRef<const LayoutInfoLattice *> results);

  void visitVectorMultiReductionOp(vector::MultiDimReductionOp reduction,
                                   ArrayRef<LayoutInfoLattice *> operands,
                                   ArrayRef<const LayoutInfoLattice *> results);

  void visitVectorReductionOp(vector::ReductionOp reduction,
                              ArrayRef<LayoutInfoLattice *> operands,
                              ArrayRef<const LayoutInfoLattice *> results);

  void visitVectorBroadCastOp(vector::BroadcastOp broadcast,
                              ArrayRef<LayoutInfoLattice *> operands,
                              ArrayRef<const LayoutInfoLattice *> results);
  void visitShapeCastOp(vector::ShapeCastOp shapeCast,
                        ArrayRef<LayoutInfoLattice *> operands,
                        ArrayRef<const LayoutInfoLattice *> results);
  void
  visitInsertStridedSliceOp(vector::InsertStridedSliceOp insertStridedSlice,
                            ArrayRef<LayoutInfoLattice *> operands,
                            ArrayRef<const LayoutInfoLattice *> results);

  void visitLoadMatrixOp(xegpu::LoadMatrixOp load,
                         ArrayRef<LayoutInfoLattice *> operands,
                         ArrayRef<const LayoutInfoLattice *> results);

  void visitStoreMatrixOp(xegpu::StoreMatrixOp store,
                          ArrayRef<LayoutInfoLattice *> operands,
                          ArrayRef<const LayoutInfoLattice *> results);

  void visitLoadGatherOp(xegpu::LoadMatrixOp load,
                         ArrayRef<LayoutInfoLattice *> operands,
                         ArrayRef<const LayoutInfoLattice *> results);

  void visitStoreScatterOp(xegpu::StoreMatrixOp store,
                           ArrayRef<LayoutInfoLattice *> operands,
                           ArrayRef<const LayoutInfoLattice *> results);

  bool hasParamsOfLayoutKind(xegpu::DistributeLayoutAttr anchorLayout);

public:
  LayoutInfoPropagation(DataFlowSolver &solver,
                        SymbolTableCollection &symbolTable,
                        xegpu::LayoutKind layoutKind, unsigned indexBitWidth)
      : SparseBackwardDataFlowAnalysis(solver, symbolTable),
        layoutKind(layoutKind), indexBitWidth(indexBitWidth) {}
  using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis;

  LogicalResult
  visitOperation(Operation *op, ArrayRef<LayoutInfoLattice *> operands,
                 ArrayRef<const LayoutInfoLattice *> results) override;

  void visitBranchOperand(OpOperand &operand) override {};

  void visitCallOperand(OpOperand &operand) override {};

  void
  visitNonControlFlowArguments(RegionSuccessor &successor,
                               ArrayRef<BlockArgument> arguments) override {};

  void visitExternalCall(CallOpInterface call,
                         ArrayRef<LayoutInfoLattice *> operands,
                         ArrayRef<const LayoutInfoLattice *> results) override {
  };

  void setToExitState(LayoutInfoLattice *lattice) override {
    (void)lattice->meet(LayoutInfo());
  }
};
} // namespace

LogicalResult LayoutInfoPropagation::visitOperation(
    Operation *op, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  TypeSwitch<Operation *>(op)
      .Case(
          [&](xegpu::DpasOp dpasOp) { visitDpasOp(dpasOp, operands, results); })
      .Case([&](xegpu::StoreNdOp storeNdOp) {
        visitStoreNdOp(storeNdOp, operands, results);
      })
      .Case([&](xegpu::StoreScatterOp storeScatterOp) {
        visitStoreScatterOp(storeScatterOp, operands, results);
      })
      .Case([&](xegpu::LoadNdOp loadNdOp) {
        visitLoadNdOp(loadNdOp, operands, results);
      })
      .Case([&](xegpu::LoadGatherOp loadGatherOp) {
        visitLoadGatherOp(loadGatherOp, operands, results);
      })
      .Case([&](xegpu::CreateDescOp createDescOp) {
        visitCreateDescOp(createDescOp, operands, results);
      })
      .Case([&](xegpu::UpdateNdOffsetOp updateNdOffsetOp) {
        visitUpdateNdOffsetOp(updateNdOffsetOp, operands, results);
      })
      .Case([&](xegpu::PrefetchNdOp prefetchNdOp) {
        visitPrefetchNdOp(prefetchNdOp, operands, results);
      })
      .Case([&](vector::TransposeOp transposeOp) {
        visitTransposeOp(transposeOp, operands, results);
      })
      .Case([&](vector::BitCastOp bitcastOp) {
        visitVectorBitcastOp(bitcastOp, operands, results);
      })
      .Case([&](vector::MultiDimReductionOp reductionOp) {
        visitVectorMultiReductionOp(reductionOp, operands, results);
      })
      .Case([&](vector::ReductionOp reductionOp) {
        visitVectorReductionOp(reductionOp, operands, results);
      })
      .Case([&](vector::BroadcastOp broadcastOp) {
        visitVectorBroadCastOp(broadcastOp, operands, results);
      })
      .Case([&](vector::ShapeCastOp shapeCastOp) {
        visitShapeCastOp(shapeCastOp, operands, results);
      })
      .Case([&](vector::InsertStridedSliceOp insertStridedSliceOp) {
        visitInsertStridedSliceOp(insertStridedSliceOp, operands, results);
      })
      .Case([&](xegpu::LoadMatrixOp loadMatrixOp) {
        visitLoadMatrixOp(loadMatrixOp, operands, results);
      })
      .Case([&](xegpu::StoreMatrixOp storeMatrixOp) {
        visitStoreMatrixOp(storeMatrixOp, operands, results);
      })
      // All other ops.
      .Default([&](Operation *op) {
        for (const LayoutInfoLattice *resultInfo : results) {
          if (!resultInfo->getValue().isAssigned())
            continue;
          for (auto [operandInfo, operand] :
               llvm::zip(operands, op->getOpOperands())) {
            // If the operand type is not a vector or tensor descriptor, skip
            // it.
            if (!isa<xegpu::TensorDescType, VectorType>(
                    operand.get().getType()))
              continue;
            // Propagate the result layout to the operand.
            meet(operandInfo, *resultInfo);
          }
        }
      });

  return success();
}

bool LayoutInfoPropagation::hasParamsOfLayoutKind(
    xegpu::DistributeLayoutAttr anchorLayout) {
  if (anchorLayout == nullptr) {
    return false;
  }
  if (layoutKind == xegpu::LayoutKind::InstData) {
    return !(anchorLayout.getEffectiveInstDataAsInt().empty());
  }
  if (layoutKind == xegpu::LayoutKind::Lane) {
    return !(anchorLayout.getEffectiveLaneLayoutAsInt().empty() ||
             anchorLayout.getEffectiveLaneDataAsInt().empty());
  }
  if (layoutKind == xegpu::LayoutKind::Subgroup) {
    return !(anchorLayout.getEffectiveSgLayoutAsInt().empty() ||
             anchorLayout.getEffectiveSgDataAsInt().empty());
  }
  return false;
}

// This function returns all layouts for the given sgCount, whose sgData:
// 1. Evenly divides the wgShape.
// 2. Is a multiple of instData.
// Example:
//   wgShape = [128, 64], instData = [8, 16], sgCount = 32
// Returns layouts:
//   [(8,4), (16,2)], which correspond to sgData [16,16] and [8,32].
SmallVector<std::pair<int, int>> getValidLayouts(ArrayRef<int64_t> wgShape,
                                                 ArrayRef<int> instData,
                                                 int64_t sgCount) {
  SmallVector<std::pair<int, int>> candidates;
  for (int sgLayout0 = 1; sgLayout0 <= sgCount; ++sgLayout0) {
    if (sgCount % sgLayout0)
      continue;
    int sgLayout1 = sgCount / sgLayout0;
    int sgData0 = wgShape[0] / sgLayout0;
    int sgData1 = wgShape[1] / sgLayout1;
    if ((wgShape[0] % sgLayout0 || wgShape[1] % sgLayout1) ||
        (sgData0 % instData[0] || sgData1 % instData[1]))
      continue;
    candidates.emplace_back(sgLayout0, sgLayout1);
  }
  // Sort primarily by how balanced they are
  // (i.e., minimize the absolute difference between the two dimensions), and
  // secondarily by the first dimension in ascending order.
  llvm::sort(candidates, [](const std::pair<int, int> &lhs,
                            const std::pair<int, int> &rhs) {
    int diffLhs = std::abs(lhs.first - lhs.second);
    int diffRhs = std::abs(rhs.first - rhs.second);
    if (diffLhs != diffRhs)
      return diffLhs < diffRhs;
    return lhs.first < rhs.first;
  });
  return candidates;
}

FailureOr<int64_t> getNumSg(Operation *op, const int sgSize) {
  // Oblivious to workitem layout, the total count matters.
  auto gpuFunc = op->getParentOfType<gpu::GPUFuncOp>();
  if (!gpuFunc)
    return failure();
  auto knownBlockSize = gpuFunc.getKnownBlockSize();
  if (!knownBlockSize.has_value())
    return failure();
  const int flatBlockSize = llvm::product_of(knownBlockSize.value());
  return flatBlockSize / sgSize;
}

void LayoutInfoPropagation::visitPrefetchNdOp(
    xegpu::PrefetchNdOp prefetch, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {

  LayoutInfo prefetchLayout;
  xegpu::DistributeLayoutAttr anchorLayout = prefetch.getLayoutAttr();
  if (hasParamsOfLayoutKind(anchorLayout)) {
    prefetchLayout = LayoutInfo(anchorLayout);
  } else {
    // Here we assign the default layout to the tensor descriptor operand of
    // prefetch.
    auto tdescTy = prefetch.getTensorDescType();

    const uArch *uArch = getUArch(getChipStr(prefetch).value_or(""));
    if (!uArch)
      return;
    const auto *uArchInstruction =
        dyn_cast<xegpu::uArch::Subgroup2DBlockPrefetchInstruction>(
            uArch->getInstruction(
                xegpu::uArch::InstructionKind::Subgroup2DBlockPrefetch));

    auto blockWHC =
        uArchInstruction->getBlockWidthHeightCount(tdescTy.getElementType());
    if (!blockWHC)
      prefetch.emitWarning("No known block params found for the element type.");
    auto [bWidth, bHeight, bCount] = blockWHC.value();
    SmallVector<int> instData;
    int instWidth = xegpu::getLargestDivisor(
        static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth);
    if (instWidth == -1)
      prefetch.emitWarning(
          "No suitable instruction multiple found for the given shape.");
    if (tdescTy.getRank() == 1)
      instData = {instWidth};
    else {
      int instHeight = xegpu::getLargestDivisor(
          static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight);
      if (instHeight == -1)
        prefetch.emitWarning(
            "No suitable instruction multiple found for the given shape.");
      instData = {instHeight, instWidth};
    }

    if (layoutKind == xegpu::LayoutKind::InstData)
      prefetchLayout =
          LayoutInfo(xegpu::LayoutAttr::get(tdescTy.getContext(), instData));
    else
      prefetchLayout = getSIMTLayoutInfoBlockIO(
          tdescTy, uArch, uArchInstruction->getPackedFormatBitSize());

    prefetch.setLayoutAttr(
        dyn_cast<xegpu::DistributeLayoutAttr>(prefetchLayout.get()));
  }
  // Propagate the layout to the source tensor descriptor.
  propagateIfChanged(operands[0], operands[0]->meet(prefetchLayout));
}

void LayoutInfoPropagation::visitVectorMultiReductionOp(
    vector::MultiDimReductionOp reduction,
    ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  Type resultTy = reduction.getDestType();
  // The layout of the result must be present.
  LayoutInfo resLayoutInfo = results[0]->getValue();

  xegpu::DistributeLayoutAttr consumerLayoutAttr;
  if (!resultTy.isIntOrFloat()) {
    if (!resLayoutInfo.isAssigned())
      return;
    consumerLayoutAttr =
        dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
  }

  VectorType sourceTy = reduction.getSourceVectorType();
  SmallVector<int64_t> reductionDims(reduction.getReductionDims());

  const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
  if (!uArch)
    return;
  int numSg = 0;
  if (layoutKind == xegpu::LayoutKind::Subgroup) {
    auto numSgOrErr = getNumSg(reduction, uArch->getSubgroupSize());
    if (succeeded(numSgOrErr))
      numSg = numSgOrErr.value();
  }

  // The result layout represents the layout requirements of the operation.
  // it is recorded to anchor layout or temporary layout.
  // it must be honored for current op and may conflict with the layout
  // propagated from consumer op, the conflict is resolved in later phase by
  // converting the required result layout to the consumer layout
  auto requiredResLayoutAttr = xegpu::setupMultiReductionResultLayout(
      layoutKind, sourceTy, consumerLayoutAttr, reductionDims, numSg, uArch);

  xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);

  // derive the source layout from the dominant layout and reduction dims
  auto srcLayoutAttr = xegpu::inferMultiReductionSourceLayout(
      requiredResLayoutAttr, reductionDims);

  propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
  // Accumulator should have the same layout as the result.
  propagateIfChanged(operands[1],
                     operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
}

void LayoutInfoPropagation::visitVectorReductionOp(
    vector::ReductionOp reduction, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {

  VectorType sourceTy = reduction.getSourceVectorType();
  const uArch *uArch = getUArch(xegpu::getChipStr(reduction).value_or(""));
  if (!uArch)
    return;

  auto requiredResLayoutAttr =
      xegpu::setupReductionResultLayout(layoutKind, sourceTy, uArch);
  xegpu::setTemporaryLayout(reduction->getResult(0), requiredResLayoutAttr);

  auto srcLayoutAttr = xegpu::inferReductionSourceLayout(requiredResLayoutAttr);
  propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
  if (reduction.getAcc())
    propagateIfChanged(operands[1],
                       operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
}

void LayoutInfoPropagation::visitVectorBroadCastOp(
    vector::BroadcastOp broadcast, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  // The layout of the result must be present.
  LayoutInfo resLayoutInfo = results[0]->getValue();
  if (!resLayoutInfo.isAssigned())
    return;

  // Only consider vector to vector broadcasts for now.
  VectorType resultTy = broadcast.getResultVectorType();
  VectorType sourceTy = dyn_cast<VectorType>(broadcast.getSourceType());
  // skip layout propagation for non-vector source operand.
  if (!sourceTy)
    return;

  auto srcShape = sourceTy.getShape();
  auto resShape = resultTy.getShape();

  auto resultLayoutAttr =
      dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());

  xegpu::DistributeLayoutAttr srcLayoutAttr =
      xegpu::inferBroadcastSourceLayout(resultLayoutAttr, resShape, srcShape);

  propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
}

void LayoutInfoPropagation::visitShapeCastOp(
    vector::ShapeCastOp shapeCast, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  // The layout of the result must be present.
  LayoutInfo resLayoutInfo = results[0]->getValue();
  if (!resLayoutInfo.isAssigned())
    return;
  ArrayRef<int64_t> resShape = shapeCast.getResultVectorType().getShape();
  ArrayRef<int64_t> srcShape = shapeCast.getSourceVectorType().getShape();
  auto resultLayoutAttr =
      dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());

  xegpu::DistributeLayoutAttr srcLayoutAttr =
      xegpu::inferShapeCastSourceLayout(resultLayoutAttr, resShape, srcShape);

  propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
}

/// Propagate the layout of the result tensor to the source tensor descriptor
/// in UpdateNdOffsetOp.
void LayoutInfoPropagation::visitUpdateNdOffsetOp(
    xegpu::UpdateNdOffsetOp updateNdOffset,
    ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  // The layout of the result must be present.
  LayoutInfo resultLayout = results[0]->getValue();
  if (!resultLayout.isAssigned())
    return;
  // Propagate the layout to the source operand.
  propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
}

/// Set the layouts for DPAS A, B, and C operands.
void LayoutInfoPropagation::visitDpasOp(
    xegpu::DpasOp dpas, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  LayoutInfo dpasALayout;
  LayoutInfo dpasBLayout;
  LayoutInfo dpasCDLayout;

  xegpu::DistributeLayoutAttr anchorLayoutCD = dpas.getLayoutCdAttr();
  if (hasParamsOfLayoutKind(anchorLayoutCD)) {
    xegpu::DistributeLayoutAttr anchorLayoutA = dpas.getLayoutAAttr();
    xegpu::DistributeLayoutAttr anchorLayoutB = dpas.getLayoutBAttr();
    assert(hasParamsOfLayoutKind(anchorLayoutA) &&
           "Expected anchor layout for DPAS A operand.");
    assert(hasParamsOfLayoutKind(anchorLayoutB) &&
           "Expected anchor layout for DPAS B operand.");
    dpasALayout = LayoutInfo(anchorLayoutA);
    dpasBLayout = LayoutInfo(anchorLayoutB);
    dpasCDLayout = LayoutInfo(anchorLayoutCD);
  } else {
    const uArch *uArch = getUArch(getChipStr(dpas).value_or(""));
    if (!uArch)
      return;
    VectorType aTy = dpas.getLhsType();
    VectorType bTy = dpas.getRhsType();
    VectorType cdTy = dpas.getResultType();

    xegpu::DistributeLayoutAttr consumerLayoutAttr = nullptr;
    xegpu::DistributeLayoutAttr requiredCDLayoutAttr, requiredALayout,
        requiredBLayout;

    int numSg = 0;
    if (layoutKind == xegpu::LayoutKind::Subgroup) {
      LayoutInfo consumerLayout = results[0]->getValue();
      if (!consumerLayout.isAssigned())
        return;
      consumerLayoutAttr =
          dyn_cast<xegpu::DistributeLayoutAttr>(consumerLayout.get());
      auto numSgOrErr = getNumSg(dpas, uArch->getSubgroupSize());
      if (failed(numSgOrErr)) {
        dpas.emitWarning(
            "Unable to determine the number of subgroups for the operation.");
        return;
      }
      numSg = numSgOrErr.value();
    }
    auto layouts = xegpu::setupDpasLayout(layoutKind, aTy, bTy, cdTy,
                                          consumerLayoutAttr, numSg, uArch);
    if (!layouts.has_value()) {
      dpas.emitWarning(
          "Failed to determine required layouts for DPAS operands.");
      return;
    }

    std::tie(requiredALayout, requiredBLayout, requiredCDLayoutAttr) = *layouts;

    dpas.setLayoutAAttr(requiredALayout);
    dpas.setLayoutBAttr(requiredBLayout);
    dpas.setLayoutCdAttr(requiredCDLayoutAttr);
    dpasALayout = LayoutInfo(requiredALayout);
    dpasBLayout = LayoutInfo(requiredBLayout);
    dpasCDLayout = LayoutInfo(requiredCDLayoutAttr);
  }
  propagateIfChanged(operands[0], operands[0]->meet(dpasALayout));
  propagateIfChanged(operands[1], operands[1]->meet(dpasBLayout));
  if (operands.size() > 2)
    propagateIfChanged(operands[2], operands[2]->meet(dpasCDLayout));
}

/// Set the layout for the value and tensor descriptor operands in StoreNdOp.
void LayoutInfoPropagation::visitStoreNdOp(
    xegpu::StoreNdOp store, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  LayoutInfo storeLayout;
  xegpu::DistributeLayoutAttr anchorLayout = store.getLayoutAttr();
  if (hasParamsOfLayoutKind(anchorLayout)) {
    storeLayout = LayoutInfo(anchorLayout);
  } else {
    const uArch *uArch = getUArch(getChipStr(store).value_or(""));
    if (!uArch)
      return;
    const auto *uArchInstruction =
        dyn_cast<xegpu::uArch::Subgroup2DBlockStoreInstruction>(
            uArch->getInstruction(
                xegpu::uArch::InstructionKind::Subgroup2DBlockStore));
    VectorType dataTy = store.getValueType();
    auto blockWHC = uArchInstruction->getBlockWidthHeightCount(
        store.getValueType().getElementType());
    if (!blockWHC)
      store.emitWarning("No known block params found for the element type.");
    auto [bWidth, bHeight, bCount] = blockWHC.value();
    SmallVector<int> instData;
    int instWidth = xegpu::getLargestDivisor(
        static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth);
    if (instWidth == -1)
      store.emitWarning(
          "No suitable instruction multiple found for the given shape.");
    if (dataTy.getRank() == 1)
      instData = {instWidth};
    else {
      int instHeight = xegpu::getLargestDivisor(
          static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight);
      if (instHeight == -1)
        store.emitWarning(
            "No suitable instruction multiple found for the given shape.");
      instData = {instHeight, instWidth};
    }

    if (layoutKind == xegpu::LayoutKind::InstData)
      storeLayout =
          LayoutInfo(xegpu::LayoutAttr::get(dataTy.getContext(), instData));
    else if (layoutKind == xegpu::LayoutKind::Lane)
      storeLayout =
          getSIMTLayoutInfoBlockIO(store.getValueType(), uArch,
                                   uArchInstruction->getPackedFormatBitSize());
    else { // xegpu::LayoutKind::Subgroup
      auto sgSize = uArch->getSubgroupSize();
      auto numSgOrErr = getNumSg(store, sgSize);
      if (failed(numSgOrErr)) {
        store.emitWarning(
            "Unable to determine the number of subgroups for the operation.");
        return;
      }
      auto sgLayouts = getValidLayouts(store.getValueType().getShape(),
                                       instData, numSgOrErr.value());
      if (sgLayouts.empty()) {
        store.emitWarning(
            "Unable to determine suitable subgroup layout for store value.");
        return;
      }
      SmallVector<int> sgLayout = {sgLayouts[0].first, sgLayouts[0].second};
      SmallVector<int> sgData = {
          static_cast<int>(dataTy.getShape()[0]) / sgLayout[0],
          static_cast<int>(dataTy.getShape()[1]) / sgLayout[1]};
      storeLayout = LayoutInfo(xegpu::LayoutAttr::get(
          dataTy.getContext(),
          DenseI32ArrayAttr::get(dataTy.getContext(), sgLayout),
          DenseI32ArrayAttr::get(dataTy.getContext(), sgData),
          /*inst_data =*/nullptr, /*lane_layout =*/nullptr,
          /*lane_data =*/nullptr, /*order =*/nullptr));
    }
    store.setLayoutAttr(
        dyn_cast<xegpu::DistributeLayoutAttr>(storeLayout.get()));
  }
  // Propagate the layout to the value operand.
  // Both operands should have the same layout
  for (LayoutInfoLattice *operand : operands)
    propagateIfChanged(operand, operand->meet(storeLayout));
}

/// Propagate the layout of the value to the tensor descriptor operand in
/// LoadNdOp.
void LayoutInfoPropagation::visitLoadNdOp(
    xegpu::LoadNdOp load, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  LayoutInfo loadLayout;
  xegpu::DistributeLayoutAttr anchorLayout = load.getLayoutAttr();
  if (hasParamsOfLayoutKind(anchorLayout)) {
    loadLayout = LayoutInfo(anchorLayout);
  } else {

    LayoutInfo valueLayout = results[0]->getValue();
    // Need the layout of the value to propagate to the tensor descriptor.
    if (!valueLayout.isAssigned())
      return;
    loadLayout = valueLayout;
    // LoadNdOp has the transpose effect. However, at the stage of this analysis
    // this effect is not expected and should be abstracted away. Emit a
    // warning.
    if (auto transpose = load.getTranspose()) {
      load.emitWarning("Transpose effect is not expected for LoadNdOp at "
                       "LayoutInfoPropagation stage.");
      loadLayout = valueLayout.transpose(transpose.value());
    }
    load.setLayoutAttr(dyn_cast<xegpu::DistributeLayoutAttr>(loadLayout.get()));
  }
  // Propagate the new layout to the tensor descriptor operand.
  propagateIfChanged(operands[0], operands[0]->meet(loadLayout));
}

/// For vector::TransposeOp, the layout of the result is transposed and
/// propagated to the operand.
void LayoutInfoPropagation::visitTransposeOp(
    vector::TransposeOp transpose, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  // Need the layout of transpose result to propagate to the operands.
  LayoutInfo resultLayout = results[0]->getValue();
  if (!resultLayout.isAssigned())
    return;
  auto consumerLayoutAttr =
      dyn_cast<xegpu::DistributeLayoutAttr>(resultLayout.get());
  auto srcLayoutAttr = xegpu::inferTransposeSourceLayout(
      consumerLayoutAttr, transpose.getPermutation());
  // Propagate the new layout to the vector operand.
  propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
}

/// For vector::BitCastOp, the lane_data of the source layout is changed based
/// on the bit width of the source and result types.
void LayoutInfoPropagation::visitVectorBitcastOp(
    vector::BitCastOp bitcast, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  // Need the layout of bitcast result to propagate to the operands.
  LayoutInfo resLayoutInfo = results[0]->getValue();
  if (!resLayoutInfo.isAssigned())
    return;

  auto srcVecType = bitcast.getSourceVectorType();
  auto resVecType = bitcast.getResultVectorType();

  auto consumerLayoutAttr =
      dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
  const uArch *uArch = getUArch(xegpu::getChipStr(bitcast).value_or(""));
  if (!uArch)
    return;
  auto requiredResLayoutAttr = setupBitCastResultLayout(
      layoutKind, srcVecType, resVecType, consumerLayoutAttr, uArch);

  xegpu::setTemporaryLayout(bitcast->getResult(0), requiredResLayoutAttr);

  int inElemTyBitWidth = srcVecType.getElementType().getIntOrFloatBitWidth();
  int outElemTyBitWidth = resVecType.getElementType().getIntOrFloatBitWidth();

  // derive the source layout from the dominant layout and reduction dims
  auto srcLayoutAttr = xegpu::inferBitCastSourceLayout(
      requiredResLayoutAttr, outElemTyBitWidth, inElemTyBitWidth);

  propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
}

void LayoutInfoPropagation::visitInsertStridedSliceOp(
    vector::InsertStridedSliceOp insertStridedSlice,
    ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  // The layout of the result must be present.
  LayoutInfo resLayoutInfo = results[0]->getValue();
  if (!resLayoutInfo.isAssigned())
    return;

  auto srcVecType = insertStridedSlice.getSourceVectorType();
  auto resVecType = insertStridedSlice.getDestVectorType();

  auto consumerLayoutAttr =
      dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());
  const uArch *uArch =
      getUArch(xegpu::getChipStr(insertStridedSlice).value_or(""));
  if (!uArch)
    return;

  auto requiredResLayoutAttr = xegpu::setupInsertStridedSliceResultLayout(
      layoutKind, srcVecType, resVecType, consumerLayoutAttr, uArch);
  xegpu::setTemporaryLayout(insertStridedSlice->getResult(0),
                            requiredResLayoutAttr);

  auto srcLayoutAttr = xegpu::inferInsertStridedSliceSourceLayout(
      requiredResLayoutAttr, resVecType.getShape(), srcVecType.getShape());
  propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(srcLayoutAttr)));
  propagateIfChanged(operands[1],
                     operands[1]->meet(LayoutInfo(requiredResLayoutAttr)));
}

/// Propagate the layout of the result to the tensor descriptor, mask and offset
/// operands in LoadGatherOp.
void LayoutInfoPropagation::visitLoadGatherOp(
    xegpu::LoadGatherOp load, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  xegpu::DistributeLayoutAttr requiredAnchorLayoutAttr;
  xegpu::DistributeLayoutAttr anchorLayoutAttr = load.getLayoutAttr();
  const uArch *uArch = getUArch(getChipStr(load).value_or(""));
  if (!uArch)
    return;
  auto subgroupSize = uArch->getSubgroupSize();
  VectorType resVecTy = load.getValueType();
  int chunkSize = load.getChunkSize().value_or(1);

  LayoutInfo resLayoutInfo = results[0]->getValue();
  if (!resLayoutInfo.isAssigned())
    return;
  auto consumerLayoutAttr =
      dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());

  if (hasParamsOfLayoutKind(anchorLayoutAttr)) {
    requiredAnchorLayoutAttr = anchorLayoutAttr;
  } else {
    if (!resVecTy) {
      load.emitWarning("Not propagating, non-vector payload supplied.");
      return;
    }
    requiredAnchorLayoutAttr = xegpu::setupLoadGatherAnchorLayout(
        layoutKind, resVecTy, chunkSize, consumerLayoutAttr, uArch);
    load.setLayoutAttr(requiredAnchorLayoutAttr);
  }

  auto maskLayoutAttr = requiredAnchorLayoutAttr;
  // Special handling mask layout for chunked ops: Enforce the default xegpu 1D
  // layout for mask.
  if (chunkSize > 1) {
    if (layoutKind == xegpu::LayoutKind::InstData)
      maskLayoutAttr =
          xegpu::LayoutAttr::get(load->getContext(), {subgroupSize});
    else if (layoutKind == xegpu::LayoutKind::Lane)
      maskLayoutAttr =
          xegpu::LayoutAttr::get(load->getContext(), {subgroupSize}, {1});
    else
      assert(false &&
             "chunked StoreScatterOp should not be used at workgroup level");
  }

  LayoutInfo maskLayoutInfo = LayoutInfo(maskLayoutAttr);
  auto loadLayoutInfo = LayoutInfo(requiredAnchorLayoutAttr);

  // Propagate the new layout to the tensor descriptor operand.
  if (isa<xegpu::TensorDescType>(load.getSourceType()))
    propagateIfChanged(operands[0], operands[0]->meet(loadLayoutInfo));
  // Propagate the new layout to the mask and optional offset operand.
  propagateIfChanged(operands[1], operands[1]->meet(maskLayoutInfo));
  if (load.getOffsets())
    propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo));
}

/// Propagate the layout of the descriptor to the vector offset operand in
/// CreateDescOp.
void LayoutInfoPropagation::visitCreateDescOp(
    xegpu::CreateDescOp createDesc, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  LayoutInfo descLayout = results[0]->getValue();
  // Need the layout of the descriptor to propagate to the operands.
  if (!descLayout.isAssigned())
    return;
  const uArch *uArch = getUArch(getChipStr(createDesc).value_or(""));
  if (!uArch)
    return;
  // For offset operand propagate 1D default layout.
  LayoutInfo layout = getDefaultSIMTLayoutInfo(createDesc->getContext(), 1,
                                               uArch->getSubgroupSize());
  propagateIfChanged(operands[1], operands[1]->meet(layout));
}

/// Set the layout for the value, tensor descriptor, offset and mask operands in
/// the StoreScatterOp.
void LayoutInfoPropagation::visitStoreScatterOp(
    xegpu::StoreScatterOp storeScatter, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {

  xegpu::DistributeLayoutAttr requiredAnchorLayoutAttr;
  xegpu::DistributeLayoutAttr anchorLayoutAttr = storeScatter.getLayoutAttr();
  const uArch *uArch = getUArch(getChipStr(storeScatter).value_or(""));
  if (!uArch)
    return;
  auto subgroupSize = uArch->getSubgroupSize();
  VectorType srcVecTy = storeScatter.getValueType();
  int chunkSize = storeScatter.getChunkSize().value_or(1);

  if (hasParamsOfLayoutKind(anchorLayoutAttr)) {
    requiredAnchorLayoutAttr = anchorLayoutAttr;
  } else {
    if (!srcVecTy) {
      storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
      return;
    }
    requiredAnchorLayoutAttr = xegpu::setupStoreScatterAnchorLayout(
        layoutKind, srcVecTy, chunkSize, uArch);
    storeScatter.setLayoutAttr(requiredAnchorLayoutAttr);
  }

  LayoutInfo srcLayoutInfo = LayoutInfo(requiredAnchorLayoutAttr);
  auto maskLayoutAttr = requiredAnchorLayoutAttr;
  // Special handling mask layout for chunked ops: Enforce the default xegpu 1D
  // layout for mask.
  if (chunkSize > 1) {
    if (layoutKind == xegpu::LayoutKind::InstData)
      maskLayoutAttr =
          xegpu::LayoutAttr::get(storeScatter->getContext(), {subgroupSize});
    else if (layoutKind == xegpu::LayoutKind::Lane)
      maskLayoutAttr = xegpu::LayoutAttr::get(storeScatter->getContext(),
                                              {subgroupSize}, {1});
    else
      assert(false &&
             "chunked StoreScatterOp should not be used at workgroup level");
  }

  LayoutInfo maskLayoutInfo = LayoutInfo(maskLayoutAttr);

  // Propagate the payload operand layout
  propagateIfChanged(operands[0], operands[0]->meet(srcLayoutInfo));
  // Propagate the destination (if tdesc) operand layout
  if (isa<xegpu::TensorDescType>(storeScatter.getDestType()))
    propagateIfChanged(operands[1], operands[1]->meet(srcLayoutInfo));
  // Propagate the new layout to the mask and optional offset operand.
  propagateIfChanged(operands[2], operands[2]->meet(maskLayoutInfo));
  if (storeScatter.getOffsets())
    propagateIfChanged(operands[3], operands[3]->meet(maskLayoutInfo));
}

void LayoutInfoPropagation::visitLoadMatrixOp(
    xegpu::LoadMatrixOp loadMatrixOp, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {

  LayoutInfo resLayoutInfo = results[0]->getValue();
  if (!resLayoutInfo.isAssigned())
    return;

  auto consumerLayoutAttr =
      dyn_cast<xegpu::DistributeLayoutAttr>(resLayoutInfo.get());

  xegpu::DistributeLayoutAttr anchorLayout = loadMatrixOp.getLayoutAttr();

  // only need to set anchor layout, no need to porpagate to memdesc and
  // offset
  if (!hasParamsOfLayoutKind(anchorLayout)) {
    VectorType resVecTy =
        llvm::cast<VectorType>(loadMatrixOp.getRes().getType());
    const uArch *uArch = getUArch(getChipStr(loadMatrixOp).value_or(""));
    if (!uArch)
      return;
    auto requiredAnchorLayoutAttr = xegpu::setupLoadMatrixAnchorLayout(
        layoutKind, resVecTy, consumerLayoutAttr, uArch);
    loadMatrixOp.setLayoutAttr(requiredAnchorLayoutAttr);
  }
}

// Store matrix is a flavor of scattered store for 2D shapes.
void LayoutInfoPropagation::visitStoreMatrixOp(
    xegpu::StoreMatrixOp storeMatrix, ArrayRef<LayoutInfoLattice *> operands,
    ArrayRef<const LayoutInfoLattice *> results) {
  xegpu::DistributeLayoutAttr anchorLayout = storeMatrix.getLayoutAttr();
  LayoutInfo layout;
  if (hasParamsOfLayoutKind(anchorLayout)) {
    layout = LayoutInfo(anchorLayout);
  } else {
    VectorType srcVecTy =
        llvm::cast<VectorType>(storeMatrix.getData().getType());
    const uArch *uArch = getUArch(getChipStr(storeMatrix).value_or(""));
    if (!uArch)
      return;
    auto requiredAnchorLayoutAttr =
        xegpu::setupStoreMatrixAnchorLayout(layoutKind, srcVecTy, uArch);
    storeMatrix.setLayoutAttr(requiredAnchorLayoutAttr);
    layout = LayoutInfo(requiredAnchorLayoutAttr);
  }

  propagateIfChanged(operands[0], operands[0]->meet(layout));
}

namespace {
//===----------------------------------------------------------------------===//
// RunLayoutInfoPropagation
//===----------------------------------------------------------------------===//

/// Driver class for running the LayoutInfoPropagation analysis.
class RunLayoutInfoPropagation {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(RunLayoutInfoPropagation)

  RunLayoutInfoPropagation(Operation *op, xegpu::LayoutKind layoutKind,
                           unsigned indexBitWidth)
      : target(op) {
    SymbolTableCollection symbolTable;
    loadBaselineAnalyses(solver);
    solver.load<LayoutInfoPropagation>(symbolTable, layoutKind, indexBitWidth);
    (void)solver.initializeAndRun(op);
  }

  LayoutInfo getLayoutInfo(Value val);

  void printAnalysisResult(llvm::raw_ostream &os);

private:
  DataFlowSolver solver;
  const Operation *target;
};
} // namespace

LayoutInfo RunLayoutInfoPropagation::getLayoutInfo(Value val) {
  auto *state = solver.lookupState<LayoutInfoLattice>(val);
  if (!state)
    return {};
  return state->getValue();
}

// Print the analysis result for debugging purposes.
void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
  auto printFunctionResult = [&](FunctionOpInterface funcOp) {
    os << "function: " << funcOp.getName() << ":\n";
    // Function arguments
    for (BlockArgument arg : funcOp.getArguments()) {
      LayoutInfo layout = getLayoutInfo(arg);
      os << "argument: " << arg << "\n";
      os << "layout  : ";
      layout.print(os);
      os << "\n";
    }
    // Function ops
    funcOp.walk([&](Operation *op) {
      // Skip ops that do not have results
      if (op->getResults().empty())
        return;
      os << "op    : ";
      // For control-flow ops, print the op name only.
      if (isa<BranchOpInterface>(op) || isa<RegionBranchOpInterface>(op))
        os << op->getName();
      else
        op->print(os);
      os << "\n";
      // Print the layout for each result.
      for (auto [i, r] : llvm::enumerate(op->getResults())) {
        LayoutInfo layout = getLayoutInfo(r);
        os << "layout for result #" << i << ": ";
        layout.print(os);
        os << "\n";
      }
    });
  };

  SmallVector<FunctionOpInterface> funcOps;
  if (auto modOp = dyn_cast<ModuleOp>(target)) {
    for (auto funcOp : modOp.getOps<FunctionOpInterface>())
      funcOps.push_back(funcOp);

    // Collect all GpuFuncOps in the module.
    for (auto gpuModOp : modOp.getOps<gpu::GPUModuleOp>()) {
      for (auto gpuFuncOp : gpuModOp.getOps<FunctionOpInterface>())
        funcOps.push_back(gpuFuncOp);
    }
  }
  // Print the analysis result for each function.
  for (FunctionOpInterface funcOp : funcOps)
    printFunctionResult(funcOp);
}

namespace {

//===----------------------------------------------------------------------===//
// ResolveLayoutConflicts
//===----------------------------------------------------------------------===//

/// Helper to get the defining CreateNdDescOp of a tensor descriptor value. This
/// function tries to find the defining CreateNdDescOp recursively accross
/// control-flow boundaries.
static xegpu::CreateNdDescOp getDefiningCreateNdDescOp(Value tdescValue) {
  // Try to get the defining CreateNdDescOp of the tensor descriptor.
  auto definingOp = tdescValue.getDefiningOp<xegpu::CreateNdDescOp>();
  if (definingOp)
    return definingOp;
  // If tdescValue is an argument, try to get the tied init value from the
  // parent loop-like op.
  if (auto arg = dyn_cast<BlockArgument>(tdescValue)) {
    auto *parentOp = arg.getOwner()->getParentOp();
    if (auto loop = dyn_cast<LoopLikeOpInterface>(parentOp)) {
      OpOperand *tiedInit = loop.getTiedLoopInit(arg);
      if (tiedInit)
        return getDefiningCreateNdDescOp(tiedInit->get());
    }
  }
  // If not found, return null.
  return nullptr;
}

struct ResolveLayoutConflicts {
  ResolveLayoutConflicts(Operation *parentOp)
      : parentOp(parentOp), builder(parentOp->getContext()) {}
  LogicalResult run();

private:
  Operation *parentOp;
  OpBuilder builder;
  LogicalResult resolveTensorDescConsumer(OpOperand &operand);
  LogicalResult resolveVectorConsumer(OpOperand &operand);
  LogicalResult assignResultLayout(OpResult &result);
};

} // namespace

LogicalResult ResolveLayoutConflicts::run() {
  // Scan all operations in the parent op and resolve layout conflicts at
  // tensor descriptor and vector use points.
  auto r = parentOp->walk([&](Operation *op) -> WalkResult {
    // if the operation inputs vector and output scalar, like multi-reduction we
    // need to check if the result has layout and add a convert_layout to serve
    // as anchor op for the reduction op's layout.
    if (isa<vector::MultiDimReductionOp>(op) || isa<vector::ReductionOp>(op)) {
      for (OpResult result : op->getResults()) {
        if (result.getType().isIntOrFloat()) {
          auto res = assignResultLayout(result);
          if (failed(res)) {
            DBGS() << "Failed to resolve vector consumer for multi-reduction "
                   << *op << "\n";
            return WalkResult::interrupt();
          }
        }
      }
    }
    for (OpOperand &operand : op->getOpOperands()) {
      // Handle conflicts in tensor descriptor operands.
      Type operandType = operand.get().getType();
      if (isa<xegpu::AnchorLayoutInterface>(op) &&
          isa<xegpu::TensorDescType>(operandType)) {
        auto res = resolveTensorDescConsumer(operand);
        if (failed(res)) {
          DBGS() << "Failed to resolve tensor descriptor consumer: " << *op
                 << "\n";
          return WalkResult::interrupt();
        }
      }
      // Handle conflicts in vector operands.
      if (isa<VectorType>(operandType)) {
        auto res = resolveVectorConsumer(operand);
        if (failed(res)) {
          DBGS() << "Failed to resolve vector consumer: " << *op << "\n";
          return WalkResult::interrupt();
        }
      }
    }
    return WalkResult::advance();
  });

  return r.wasInterrupted() ? failure() : success();
}

LogicalResult ResolveLayoutConflicts::assignResultLayout(OpResult &result) {
  Operation *producerOp = result.getDefiningOp();
  auto producerLayout = xegpu::getDistributeLayoutAttr(result);
  // Insert a convert_layout op to assign the layout.
  builder.setInsertionPointAfterValue(result);
  auto convertOp = xegpu::ConvertLayoutOp::create(
      builder, producerOp->getLoc(), result.getType(), result, producerLayout,
      producerLayout);
  result.replaceAllUsesExcept(convertOp.getResult(), convertOp);
  return success();
}

LogicalResult
ResolveLayoutConflicts::resolveVectorConsumer(OpOperand &operand) {
  Value vectorValue = operand.get();
  Operation *consumerOp = operand.getOwner();
  // Get the current layout of the vector value.
  auto producerLayout = xegpu::getDistributeLayoutAttr(vectorValue);
  if (!producerLayout) {
    if (auto vectorTy = dyn_cast<VectorType>(vectorValue.getType());
        vectorTy && vectorTy.getRank() > 1)
      consumerOp->emitWarning("Expected layout for non-1D vectors.");
    return success(); // uniform non-tensor-data vector does not require layout
  }
  // Get the consumer expected layout at this operand.
  auto consumerLayout = xegpu::getConsumerLayoutAt(operand);
  if (!consumerLayout)
    return consumerOp->emitError(
        "No consumer layout found for vector operand.");

  // If layouts are same, no conflict exists, return success.
  if (consumerLayout.isEqualTo(producerLayout))
    return success();

  // Insert a convert_layout op to resolve the conflict.
  builder.setInsertionPointAfterValue(vectorValue);
  auto convertOp = xegpu::ConvertLayoutOp::create(
      builder, consumerOp->getLoc(), vectorValue.getType(), vectorValue,
      producerLayout, consumerLayout);

  // Update the operand to use the converted value.
  operand.set(convertOp.getResult());
  return success();
}

LogicalResult
ResolveLayoutConflicts::resolveTensorDescConsumer(OpOperand &operand) {
  Operation *consumerOp = operand.getOwner();
  Value tdescValue = operand.get();
  auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(consumerOp);
  auto currTDescType = dyn_cast<xegpu::TensorDescType>(tdescValue.getType());
  assert(anchorOp && currTDescType &&
         "Expected anchor layout op and tensor descriptor consumer.");
  // TODO: Scattered tensor desc is not supported for now.
  if (currTDescType.isScattered()) {
    DBGS() << "Scattered tensor descriptor not supported: " << tdescValue
           << "\n";
    return failure();
  }
  Attribute currLayout = currTDescType.getLayout();
  Attribute expectedLayout = anchorOp.getAnchorLayout();
  // A conflict exists in tensor descriptor operand if tensor descriptor's
  // layout is different from the anchor layout expected by the consumer.
  if (expectedLayout && currLayout && expectedLayout != currLayout) {
    // Try to get the defining CreateNdDescOp of the tensor descriptor.
    auto conflictingCreateNdOp = getDefiningCreateNdDescOp(tdescValue);
    if (!conflictingCreateNdOp) {
      DBGS() << "Unable to find defining CreateNdDescOp for tensor descriptor: "
             << tdescValue << "\n";
      return failure();
    }
    // Duplicate the CreateNdDescOp with the expected layout.
    builder.setInsertionPointAfter(conflictingCreateNdOp);
    auto newTensorDescType = xegpu::TensorDescType::get(
        conflictingCreateNdOp.getContext(), currTDescType.getShape(),
        currTDescType.getElementType(), currTDescType.getEncoding(),
        expectedLayout);
    xegpu::CreateNdDescOp newOp = xegpu::CreateNdDescOp::create(
        builder, consumerOp->getLoc(), newTensorDescType,
        conflictingCreateNdOp->getOperands(),
        conflictingCreateNdOp->getAttrs());
    // Replace the tensor descriptor operand in the consumer op with the new
    // tensor descriptor.
    consumerOp->replaceUsesOfWith(tdescValue, newOp.getResult());
  }
  return success();
}

using GetLayoutFnTy = function_ref<xegpu::DistributeLayoutAttr(Value)>;
/// Update an operation with the layout of its results. If the result type is
/// a vector type, a temporary layout attribute is added to the operation. If
/// the result type is a tensor descriptor type, the type is updated with the
/// layout attribute. The users of the result are also updated with the layout
/// attribute.
static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
                              GetLayoutFnTy getLayoutOfValue) {
  // Region ops (like scf.for) are already handled by the
  // updateControlFlowOps.
  if (mlir::isa<mlir::RegionBranchOpInterface>(op))
    return success();

  // Iterate over all the results.
  for (OpResult result : op->getResults()) {
    Type resultType = result.getType();
    // Layouts are needed only for vector and tensor descriptor types.
    if (!isa<VectorType, xegpu::TensorDescType>(resultType))
      continue;
    // If the result has no layout but has users, emit a warning and continue.
    xegpu::DistributeLayoutAttr layout = getLayoutOfValue(result);
    if (!layout && result.getNumUses() > 0) {
      op->emitWarning("op has users but no layout assigned for its result");
      continue;
    }
    // If the result is a tensor descriptor type, update the tensor desc type
    // with layout.
    if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
      auto typeWithLayout = xegpu::TensorDescType::get(
          tensorDescTy.getContext(), tensorDescTy.getShape(),
          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
      result.setType(typeWithLayout);
      continue;
    }
    // If the result is a vector type, add a temporary layout attribute to the
    // op.
    xegpu::setDistributeLayoutAttr(result, layout);
  }
  return success();
}

/// Region ops like scf.for need special handling because they have blocks
/// inside. If the blocks have tensor descriptor type as block arguments,
/// thier types must be updated. Also region op can have results that may not
/// have any users (e.g. A and B tiles). They are not assigned a layout by
/// layout analysis because they have no users. However inside the region op
/// corresponding block arguments for these results do have layouts.
/// Therefore, in this case we still need to update the result types with the
/// layout attribute. This function function updates the internal block
/// arguments and the result types of the region op with the assigned layouts.
/// clang-format off
/// Example: scf.for ... iter_args(...) -> (out types) {
///   ^bb0(block types):
///     ...
///   scf.yield ... : (yield types)
/// }
/// clang-format on
/// In this example, at scf.yield, control-flow can transfer to two successor
/// regions. One is the ^bb0 (for loop body) and the other is the scf.for op
/// itself (yield the results). So we update both the block arguments of the
/// successor region (i.e. block types) and the result types of the scf.for op
/// (i.e. out types). Note that yield types are updated by respective
/// producers inside bb0.
static LogicalResult
updateControlFlowOps(mlir::OpBuilder &builder,
                     mlir::RegionBranchTerminatorOpInterface terminator,
                     GetLayoutFnTy getLayoutOfValue) {
  // Only process if the terminator is inside a region branch op.
  auto branchOp = dyn_cast<RegionBranchOpInterface>(terminator->getParentOp());
  if (!branchOp)
    return success();

  RegionBranchSuccessorMapping mapping;
  branchOp.getSuccessorOperandInputMapping(mapping,
                                           RegionBranchPoint(terminator));
  for (const auto &[successorOperand, successorInputs] : mapping) {
    for (Value successorInput : successorInputs) {
      Type inputType = successorInput.getType();
      // We only need to operate on tensor descriptor or vector types.
      if (!isa<xegpu::TensorDescType, VectorType>(inputType))
        continue;
      xegpu::DistributeLayoutAttr successorInputLayout =
          getLayoutOfValue(successorInput);
      xegpu::DistributeLayoutAttr successorOperandLayout =
          getLayoutOfValue(successorOperand->get());

      // If either of the layouts is not assigned, we cannot proceed.
      if (!successorOperandLayout) {
        LLVM_DEBUG(DBGS() << "No layout assigned for forwarded operand in "
                             "branch terminator: "
                          << successorOperand->get() << "\n");
        return failure();
      }
      // We expect the layouts to match.
      if (successorInputLayout &&
          successorInputLayout != successorOperandLayout) {
        LLVM_DEBUG(DBGS() << "Conflicting layouts for region argument and "
                             "operand forwarded as the argument: "
                          << successorInputLayout << " vs "
                          << successorOperandLayout << "\n");
        return failure();
      }
      // Get tensor descriptor type with the layout.
      if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(inputType)) {
        auto newTdescTy = xegpu::TensorDescType::get(
            tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(),
            tdescTy.getEncoding(), successorOperandLayout);
        successorInput.setType(newTdescTy);
        continue;
      }
      // If the type is a vector type and this region argument is an OpResult,
      // set the layout attribute on the OpResult.
      if (auto result = dyn_cast<OpResult>(successorInput))
        xegpu::setDistributeLayoutAttr(result, successorOperandLayout);
    }
  }
  return success();
}

/// Update the function arguments and results with the layouts.
static LogicalResult updateFunctionOpInterface(mlir::OpBuilder &builder,
                                               mlir::FunctionOpInterface funcOp,
                                               GetLayoutFnTy getLayoutOfValue) {
  // Only process functions whose type is a standard MLIR FunctionType.
  // Functions using a different type representation (e.g. llvm.func with
  // LLVMFunctionType) are not targets for XeGPU layout propagation, and
  // calling setType(FunctionType{}) on them would corrupt their type.
  if (!isa<FunctionType>(funcOp.getFunctionType()))
    return success();
  SmallVector<Type> newArgTypes;
  // Update the function arguments.
  for (BlockArgument arg : funcOp.getArguments()) {
    Type argType = arg.getType();
    newArgTypes.push_back(argType);
    if (!isa<VectorType, xegpu::TensorDescType>(argType))
      continue;
    xegpu::DistributeLayoutAttr layout = getLayoutOfValue(arg);
    if (!layout) {
      LLVM_DEBUG(DBGS() << "Expecting layout for function argument: " << arg
                        << " but got none.\n");
      return failure();
    }
    if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(argType)) {
      auto newTdescTy = xegpu::TensorDescType::get(
          tensorDescTy.getContext(), tensorDescTy.getShape(),
          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
      arg.setType(newTdescTy);
      newArgTypes.back() = newTdescTy;
    }
  }
  // Update the function type with the new argument types.
  // NOTE: We assume that function results are not expected to have layouts.
  funcOp.setType(FunctionType::get(funcOp.getContext(), newArgTypes,
                                   funcOp.getResultTypes()));
  return success();
}

namespace {
struct XeGPUPropagateLayoutPass final
    : public xegpu::impl::XeGPUPropagateLayoutBase<XeGPUPropagateLayoutPass> {
  XeGPUPropagateLayoutPass() = default;
  XeGPUPropagateLayoutPass(const XeGPUPropagateLayoutPass &other) = default;
  XeGPUPropagateLayoutPass(xegpu::XeGPUPropagateLayoutOptions options)
      : XeGPUPropagateLayoutBase(std::move(options)) {}
  void runOnOperation() override;
};

} // namespace

LogicalResult xegpu::propagateLayouts(OpBuilder &builder, Operation *target,
                                      LayoutKind layoutKind,
                                      unsigned indexBitWidth, bool printOnly) {
  RunLayoutInfoPropagation analysis(target, layoutKind, indexBitWidth);
  // Print the analysis result and exit. (for debugging purposes)
  if (printOnly) {
    auto &os = llvm::outs();
    analysis.printAnalysisResult(os);
    return success();
  }
  // Helper to convert LayoutInfo to xegpu::LayoutAttr.
  auto getXeGPULayoutForValue = [&](Value val) -> xegpu::DistributeLayoutAttr {
    LayoutInfo layout = analysis.getLayoutInfo(val);
    if (!layout.isAssigned())
      return {};
    if (auto opResult = dyn_cast<OpResult>(val)) {

      Operation *defOp = opResult.getDefiningOp();
      if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
        auto anchorLayout = anchorOp.getAnchorLayout();
        if (anchorLayout != nullptr)
          return anchorLayout;
      }
      xegpu::DistributeLayoutAttr requiredResLayoutAttr =
          xegpu::getTemporaryLayout(opResult);
      if (requiredResLayoutAttr != nullptr)
        return requiredResLayoutAttr;
    }
    xegpu::DistributeLayoutAttr layoutAttr =
        cast<xegpu::DistributeLayoutAttr>(layout.get());
    if (layout.isSliceLayout())
      return cast<xegpu::SliceAttr>(layoutAttr);

    return cast<xegpu::LayoutAttr>(layoutAttr);
  };

  Operation *op = target;
  auto walkResult = op->walk([&](mlir::Block *block) -> WalkResult {
    for (mlir::Operation &op : llvm::reverse(block->getOperations())) {
      LogicalResult r = success();
      TypeSwitch<Operation *>(&op)
          .Case([&](mlir::RegionBranchTerminatorOpInterface branchTermOp) {
            r = updateControlFlowOps(builder, branchTermOp,
                                     getXeGPULayoutForValue);
          })
          .Case([&](mlir::FunctionOpInterface funcOp) {
            r = updateFunctionOpInterface(builder, funcOp,
                                          getXeGPULayoutForValue);
          })
          .Default([&](Operation *op) {
            r = updateOp(builder, op, getXeGPULayoutForValue);
          });
      if (failed(r)) {
        op.emitError("Failed to update operation with the layout.");
        return WalkResult::interrupt();
      }
    }
    return WalkResult::advance();
  });
  if (walkResult.wasInterrupted())
    return failure();

  return success();
}

LogicalResult xegpu::resolveLayoutConflicts(Operation *target) {
  ResolveLayoutConflicts resolver(target);
  return resolver.run();
}

void XeGPUPropagateLayoutPass::runOnOperation() {
  xegpu::LayoutKind layoutKind;
  if (this->layoutKind == "lane") {
    layoutKind = xegpu::LayoutKind::Lane;
  } else if (this->layoutKind == "inst") {
    layoutKind = xegpu::LayoutKind::InstData;
  } else if (this->layoutKind == "subgroup") {
    layoutKind = xegpu::LayoutKind::Subgroup;
  } else {
    getOperation()->emitError("Unsupported layout kind option: " +
                              this->layoutKind);
    signalPassFailure();
    return;
  }
  OpBuilder builder(&getContext());
  if (failed(xegpu::propagateLayouts(builder, getOperation(), layoutKind,
                                     this->indexBitWidth, this->printOnly))) {
    signalPassFailure();
    return;
  }
  // Resolve layout conflicts if any.
  if (failed(xegpu::resolveLayoutConflicts(getOperation()))) {
    signalPassFailure();
    return;
  }
}