llvm-project/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp

//===---- XeGPUUtils.cpp - MLIR Utilities for XeGPUOps   ------------------===//
//
// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements utility methods for working with the XeGPU dialect.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/Dialect/SCF/Transforms/Patterns.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Operation.h"
#include "mlir/IR/ValueRange.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <numeric>

using namespace mlir;

/// convert ArrayRef<ValueRange> into SmallVector<Value>
SmallVector<Value> xegpu::flattenValues(ArrayRef<ValueRange> values) {
  SmallVector<Value> result;
  for (const auto &vals : values)
    llvm::append_range(result, vals);
  return result;
}

FailureOr<VectorType>
mlir::xegpu::getDistributedVectorType(xegpu::TensorDescType tdescTy) {
  auto layout = llvm::dyn_cast_if_present<LayoutAttr>(tdescTy.getLayout());
  // It only works for subgroup level layout, which only has lane_layout
  // and lane_data, and is to distribute a SIMD code into SIMT code.
  if (!layout || !layout.isForSubgroup())
    return failure();

  SmallVector<int64_t> laneData(layout.getLaneData().asArrayRef());
  SmallVector<int64_t> laneLayout(layout.getLaneLayout().asArrayRef());
  auto tdescShape = tdescTy.getShape();
  auto elementType = tdescTy.getElementType();

  // compute sgSize by multiply elements of laneLayout
  // e.g. for 2D layout, sgSize = laneLayout[0] * laneLayout[1]
  // e.g. for 1D layout, sgSize = laneLayout[0]
  int64_t sgSize = llvm::product_of(laneLayout);

  // Case 1: regular loads/stores
  auto scatterAttr = tdescTy.getEncodingOfType<ScatterTensorDescAttr>();
  if (scatterAttr) {
    auto chunkSize = scatterAttr.getChunkSize().getInt();
    // Verify if the first dimension of the tensor descriptor shape is
    // distributable.
    assert(tdescShape[0] == laneLayout[0] &&
           "tensor descriptor shape is not distributable");
    return VectorType::get({chunkSize}, elementType);
  }

  // Case 2: block loads/stores
  // Check if the tensor descriptor shape is distributable.
  int64_t tensorSize = 1;
  for (auto [tdescDim, laneDim, laneDataDim] :
       llvm::zip_equal(tdescShape, laneLayout, laneData)) {
    assert((tdescDim % (laneDim * laneDataDim) == 0) &&
           "tensor descriptor shape is not distributable");
    tensorSize *= tdescDim;
  }
  // tensorSize must be adjusted for array_length.
  tensorSize *= tdescTy.getArrayLength();

  return VectorType::get({tensorSize / sgSize}, elementType);
}

FailureOr<VectorType>
mlir::xegpu::getDistributedVectorType(VectorType originalType,
                                      xegpu::LayoutAttr layout) {
  int64_t rank = originalType.getRank();
  // Distributed vector type is only supported for 1D, 2D and 3D vectors.
  if (rank < 1 || rank > 3)
    return failure();
  ArrayRef<int64_t> shape = originalType.getShape();
  // arrayLength is 1 for 1D and 2D vectors, and equal to the first dimension
  // of the 3D vector.
  int arrayLength = 1;
  if (rank == 3) {
    arrayLength = shape[0];
    shape = shape.drop_front();
  }
  auto helperTdescTy = xegpu::TensorDescType::get(
      shape, originalType.getElementType(), arrayLength,
      /*boundary_check=*/true,
      /*memory_space=*/xegpu::MemorySpace::Global, layout);
  return xegpu::getDistributedVectorType(helperTdescTy);
}

FailureOr<VectorType>
xegpu::getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
                                       VectorType originalType) {
  if (!layout)
    return failure();
  assert((isa<xegpu::LayoutAttr>(layout) || isa<xegpu::SliceAttr>(layout)) &&
         "Expecting a valid layout.");
  SmallVector<int64_t> effectiveLaneLayout =
      layout.getEffectiveLaneLayoutAsInt();
  assert(static_cast<size_t>(originalType.getRank()) >=
             effectiveLaneLayout.size() &&
         "Rank of the original vector type should be greater or equal to the "
         "size of the lane layout to distribute the vector type.");
  // TODO: replace the implementation with
  //   auto distributedShape = layout.computeDistributedShape(
  //       SmallVector<int64_t>(originalType.getShape()));
  SmallVector<int64_t> distributedShape(originalType.getShape());
  // Only distribute the last `laneLayout.size()` dimensions. The remaining
  // dimensions are not distributed.
  unsigned distributionStart =
      originalType.getRank() - effectiveLaneLayout.size();
  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
    if (i < distributionStart)
      continue;
    // Check if the dimension can be distributed evenly.
    if (dim % effectiveLaneLayout[i - distributionStart] != 0)
      return failure();
    distributedShape[i] = dim / effectiveLaneLayout[i - distributionStart];
  }
  return VectorType::get(distributedShape, originalType.getElementType());
}

std::string xegpu::getTemporaryLayoutName(const OpOperand &operand) {
  const StringRef prefix("layout_operand_");
  unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();
  return llvm::formatv("{0}{1}", prefix, idx).str();
}

std::string xegpu::getTemporaryLayoutName(const OpResult result) {
  const StringRef prefix = "layout_result_";
  return llvm::formatv("{0}{1}", prefix, result.getResultNumber()).str();
}

xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
  if (!value)
    return nullptr;

  if (auto tdescTy =
          dyn_cast_if_present<xegpu::TensorDescType>(value.getType()))
    return tdescTy.getLayoutAttr();

  if (auto result = dyn_cast<OpResult>(value)) {
    Operation *defOp = result.getDefiningOp();
    assert(defOp && "result must have a defining op");

    if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(defOp)) {
      auto layout = anchorOp.getAnchorLayout();
      return layout;
    }

    std::string layoutName = getTemporaryLayoutName(result);
    if (defOp->hasAttr(layoutName)) {
      auto layout =
          defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
      return layout;
    }
  }

  if (auto arg = dyn_cast<BlockArgument>(value)) {
    auto *parentOp = arg.getOwner()->getParentOp();
    if (auto loop = dyn_cast_if_present<LoopLikeOpInterface>(parentOp)) {
      OpOperand *tiedInit = loop.getTiedLoopInit(arg);
      if (tiedInit)
        return getDistributeLayoutAttr(tiedInit->get());
    }
  }

  return nullptr;
}
xegpu::DistributeLayoutAttr
xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
  Operation *op = opr.getOwner();
  unsigned idx = const_cast<OpOperand &>(opr).getOperandNumber();

  if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(op)) {
    if (auto dpasOp = dyn_cast<xegpu::DpasOp>(op)) {
      if (idx == 0) {
        return dpasOp.getLayoutAAttr();
      } else if (idx == 1) {
        return dpasOp.getLayoutBAttr();
      } else if (idx == 2) {
        return dpasOp.getLayoutCdAttr();
      }
    }
    if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(op)) {
      return convertOp.getInputLayoutAttr();
    }
    auto layout = anchorOp.getAnchorLayout();

    if (idx == 0)
      return layout;

    // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
    // the layout is valid for the first two operands: value and memref/tdesc.
    // For other operations, the layout applies to the first operand only.
    if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
            op) &&
        (idx < 2))
      return layout;
  }

  std::string layoutName = xegpu::getTemporaryLayoutName(opr);
  if (op->hasAttr(layoutName)) {
    auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
    return layout;
  }

  return nullptr;
}

// Returns the permanent layout attribute for the given result if it's
// available on the defining op. Otherwise returns the provided layout.
xegpu::DistributeLayoutAttr
maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
                         const OpResult &result, mlir::Operation *owner,
                         const std::string &name) {
  xegpu::DistributeLayoutAttr candidate = layout;

  if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
    if (auto perm = loadOp.getLayoutAttr())
      candidate = perm;
  }

  return candidate;
}

// Returns the permanent layout attribute for the given operand if it's
// available on the defining op. Otherwise returns the provided layout.
xegpu::DistributeLayoutAttr
maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
                         const OpOperand &operand, mlir::Operation *owner,
                         const std::string &name) {
  xegpu::DistributeLayoutAttr candidate = layout;
  unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();

  if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) {
    if (idx == 0) {
      if (auto perm = storeOp.getLayoutAttr())
        candidate = perm;
    }
  }

  return candidate;
}

// TODO-LayoutRefactor: Remove this function after replacing use
//  with setTemporaryLayout or setAnchorLayout
void xegpu::setDistributeLayoutAttr(
    const mlir::OpResult &result,
    const mlir::xegpu::DistributeLayoutAttr layout) {
  Operation *owner = result.getOwner();

  if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
    if (anchorOp.getAnchorLayout() == layout)
      return;
    anchorOp.setAnchorLayout(layout);
    return;
  }

  std::string name = xegpu::getTemporaryLayoutName(result);
  if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
    return;
  }
  if (layout) {
    owner->setAttr(name, layout);
  }
}

// TODO-LayoutRefactor: Remove this function after replacing use
//  with setTemporaryLayout or setAnchorLayout
void xegpu::setDistributeLayoutAttr(const OpOperand &operand,
                                    const DistributeLayoutAttr layout) {
  Operation *owner = operand.getOwner();
  unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber();

  if (!layout) {
    return;
  }
  if (auto anchorOp = dyn_cast<xegpu::AnchorLayoutInterface>(owner)) {
    if (auto dpasOp = dyn_cast<xegpu::DpasOp>(owner)) {
      if (idx == 0) {
        return dpasOp.setLayoutAAttr(layout);
      } else if (idx == 1) {
        return dpasOp.setLayoutBAttr(layout);
      } else if (idx == 2) {
        return dpasOp.setLayoutCdAttr(layout);
      }
    }
    if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(owner)) {
      return convertOp.setInputLayoutAttr(layout);
    }

    // For store operations (StoreScatterOp, StoreNdOp, StoreMatrixOp),
    // the layout is valid for the first two operands: value and memref/tdesc.
    // For other operations, the layout applies to the first operand only.
    if (isa<xegpu::StoreScatterOp, xegpu::StoreNdOp, xegpu::StoreMatrixOp>(
            owner)) {
      if (idx < 2) {
        anchorOp.setAnchorLayout(layout);
      }
    } else {
      if (idx == 0) {
        anchorOp.setAnchorLayout(layout);
      }
    }
  }

  std::string name = xegpu::getTemporaryLayoutName(operand);
  if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) {
    return;
  }
  if (layout) {
    owner->setAttr(name, layout);
  }
}

template <typename T, typename>
xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout(const T &operandOrResult) {
  Operation *op = operandOrResult.getOwner();

  std::string layoutName = xegpu::getTemporaryLayoutName(operandOrResult);
  if (op->hasAttr(layoutName)) {
    auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
    return layout;
  }

  return nullptr;
}

template xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout<mlir::OpResult>(const OpResult &result);
template xegpu::DistributeLayoutAttr
xegpu::getTemporaryLayout<mlir::OpOperand>(const OpOperand &operand);

template <typename T, typename>
void xegpu::setTemporaryLayout(const T &operandOrResult,
                               const xegpu::DistributeLayoutAttr layout) {
  Operation *owner = operandOrResult.getOwner();
  std::string name = xegpu::getTemporaryLayoutName(operandOrResult);
  if (owner->hasAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
    return;
  }
  if (layout) {
    owner->setAttr(name, layout);
  }
}

template void xegpu::setTemporaryLayout<mlir::OpResult>(
    const mlir::OpResult &result,
    const mlir::xegpu::DistributeLayoutAttr layout);

template void xegpu::setTemporaryLayout<mlir::OpOperand>(
    const mlir::OpOperand &operand,
    const mlir::xegpu::DistributeLayoutAttr layout);

SmallVector<Value>
xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc,
                                        Value value, ArrayRef<int64_t> shape) {
  auto vecTy = dyn_cast<VectorType>(value.getType());
  if (!vecTy)
    return {value};

  ArrayRef<int64_t> srcShape = vecTy.getShape();
  if (!computeShapeRatio(srcShape, shape))
    return {value};

  int64_t srcShapeRank = srcShape.size();
  int64_t targetShapeRank = shape.size();

  SmallVector<int64_t> adjustedTargetShape(srcShape.size());
  int64_t rankDiff = srcShapeRank - targetShapeRank;
  std::fill(adjustedTargetShape.begin(), adjustedTargetShape.begin() + rankDiff,
            1);
  llvm::copy(shape, adjustedTargetShape.begin() + rankDiff);

  SmallVector<Value> result;
  for (SmallVector<int64_t> offsets :
       StaticTileOffsetRange(srcShape, adjustedTargetShape)) {
    SmallVector<int64_t> staticStrides(offsets.size(), 1);
    Value slice = vector::ExtractStridedSliceOp::create(
        builder, loc, value, offsets, adjustedTargetShape, staticStrides);

    // Reshape to remove leading unit dims if needed
    if (srcShapeRank > targetShapeRank) {
      auto targetTy = VectorType::get(shape, vecTy.getElementType());
      slice = vector::ShapeCastOp::create(builder, loc, targetTy, slice);
    }
    result.push_back(slice);
  }

  return result;
}

Value xegpu::createVectorWithShapeFromValues(OpBuilder &builder, Location loc,
                                             ValueRange values,
                                             ArrayRef<int64_t> shape) {
  VectorType inputTy = dyn_cast<VectorType>(values[0].getType());
  assert(llvm::all_of(values.getTypes(),
                      [&](Type type) { return type == inputTy; }) &&
         "values must be of the same VectorType");

  Type elemTy = inputTy.getElementType();
  ArrayRef<int64_t> tileShape = inputTy.getShape();

  VectorType resultTy = VectorType::get(shape, elemTy);
  auto zeroAttr = builder.getZeroAttr(elemTy);
  Value result = arith::ConstantOp::create(
      builder, loc, resultTy, DenseElementsAttr::get(resultTy, zeroAttr));

  for (auto [src, offsets] :
       llvm::zip_equal(values, StaticTileOffsetRange(shape, tileShape))) {
    SmallVector<int64_t> staticStrides(tileShape.size(), 1);
    result = vector::InsertStridedSliceOp::create(builder, loc, src, result,
                                                  offsets, staticStrides);
  }
  return result;
}

void xegpu::doSCFStructuralTypeConversionWithTensorType(
    Operation *op, TypeConverter converter) {
  MLIRContext *context = op->getContext();

  auto materializeCast = [](OpBuilder &builder, Type type, ValueRange inputs,
                            Location loc) -> Value {
    return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
        .getResult(0);
  };

  { // convert VectorType to RankedTensorType for SCF Structural ops
    TypeConverter converter;
    converter.addConversion([](Type type) -> Type { return type; });
    converter.addConversion([](VectorType type) -> Type {
      return RankedTensorType::get(type.getShape(), type.getElementType());
    });
    converter.addSourceMaterialization(materializeCast);
    converter.addTargetMaterialization(materializeCast);

    mlir::ConversionTarget target(*context);
    target.addLegalOp<UnrealizedConversionCastOp>();

    mlir::RewritePatternSet patterns(context);
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    (void)mlir::applyPartialConversion(op, target, std::move(patterns));
  }

  { // propagate the layout attribute to RankedTensorType by checking
    // BuiltInUnrealizedCastOps
    // for VectorType to RankedTensorType cast.
    op->walk([](UnrealizedConversionCastOp castOp) {
      if (castOp.getNumOperands() != 1 || castOp.getNumResults() != 1)
        return WalkResult::skip();

      Value input = castOp.getInputs()[0];
      Value result = castOp.getResults()[0];
      auto inputTy = dyn_cast<VectorType>(input.getType());
      auto resultTy = dyn_cast<RankedTensorType>(result.getType());

      // Only look at ops casting from VectorType to RankedTensorType
      if (!inputTy || !resultTy)
        return WalkResult::skip();

      xegpu::DistributeLayoutAttr layout =
          xegpu::getDistributeLayoutAttr(input);
      if (!layout)
        return WalkResult::skip();

      RankedTensorType newTy = resultTy.cloneWithEncoding(layout);
      result.setType(newTy);

      // update the arguments if user is a LoopLike op.
      for (OpOperand &use : result.getUses()) {
        if (auto loop = dyn_cast<LoopLikeOpInterface>(use.getOwner())) {
          BlockArgument arg = loop.getTiedLoopRegionIterArg(&use);
          arg.setType(newTy);
        }
        // whileOp has two regions, the BlockArgument of the after region
        // is not exposed by LoopLikeOpInterface
        if (auto whileOp = dyn_cast<scf::WhileOp>(use.getOwner())) {
          unsigned idx = use.getOperandNumber();
          BlockArgument arg = whileOp.getAfterArguments()[idx];
          arg.setType(newTy);
        }
      }
      return WalkResult::advance();
    });

    // using yieldOp as anchor to update the result type of its ParentOp
    op->walk([](scf::YieldOp yieldOp) {
      Operation *parentOp = yieldOp->getParentOp();
      for (OpResult r : parentOp->getOpResults()) {
        unsigned idx = r.getResultNumber();
        Type resultTy = r.getType();
        Type yieldTy = yieldOp.getResults()[idx].getType();
        if (isa<RankedTensorType>(resultTy) && yieldTy != resultTy)
          r.setType(yieldTy);
      }
    });
  }

  { // perform the conversion from RankedTensorType to VectorType based on the
    // DistributeLayoutAttr

    // Handle the UnrealizedConversionCastOp introduced by the first step.
    // For vector->RankedTensorType, it will simply forward the inputs.
    // For RankedTensorType->vector, it will update the inputs with the
    // one from the adaptor.
    class UnrealizedConversionCastOpPattern
        : public OpConversionPattern<mlir::UnrealizedConversionCastOp> {
      using OpConversionPattern<
          mlir::UnrealizedConversionCastOp>::OpConversionPattern;

      mlir::LogicalResult
      matchAndRewrite(mlir::UnrealizedConversionCastOp op,
                      OneToNOpAdaptor adaptor,
                      ConversionPatternRewriter &rewriter) const override {
        auto inputs = op.getOperands();
        auto outputs = op.getOutputs();

        if (inputs.size() != 1 || outputs.size() != 1)
          return failure();

        auto inputTy = inputs[0].getType();
        auto outputTy = outputs[0].getType();

        if (isa<VectorType>(inputTy) && isa<RankedTensorType>(outputTy)) {
          rewriter.replaceOpWithMultiple(op, adaptor.getInputs());
          return success();
        }

        if (isa<RankedTensorType>(inputTy) && isa<VectorType>(outputTy)) {
          SmallVector<Value> values = xegpu::flattenValues(adaptor.getInputs());
          auto newOp = UnrealizedConversionCastOp::create(rewriter, op.getLoc(),
                                                          outputTy, values);
          rewriter.replaceOp(op, newOp);
          return success();
        }
        return failure();
      }
    };

    converter.addSourceMaterialization(materializeCast);
    converter.addTargetMaterialization([&](OpBuilder &builder, TypeRange type,
                                           ValueRange inputs, Location loc) {
      return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
          .getResults();
    });

    mlir::ConversionTarget target(*context);
    target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
        [](UnrealizedConversionCastOp op) {
          auto isTensorTy = [](Type type) {
            return isa<RankedTensorType>(type);
          };
          return llvm::none_of(op->getOperandTypes(), isTensorTy) &&
                 llvm::none_of(op->getResultTypes(), isTensorTy);
        });
    mlir::RewritePatternSet patterns(context);
    patterns.insert<UnrealizedConversionCastOpPattern>(context);
    scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns,
                                                         target);
    (void)mlir::applyPartialConversion(op, target, std::move(patterns));
  }
}

std::optional<std::string> xegpu::getChipStr(Operation *op) {
  auto gpuModuleOp = op->getParentOfType<gpu::GPUModuleOp>();

  if (!gpuModuleOp)
    return std::nullopt;

  auto targetAttrs = gpuModuleOp.getTargets();
  if (targetAttrs) {
    for (auto &attr : *targetAttrs) {
      auto xevmAttr = llvm::dyn_cast<xevm::XeVMTargetAttr>(attr);
      if (xevmAttr)
        return xevmAttr.getChip().str();
    }
  }

  return std::nullopt;
}

/// Generates element-wise addition ops of two arrays with same length.
SmallVector<OpFoldResult> xegpu::addElementwise(OpBuilder &builder,
                                                Location loc,
                                                ArrayRef<OpFoldResult> lhs,
                                                ArrayRef<OpFoldResult> rhs) {
  assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size");
  SmallVector<OpFoldResult> results;
  for (auto [l, r] : llvm::zip_equal(lhs, rhs)) {
    auto lval = getValueOrCreateConstantIndexOp(builder, loc, l);
    auto rval = getValueOrCreateConstantIndexOp(builder, loc, r);
    results.push_back(builder.createOrFold<arith::AddIOp>(loc, lval, rval));
  }
  return results;
}

/// Generates element-wise addition ops of two arrays with automatic alignment.
/// When the input arrays have different sizes, the shorter array is
/// right-aligned with the longer array, and the unmatched leading elements from
/// the longer array are preserved unchanged. This is commonly used for offset
/// computation where higher-dimensional offsets need to be added to
/// lower-dimensional adjustments.
///
/// Example:
///   lhs = [l1, l2, l3], rhs = [r1, r2]
///   Result: [11, l2+r1, l3+r2]
SmallVector<OpFoldResult>
xegpu::addWithRightAligned(OpBuilder &builder, Location loc,
                           ArrayRef<OpFoldResult> lhs,
                           ArrayRef<OpFoldResult> rhs) {
  // ensure a is longer than b
  ArrayRef<OpFoldResult> a = lhs.size() >= rhs.size() ? lhs : rhs;
  ArrayRef<OpFoldResult> b = lhs.size() >= rhs.size() ? rhs : lhs;
  SmallVector<OpFoldResult> results(a.take_front(a.size() - b.size()));
  a = a.slice(a.size() - b.size());
  results.append(addElementwise(builder, loc, a, b));
  return results;
}

template <typename T>
int xegpu::getLargestDivisor(T dim, ArrayRef<T> candidates,
                             ArrayRef<T> candidateMultiples) {
  static_assert(std::is_integral<T>::value, "T must be an integer type");
  int largest = -1;
  SmallVector<T> multiples = {1};
  if (!candidateMultiples.empty())
    multiples =
        SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end());
  for (T candidate : candidates) {
    for (T multiple : multiples) {
      int value = static_cast<int>(candidate * multiple);
      if (value != 0 && dim % value == 0 && value > largest)
        largest = value;
    }
  }
  return largest;
}

Value xegpu::subgroupReduction(Location loc, OpBuilder &builder, Value input,
                               vector::CombiningKind kind, uint32_t size) {
  // First reduce on a single thread to get per lane reduction value.
  Value laneVal = vector::ReductionOp::create(builder, loc, kind, input);
  // Parallel reduction using butterfly shuffles.
  for (uint64_t i = 1; i < size; i <<= 1) {
    Value shuffled =
        gpu::ShuffleOp::create(builder, loc, laneVal, i, /**  width = **/ size,
                               /**  mode = **/ gpu::ShuffleMode::XOR)
            .getShuffleResult();
    laneVal = makeArithReduction(builder, loc, kind, laneVal, shuffled);
  }
  return laneVal;
}

Value xegpu::lowerToVectorReductions(TypedValue<VectorType> src,
                                     TypedValue<VectorType> acc,
                                     vector::CombiningKind kind,
                                     int64_t reductionDim, Location loc,
                                     PatternRewriter &rewriter) {
  VectorType sourceType = src.getType();
  int64_t sourceRank = sourceType.getRank();
  // Expecting at least a 2D source vector. Leading dimensions (all except the
  // last two) must be unit.
  assert(sourceRank >= 2 && "expected at least a 2D source vector");
  for (int64_t i = 0; i < sourceRank - 2; ++i)
    assert(sourceType.getShape()[i] == 1 &&
           "expected leading dimensions to be unit");
  int64_t rowIdx = sourceRank - 2;
  int64_t columnIdx = sourceRank - 1;
  int64_t sourceH = sourceType.getShape()[rowIdx];
  int64_t sourceW = sourceType.getShape()[columnIdx];
  int nSlices = (reductionDim == rowIdx) ? sourceW : sourceH;
  // Create a constant vector to hold the result of the reduction.
  TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
  Value reductionResult = arith::ConstantOp::create(
      rewriter, loc, acc.getType(),
      DenseElementsAttr::get(acc.getType(), zeroAttr));
  auto srcLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(src));
  auto accLayout = xegpu::getTemporaryLayout(dyn_cast<OpResult>(acc));
  // Reduction result should have the same layout as the accumulator.
  xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
  // For each slice of the source, extract the slice vector, do a reduction
  // and, insert the reduced value back to the result vector.
  int64_t accRank = acc.getType().getRank();
  for (int i = 0; i < nSlices; ++i) {
    // Build nD offsets, sizes, and strides. Leading unit dims get
    // offset=0, size=1. The last two dims are set based on reductionDim.
    SmallVector<int64_t> sliceOffsets(sourceRank, 0);
    SmallVector<int64_t> sliceSizes(sourceRank, 1);
    SmallVector<int64_t> strides(sourceRank, 1);
    if (reductionDim == columnIdx) {
      sliceOffsets[rowIdx] = i;
      sliceSizes[columnIdx] = sourceW;
    } else {
      sliceOffsets[columnIdx] = i;
      sliceSizes[rowIdx] = sourceH;
    }

    vector::ExtractStridedSliceOp extractOp =
        vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                              sliceSizes, strides);
    // Extract strided slice has the same layout as src.
    xegpu::setTemporaryLayout(extractOp->getOpResult(0), srcLayout);

    int64_t nSliceElements = extractOp.getResult().getType().getNumElements();

    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
        rewriter, loc,
        VectorType::get({nSliceElements}, sourceType.getElementType()),
        extractOp.getResult());

    // Shape cast output has the same layout as the accumulator. Shape cast
    // source has the same layout as the original reduction source.
    xegpu::setTemporaryLayout(slice->getOpOperand(0), srcLayout);
    xegpu::setTemporaryLayout(slice->getOpResult(0), accLayout);
    // Extract and reduction results in scalars, so no result layout is needed.
    // Build multi-dim index into acc (sourceRank-1 dims, i.e. source shape with
    // the reduction dim removed). Leading unit dims get index 0.
    SmallVector<int64_t> accIdx(accRank, 0);
    accIdx[accRank - 1] = i;
    Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, accIdx);
    Value reduction = vector::ReductionOp::create(
        rewriter, loc, kind, slice.getResult(), accExtract);
    reductionResult = vector::InsertOp::create(rewriter, loc, reduction,
                                               reductionResult, accIdx);
    // Insert op should have the same layout as the accumulator.
    xegpu::setTemporaryLayout(cast<OpResult>(reductionResult), accLayout);
  }
  return reductionResult;
}

Value xegpu::lowerCrossLaneReductionToShuffles(
    TypedValue<VectorType> src, TypedValue<VectorType> acc,
    vector::CombiningKind kind, int64_t reductionDim, int64_t reductionSize,
    Location loc, PatternRewriter &rewriter) {
  VectorType sourceType = src.getType();
  int64_t sourceRank = sourceType.getRank();
  // Expecting at least a 2D source vector. Leading dimensions (all except the
  // last two) must be unit.
  assert(sourceRank >= 2 && "expected at least a 2D source vector");
  for (int64_t i = 0; i < sourceRank - 2; ++i)
    assert(sourceType.getShape()[i] == 1 &&
           "expected leading dimensions to be unit");
  int64_t rowIdx = sourceRank - 2;
  int64_t columnIdx = sourceRank - 1;
  int64_t sourceH = sourceType.getShape()[rowIdx];
  int64_t sourceW = sourceType.getShape()[columnIdx];

  // Create a constant vector to hold the result of the reduction.
  TypedAttr zeroAttr = rewriter.getZeroAttr(sourceType.getElementType());
  Value reductionResult = arith::ConstantOp::create(
      rewriter, loc, acc.getType(),
      DenseElementsAttr::get(acc.getType(), zeroAttr));

  // nSlices is the number of reduction operations needed to reduce the entire
  // source vector. For example, if reductionDim is the row dim, we are
  // reducing across rows, and each slice is a column. So the number of slices
  // is the number of columns, which is sourceW.
  int nSlices = (reductionDim == rowIdx) ? sourceW : sourceH;

  // For each slice of the source, extract the slice vector, do a reduction
  // and, insert the reduced value back to the result vector.
  int64_t accRank = acc.getType().getRank();
  for (int i = 0; i < nSlices; ++i) {
    // Build nD offsets, sizes, and strides. Leading unit dims get
    // offset=0, size=1. The last two dims are set based on reductionDim.
    SmallVector<int64_t> sliceOffsets(sourceRank, 0);
    SmallVector<int64_t> sliceSizes(sourceRank, 1);
    SmallVector<int64_t> strides(sourceRank, 1);
    if (reductionDim == columnIdx) {
      sliceOffsets[rowIdx] = i;
      sliceSizes[columnIdx] = sourceW;
    } else {
      sliceOffsets[columnIdx] = i;
      sliceSizes[rowIdx] = sourceH;
    }

    vector::ExtractStridedSliceOp extractOp =
        vector::ExtractStridedSliceOp::create(rewriter, loc, src, sliceOffsets,
                                              sliceSizes, strides);
    int64_t nSliceElements = extractOp.getResult().getType().getNumElements();
    vector::ShapeCastOp slice = vector::ShapeCastOp::create(
        rewriter, loc,
        VectorType::get({nSliceElements}, sourceType.getElementType()),
        extractOp.getResult());

    SmallVector<int64_t> accIdx(accRank, 0);
    accIdx[accRank - 1] = i;
    Value accExtract = vector::ExtractOp::create(rewriter, loc, acc, accIdx);
    Value fullReduce =
        xegpu::subgroupReduction(loc, rewriter, slice, kind, reductionSize);
    fullReduce =
        vector::makeArithReduction(rewriter, loc, kind, fullReduce, accExtract);
    reductionResult = vector::InsertOp::create(rewriter, loc, fullReduce,
                                               reductionResult, accIdx);
  }
  return reductionResult;
}

Value xegpu::createReductionNeutralValue(OpBuilder &builder, Location loc,
                                         Type type,
                                         vector::CombiningKind kind) {
  auto vecTy = dyn_cast<VectorType>(type);
  Type elemTy = vecTy ? vecTy.getElementType() : type;

  // Helper to create either a splat vector or scalar constant from an attr.
  auto makeConst = [&](Attribute scalarAttr) -> Value {
    if (vecTy)
      return arith::ConstantOp::create(
          builder, loc, vecTy, DenseElementsAttr::get(vecTy, scalarAttr));
    return arith::ConstantOp::create(builder, loc, cast<TypedAttr>(scalarAttr));
  };

  switch (kind) {
  case vector::CombiningKind::ADD:
  case vector::CombiningKind::XOR:
  case vector::CombiningKind::OR:
  case vector::CombiningKind::MAXUI:
    return makeConst(builder.getZeroAttr(elemTy));

  case vector::CombiningKind::MUL:
  case vector::CombiningKind::AND:
    return makeConst(builder.getOneAttr(elemTy));

  case vector::CombiningKind::MINSI:
    if (auto intTy = dyn_cast<IntegerType>(elemTy))
      return makeConst(builder.getIntegerAttr(
          elemTy, APInt::getSignedMaxValue(intTy.getWidth())));
    return nullptr;

  case vector::CombiningKind::MINUI:
    if (auto intTy = dyn_cast<IntegerType>(elemTy))
      return makeConst(
          builder.getIntegerAttr(elemTy, APInt::getMaxValue(intTy.getWidth())));
    return nullptr;

  case vector::CombiningKind::MAXSI:
    if (auto intTy = dyn_cast<IntegerType>(elemTy))
      return makeConst(builder.getIntegerAttr(
          elemTy, APInt::getSignedMinValue(intTy.getWidth())));
    return nullptr;

  case vector::CombiningKind::MINNUMF:
  case vector::CombiningKind::MINIMUMF:
    if (auto floatTy = dyn_cast<FloatType>(elemTy))
      return makeConst(builder.getFloatAttr(
          elemTy, APFloat::getInf(floatTy.getFloatSemantics())));
    return nullptr;

  case vector::CombiningKind::MAXNUMF:
  case vector::CombiningKind::MAXIMUMF:
    if (auto floatTy = dyn_cast<FloatType>(elemTy))
      return makeConst(builder.getFloatAttr(
          elemTy, APFloat::getInf(floatTy.getFloatSemantics(), true)));
    return nullptr;
  }
  return nullptr;
}

/// Explicit instantiations
template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates,
                                           ArrayRef<int> candidateMultiples);
template int
xegpu::getLargestDivisor<unsigned>(unsigned dim, ArrayRef<unsigned> candidates,
                                   ArrayRef<unsigned> candidateMultiples);

bool xegpu::requirePacked(const xegpu::LayoutAttr layout) {
  if (!layout)
    return false;
  auto laneData = layout.getEffectiveLaneDataAsInt();
  if (laneData.size() != 2)
    return false;
  return laneData[0] != 1;
}

bool xegpu::requireTranspose(const xegpu::LayoutAttr layout,
                             const xegpu::uArch::uArch *uArch) {
  // Return false for unsupported targets.
  // TODO: Add more support or move to target info.
  if (uArch->getName().equals_insensitive("pvc") &&
      uArch->getName().equals_insensitive("bmg"))
    return false;
  if (!layout)
    return false;
  auto laneLayout = layout.getEffectiveLaneLayoutAsInt();
  if (laneLayout.size() != 2)
    return false;
  return laneLayout[0] == uArch->getSubgroupSize() && laneLayout[1] == 1;
}

// Check if dst shape is an expansion of src shape by inserting unit dimensions.
// Returns true if all dimensions in src match corresponding dimensions in dst
// (after skipping unit dimensions), and populates expandedUnitDims with the
// indices of the unit dimensions in dst that were added (not present in src).
// Example: src=[2,3], dst=[1,2,3,1] -> true, expandedUnitDims=[0,3]
bool xegpu::matchUnitDimExpansion(ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
                                  SmallVector<int64_t> &expandedUnitDims) {
  // All unit dimensions in dst that don't appear in src are the expanded
  // unit dimensions
  size_t srcIdx = 0;
  for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx)
    if (srcIdx < src.size() && src[srcIdx] == dst[dstIdx])
      srcIdx++;
    else if (dst[dstIdx] == 1)
      expandedUnitDims.push_back(dstIdx);
    else
      return false;
  return srcIdx == src.size();
}

// Checks if dst shape is an expansion of src shape where each dimension in src
// is split into one or more consecutive dimensions in dst whose product equals
// the original dimension. Populates splitDimGroups with groups of dst indices
// that correspond to each src dimension. Example: src=[6,4], dst=[2,3,2,2] ->
// true
bool xegpu::matchSplitDimExpansion(
    ArrayRef<int64_t> src, ArrayRef<int64_t> dst,
    SmallVector<SmallVector<int64_t>> &splitDimGroups) {
  // each dim in src can be mapped to one or more dims in dst whose product
  // equals to the src dim
  size_t srcIdx = 0;
  int64_t accumulatedSize = 1;
  SmallVector<int64_t> currentDstDims;

  splitDimGroups.clear();
  for (size_t dstIdx = 0; dstIdx < dst.size(); ++dstIdx) {
    if (srcIdx >= src.size())
      return false;
    accumulatedSize *= dst[dstIdx];
    currentDstDims.push_back(dstIdx);

    if (accumulatedSize == src[srcIdx]) {
      // Record the mapping: srcIdx -> currentDstDims
      splitDimGroups.push_back(currentDstDims);
      // move to next src dim
      srcIdx++;
      accumulatedSize = 1;
      currentDstDims.clear();
    } else if (accumulatedSize > src[srcIdx]) {
      return false;
    }
  }
  return srcIdx == src.size();
}