llvm-project/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp

//===---- XeGPUBlocking.cpp ---- XeGPU Blocking Pass ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/XeGPU/Transforms/Passes.h"

#include "mlir/Dialect/Index/IR/IndexDialect.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
#include "mlir/Dialect/XeGPU/Transforms/XeGPULayoutImpl.h"
#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/DebugLog.h"

namespace mlir {
namespace xegpu {
#define GEN_PASS_DEF_XEGPUBLOCKING
#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
} // namespace xegpu
} // namespace mlir

#define DEBUG_TYPE "xegpu-blocking"

using namespace mlir;

namespace {

// reslove the unrealized conversion cast ops generated when doing SCF
// Structural Type Conversion. It will have two formats, N:1 vector
// cast and 1:N vector cast. vector::insert_strided_slice ops will be
// used for the first case, and vector::extract_strided_slice ops will be
// used for the second case.
static void
resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
  ValueRange inputs = castOp.getInputs();
  ValueRange outputs = castOp.getOutputs();

  auto hasIdenticalVectorTypes = [](ValueRange values) {
    auto types = values.getTypes();
    return llvm::all_of(types, [&](Type type) {
      return isa<VectorType>(type) && type == types.front();
    });
  };

  // We only interest in the case where all inputs and outputs have the
  // identical VectorTypes
  if (!hasIdenticalVectorTypes(inputs) || !hasIdenticalVectorTypes(outputs)) {
    LDBG() << "skip unrealized conversion cast op not emulating pack/unpack.";
    return;
  }

  VectorType outputTy = dyn_cast<VectorType>(outputs[0].getType());
  OpBuilder builder(castOp);
  if (inputs.size() > 1 && outputs.size() == 1) {
    // the castOp is emulating an unpack op
    ArrayRef<int64_t> shape = outputTy.getShape();
    Value result = xegpu::createVectorWithShapeFromValues(
        builder, castOp.getLoc(), inputs, shape);
    castOp->replaceAllUsesWith(ValueRange(result));
    castOp->erase();
  } else if (castOp.getNumResults() > 1 && castOp.getNumOperands() == 1) {
    // the castOp is emulating a pack op
    ArrayRef<int64_t> tileShape = outputTy.getShape();
    SmallVector<Value> results = xegpu::extractVectorsWithShapeFromValue(
        builder, castOp.getLoc(), inputs[0], tileShape);
    castOp->replaceAllUsesWith(results);
    castOp->erase();
  }
}

// This pattern lowers ConvertLayoutOp by removing the inst_data field from the
// layout attributes. Since both producer and consumer operations handle data
// partitioning based on their own inst_data, while maintaining original input
// and output shape, ConvertLayoutOp does not need to manage inst_data.
struct ConvertLayoutOpPattern
    : public OpRewritePattern<xegpu::ConvertLayoutOp> {
  using OpRewritePattern::OpRewritePattern;
  LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
                                PatternRewriter &rewriter) const override {
    xegpu::DistributeLayoutAttr inputLayout = op.getInputLayoutAttr();
    xegpu::DistributeLayoutAttr targetLayout = op.getTargetLayoutAttr();
    if (inputLayout.getEffectiveInstDataAsInt().empty() ||
        targetLayout.getEffectiveInstDataAsInt().empty())
      return rewriter.notifyMatchFailure(op, "Not a target ConvertLayoutOp.");

    inputLayout = inputLayout.dropInstData();
    targetLayout = targetLayout.dropInstData();
    auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(
        op.getLoc(), op.getType(), op.getSource(), inputLayout, targetLayout);
    rewriter.replaceOp(op, newOp);
    return success();
  }
};

//===------------------------------------------------------------------------===//
// The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops
// to partition operations that process large shapes into multiple operations on
// smaller shapes, as specified by the inst_data in the layout attribute. This
// enables each resulting operation to be efficiently mapped to a hardware
// instruction.
//===------------------------------------------------------------------------===//

class XeGPUBlockingPass final
    : public xegpu::impl::XeGPUBlockingBase<XeGPUBlockingPass> {
public:
  void runOnOperation() override;

private:
  // Get the tile shape for a given OpOperand or OpResult by examining the
  // corresponding layout attribute. If layout is not present or is not a
  // subgroup level layout, it returns std::nullopt.
  template <typename T,
            typename = std::enable_if_t<std::is_same_v<T, OpOperand> ||
                                        std::is_same_v<T, OpResult>>>
  std::optional<SmallVector<int64_t>>
  getTileShape(const T &operandOrResult) const;

  // Get the tile shape for a given operation.
  std::optional<SmallVector<int64_t>> getTileShape(Operation *op) const;

  // Determine if the operation requires unrolling. Return false if all operands
  // and results have tile shapes identical to their original types. Otherwise,
  // return true.
  bool needsUnroll(Operation *op) const;
};
} // namespace

template <typename T, typename>
std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(const T &operandOrResult) const {
  Value value;
  Operation *ownerOp;
  if constexpr (std::is_same_v<T, OpOperand>) {
    value = operandOrResult.get();
    ownerOp = operandOrResult.getOwner();
  } else {
    value = (Value)operandOrResult;
    ownerOp = value.getDefiningOp();
  }

  xegpu::DistributeLayoutAttr layout =
      xegpu::getDistributeLayoutAttr(operandOrResult);
  if (layout && layout.isForSubgroup()) {
    if (!layout.getEffectiveInstDataAsInt().empty()) {
      SmallVector<int64_t> instData = layout.getEffectiveInstDataAsInt();
      // Remove leading unit dimensions from inst_data for non-rank-sensitive
      // ops. For example, if the inst_data is [1, 1, 32] it will pass [32] as
      // the unroll/blocking size.
      // Skip it for rank-sensitive ops, whose semantics depend on the tensor
      // rank (and consequently its shape), and therefore must not alter the
      // input tile rank or shape, such as by dropping leading dimensions.
      bool skipLeadingUnitDimRemoval =
          ownerOp &&
          (isa<xegpu::CreateNdDescOp, xegpu::DpasOp, xegpu::ConvertLayoutOp,
               xegpu::LoadMatrixOp, xegpu::StoreMatrixOp, xegpu::AtomicRMWOp,
               xegpu::LoadNdOp, xegpu::StoreNdOp, xegpu::PrefetchNdOp,
               vector::TransposeOp, vector::ShapeCastOp,
               vector::MultiDimReductionOp, vector::BroadcastOp>(ownerOp));
      if (!skipLeadingUnitDimRemoval) {
        auto it = llvm::find_if(instData, [](auto val) { return val != 1; });
        instData.erase(instData.begin(), it);
      }
      return instData;
    }

    if (auto type = dyn_cast<ShapedType>(value.getType()))
      return llvm::to_vector(type.getShape());
  }
  LDBG() << "failed to getTileShape for: " << value;
  return std::nullopt;
}

std::optional<SmallVector<int64_t>>
XeGPUBlockingPass::getTileShape(Operation *op) const {
  if (isa<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp, xegpu::CreateDescOp,
          xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op))
    return getTileShape(op->getOpResult(0));
  if (isa<xegpu::PrefetchNdOp, xegpu::LoadNdOp, xegpu::PrefetchOp,
          xegpu::StoreMatrixOp>(op))
    return getTileShape(op->getOpOperand(0));
  if (isa<xegpu::StoreNdOp>(op))
    return getTileShape(op->getOpOperand(1));

  // Handle LoadGatherOp and StoreScatterOp (with and without offset)
  if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(op)) {
    if (loadGatherOp.getOffsets())
      return getTileShape(loadGatherOp->getOpResult(0));
    else
      return getTileShape(loadGatherOp->getOpOperand(0));
  }

  if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
    return getTileShape(storeScatterOp.getOffsets()
                            ? storeScatterOp->getOpOperand(0)
                            : storeScatterOp->getOpOperand(1));

  if (isa<xegpu::DpasOp>(op)) {
    std::optional<SmallVector<int64_t>> aTile =
        getTileShape(op->getOpOperand(0));
    std::optional<SmallVector<int64_t>> bTile =
        getTileShape(op->getOpOperand(1));

    if (!aTile || aTile->size() != 2 || !bTile || bTile->size() != 2)
      return std::nullopt;

    // semantic check for A and B
    if ((*aTile)[1] != (*bTile)[0])
      return std::nullopt;

    // semantic check for C
    if (op->getNumOperands() == 3) {
      std::optional<SmallVector<int64_t>> cTile =
          getTileShape(op->getOpOperand(2));
      int64_t expectedCTile[2] = {(*aTile)[0], (*bTile)[1]};
      if (!cTile || !llvm::equal(*cTile, expectedCTile))
        return std::nullopt;
    }

    return SmallVector<int64_t>({(*aTile)[0], (*aTile)[1], (*bTile)[1]});
  }

  if (OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)
    return getTileShape(op->getOpResult(0));

  if (isa<vector::MultiDimReductionOp>(op))
    return getTileShape(op->getOpOperand(0));

  if (isa<vector::TransposeOp, vector::BroadcastOp, vector::StepOp,
          vector::ConstantMaskOp, vector::CreateMaskOp>(op))
    return getTileShape(op->getOpResult(0));

  return std::nullopt;
}

bool XeGPUBlockingPass::needsUnroll(Operation *op) const {
  // skip the op if any of its operands or results has workgroup level layouts
  bool hasWgLayoutOperands =
      llvm::any_of(op->getOpOperands(), [](OpOperand &opr) {
        xegpu::DistributeLayoutAttr layout =
            xegpu::getDistributeLayoutAttr(opr);
        return layout && layout.isForWorkgroup();
      });
  bool hasWgLayoutResults =
      llvm::any_of(op->getOpResults(), [](OpResult result) {
        xegpu::DistributeLayoutAttr layout =
            xegpu::getDistributeLayoutAttr(result);
        return layout && layout.isForWorkgroup();
      });
  if (hasWgLayoutOperands || hasWgLayoutResults) {
    LDBG() << "skip unrolling for op with workgroup level layout: " << *op;
    return false;
  }

  auto isUnrollable = [](Value value, ArrayRef<int64_t> tileShape) {
    Type valTy = value.getType();
    if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(valTy)) {
      xegpu::DistributeLayoutAttr layout = tdescTy.getLayoutAttr();
      return layout && !layout.getEffectiveInstDataAsInt().empty();
    }
    auto shapedType = dyn_cast<ShapedType>(valTy);
    return shapedType && !llvm::equal(tileShape, shapedType.getShape());
  };

  bool hasUnrollableOperands =
      llvm::any_of(op->getOpOperands(), [&](OpOperand &opr) {
        std::optional<SmallVector<int64_t>> tileShape = getTileShape(opr);
        return tileShape.has_value() && isUnrollable(opr.get(), *tileShape);
      });
  bool hasUnrollableResults =
      llvm::any_of(op->getOpResults(), [&](OpResult result) {
        std::optional<SmallVector<int64_t>> tileShape = getTileShape(result);
        return tileShape.has_value() && isUnrollable(result, *tileShape);
      });
  return hasUnrollableOperands || hasUnrollableResults;
}

void XeGPUBlockingPass::runOnOperation() {
  MLIRContext *ctx = &getContext();
  Operation *op = getOperation();

  // TODO-LayoutRefactor: unify the local propagation for layout preprocessing
  // replace the function with recoverTemporaryLayouts
  // if (!xegpu::recoverTemporaryLayouts(op)) {
  //   signalPassFailure();
  //   return;
  // }
  xegpu::recoverTemporaryLayoutsDeprecated(op);

  auto getTileShapeAndCount = [](llvm::ArrayRef<int64_t> shape,
                                 xegpu::LayoutAttr layout) {
    int count = 1;
    SmallVector<int64_t> tileShape(shape);
    if (layout && layout.getInstData()) {
      DenseI32ArrayAttr instData = layout.getInstData();
      tileShape = llvm::to_vector_of<int64_t>(instData.asArrayRef());
      count = computeProduct(shape) / computeProduct(tileShape);
    }
    return std::make_pair(tileShape, count);
  };

  // Perform type conversion for SCF control folow ops
  TypeConverter converter;
  converter.addConversion([](Type type) -> Type { return type; });
  converter.addConversion(
      [&](RankedTensorType type,
          SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
        Type elemTy = type.getElementType();
        ArrayRef<int64_t> shape = type.getShape();

        auto layout =
            llvm::dyn_cast_if_present<xegpu::LayoutAttr>(type.getEncoding());
        if (layout && layout.isForWorkgroup())
          return failure();

        int count;
        SmallVector<int64_t> subShape;
        std::tie(subShape, count) = getTileShapeAndCount(shape, layout);
        auto newTy = VectorType::get(subShape, elemTy);
        result.append(count, newTy);
        return success();
      });
  converter.addConversion(
      [&](xegpu::TensorDescType type,
          SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
        Type elemTy = type.getElementType();
        ArrayRef<int64_t> shape = type.getShape();

        xegpu::LayoutAttr layout = type.getLayoutAttr();
        if (layout && layout.isForWorkgroup())
          return failure();

        int count;
        SmallVector<int64_t> subShape;
        std::tie(subShape, count) = getTileShapeAndCount(shape, layout);

        if (layout)
          layout = layout.dropInstData();

        auto newTy = xegpu::TensorDescType::get(
            type.getContext(), subShape, elemTy, type.getEncoding(), layout);
        result.append(count, newTy);
        return success();
      });

  xegpu::doSCFStructuralTypeConversionWithTensorType(op, converter);

  // Remove leading unit dimensions from vector ops and then
  // do the unrolling.
  {
    RewritePatternSet patterns(ctx);
    vector::populateCastAwayVectorLeadingOneDimPatterns(patterns);
    (void)applyPatternsGreedily(op, std::move(patterns));
  }
  xegpu::UnrollOptions options;
  options.setFilterConstraint(
      [&](Operation *op) -> LogicalResult { return success(needsUnroll(op)); });

  options.setNativeShapeFn([&](Operation *op) { return getTileShape(op); });

  options.setUnrolledTypesFn([&](ShapedType type, ArrayRef<int64_t> tileShape,
                                 bool returnSingleType = false) {
    Type elemTy = type.getElementType();
    Type newTy;

    if (auto tdescTy = dyn_cast<xegpu::TensorDescType>(type)) {

      Attribute encoding = tdescTy.getEncoding();
      // If the encoding is a ScatterTensorDescAttr, we need to
      // potentially adjust the chunk size based on the inst_data.
      if (tdescTy.isScattered()) {
        int64_t chunkSize = tdescTy.getChunkSizeAsInt();

        if (chunkSize > 1) {
          int64_t blockedChunkSize = chunkSize;
          auto instData = tdescTy.getLayoutAttr().getInstData();
          if (!instData.empty())
            blockedChunkSize = instData.asArrayRef().back();

          // To create a new attribute with a different chunk_size:
          auto newEncoding = xegpu::ScatterTensorDescAttr::get(
              ctx, tdescTy.getMemorySpace(), blockedChunkSize);
          encoding = newEncoding;
        }
      }

      newTy =
          xegpu::TensorDescType::get(ctx, tileShape, elemTy, encoding,
                                     tdescTy.getLayoutAttr().dropInstData());
    } else {
      newTy = VectorType::get(tileShape, elemTy);
    }

    if (returnSingleType)
      return SmallVector<Type>{newTy};
    std::optional<SmallVector<int64_t>> ratio =
        computeShapeRatio(type.getShape(), tileShape);
    assert(ratio && "The shape of the type must be a multiple of tileShape.");
    return SmallVector<Type>(computeProduct(*ratio), newTy);
  });

  RewritePatternSet patterns(ctx);
  patterns.add<ConvertLayoutOpPattern>(ctx);

  vector::UnrollVectorOptions vectorOptions;
  vectorOptions.setNativeShapeFn(options.nativeShape);

  populateXeGPUUnrollPatterns(patterns, options);
  vector::populateVectorUnrollPatterns(patterns, vectorOptions);

  (void)applyPatternsGreedily(op, std::move(patterns));

  op->walk([](Operation *op) {
    // Remove the layout attributes cached per operands.
    for (OpOperand &opr : op->getOpOperands()) {
      std::string name = xegpu::getTemporaryLayoutName(opr);
      if (op->hasAttrOfType<xegpu::DistributeLayoutAttr>(name))
        op->removeAttr(name);
    }

    // Update the layout attributes per result.
    for (OpResult result : op->getOpResults()) {
      std::string name = xegpu::getTemporaryLayoutName(result);
      if (auto layout = op->getAttrOfType<xegpu::DistributeLayoutAttr>(name)) {
        op->removeAttr(name);
        if (!isa<LoopLikeOpInterface>(op))
          xegpu::setDistributeLayoutAttr(result, layout.dropInstData());
      }
    }

    // Resolve unrealized conversion cast ops emulating pack/unpack
    if (auto castOp = dyn_cast<UnrealizedConversionCastOp>(op))
      resolveUnrealizedConversionCastOp(castOp);
  });
}