llvm-project/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

//===- GPUTransformOps.cpp - Implementation of GPU transform ops ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/TransformOps/Utils.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Transform/IR/TransformDialect.h"
#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/Dialect/Vector/Transforms/VectorTransforms.h"
#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/IR/Visitors.h"
#include "mlir/Support/LLVM.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include <type_traits>

using namespace mlir;
using namespace mlir::gpu;
using namespace mlir::transform;
using namespace mlir::transform::gpu;

#define DEBUG_TYPE "gpu-transforms"
#define DEBUG_TYPE_ALIAS "gpu-transforms-alias"

#define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
#define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ")

//===----------------------------------------------------------------------===//
// Apply...ConversionPatternsOp
//===----------------------------------------------------------------------===//

void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  // NVVM uses alloca in the default address space to represent private
  // memory allocations, so drop private annotations. NVVM uses address
  // space 3 for shared memory. NVVM uses the default address space to
  // represent global memory.
  // Used in populateGpuToNVVMConversionPatternsso attaching here for now.
  // TODO: We should have a single to_nvvm_type_converter.
  populateGpuMemorySpaceAttributeConversions(
      llvmTypeConverter, [](AddressSpace space) -> unsigned {
        switch (space) {
        case AddressSpace::Global:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kGlobalMemorySpace);
        case AddressSpace::Workgroup:
          return static_cast<unsigned>(
              NVVM::NVVMMemorySpace::kSharedMemorySpace);
        case AddressSpace::Private:
          return 0;
        }
        llvm_unreachable("unknown address space enum value");
        return 0;
      });
  // Used in GPUToNVVM/WmmaOpsToNvvm.cpp so attaching here for now.
  // TODO: We should have a single to_nvvm_type_converter.
  llvmTypeConverter.addConversion(
      [&](MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); });
  // Set higher benefit, so patterns will run before generic LLVM lowering.
  populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns,
                                      getBenefit());
}

LogicalResult
transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}

void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
    TypeConverter &typeConverter, RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
}

LogicalResult
transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
    transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}

void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    populatePatterns(TypeConverter &typeConverter,
                     RewritePatternSet &patterns) {
  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
  populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
}

LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
    verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
  if (builder.getTypeConverterType() != "LLVMTypeConverter")
    return emitOpError("expected LLVMTypeConverter");
  return success();
}

//===----------------------------------------------------------------------===//
// Apply...PatternsOp
//===----------------------------------------------------------------------===//s

void ApplyGPURewritePatternsOp::populatePatterns(RewritePatternSet &patterns) {
  populateGpuRewritePatterns(patterns);
}

//===----------------------------------------------------------------------===//
// ApplyUnrollVectorsSubgroupMmaOp
//===----------------------------------------------------------------------===//

/// Pick an unrolling order that will allow tensorcore operation to reuse LHS
/// register.
static std::optional<SmallVector<int64_t>>
gpuMmaUnrollOrder(vector::ContractionOp contract) {
  SmallVector<int64_t> order;
  // First make reduction the outer dimensions.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isReductionIterator(iter)) {
      order.push_back(index);
    }
  }

  llvm::SmallDenseSet<int64_t> dims;
  for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) {
    dims.insert(cast<AffineDimExpr>(expr).getPosition());
  }
  // Then parallel dimensions that are part of Lhs as we want to re-use Lhs.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isParallelIterator(iter) && dims.count(index)) {
      order.push_back(index);
    }
  }
  // Then the remaining parallel loops.
  for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) {
    if (vector::isParallelIterator(iter) && !dims.count(index)) {
      order.push_back(index);
    }
  }
  return order;
}

/// Returns the target vector size for the target operation based on the native
/// vector size specified with `m`, `n`, and `k`.
static std::optional<SmallVector<int64_t>>
getSubgroupMmaNativeVectorSize(Operation *op, int64_t m, int64_t n, int64_t k) {
  if (auto contract = dyn_cast<vector::ContractionOp>(op)) {
    int64_t contractRank = contract.getIteratorTypes().size();
    if (contractRank < 3)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(contractRank - 3, 1);
    nativeSize.append({m, n, k});
    return nativeSize;
  }
  if (auto writeOp = dyn_cast<vector::TransferWriteOp>(op)) {
    int64_t writeRank = writeOp.getVectorType().getRank();
    if (writeRank < 2)
      return std::nullopt;
    SmallVector<int64_t> nativeSize(writeRank - 2, 1);
    nativeSize.append({m, n});
    return nativeSize;
  }
  if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
    // Transfer read ops may need different shapes based on how they are being
    // used. For simplicity just match the shape used by the extract strided op.
    VectorType sliceType;
    for (Operation *users : op->getUsers()) {
      auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
      if (!extract)
        return std::nullopt;
      auto vecType = cast<VectorType>(extract.getResult().getType());
      if (sliceType && sliceType != vecType)
        return std::nullopt;
      sliceType = vecType;
    }
    return llvm::to_vector(sliceType.getShape());
  }
  if ((OpTrait::hasElementwiseMappableTraits(op) && op->getNumResults() == 1)) {
    if (auto vecType = dyn_cast<VectorType>(op->getResultTypes()[0])) {
      // TODO: The condition for unrolling elementwise should be restricted
      // only to operations that need unrolling (connected to the contract).
      if (vecType.getRank() < 2)
        return std::nullopt;

      // First check whether there is a slice to infer the shape from. This is
      // required for cases where the accumulator type differs from the input
      // types, in which case we will see an `arith.ext_` between the contract
      // and transfer_read which needs to be unrolled.
      VectorType sliceType;
      for (Operation *users : op->getUsers()) {
        auto extract = dyn_cast<vector::ExtractStridedSliceOp>(users);
        if (!extract)
          return std::nullopt;
        auto vecType = cast<VectorType>(extract.getResult().getType());
        if (sliceType && sliceType != vecType)
          return std::nullopt;
        sliceType = vecType;
      }
      if (sliceType)
        return llvm::to_vector(sliceType.getShape());

      // Else unroll for trailing elementwise.
      SmallVector<int64_t> nativeSize(vecType.getRank() - 2, 1);
      // Map elementwise ops to the output shape.
      nativeSize.append({m, n});
      return nativeSize;
    }
  }
  return std::nullopt;
}

void transform::ApplyUnrollVectorsSubgroupMmaOp::populatePatterns(
    RewritePatternSet &patterns) {
  auto unrollOrder = [](Operation *op) -> std::optional<SmallVector<int64_t>> {
    auto contract = dyn_cast<vector::ContractionOp>(op);
    if (!contract)
      return std::nullopt;
    return gpuMmaUnrollOrder(contract);
  };

  int64_t m = getM();
  int64_t n = getN();
  int64_t k = getK();
  auto nativeShapeFn =
      [m, n, k](Operation *op) -> std::optional<SmallVector<int64_t>> {
    return getSubgroupMmaNativeVectorSize(op, m, n, k);
  };
  vector::populateVectorUnrollPatterns(
      patterns, vector::UnrollVectorOptions()
                    .setNativeShapeFn(nativeShapeFn)
                    .setUnrollTraversalOrderFn(unrollOrder));
}

//===----------------------------------------------------------------------===//
// EliminateBarriersOp
//===----------------------------------------------------------------------===//

void EliminateBarriersOp::populatePatterns(RewritePatternSet &patterns) {
  populateGpuEliminateBarriersPatterns(patterns);
}

//===----------------------------------------------------------------------===//
// Block and thread mapping utilities.
//===----------------------------------------------------------------------===//

namespace {
/// Local types used for mapping verification.
struct MappingKind {};
struct BlockMappingKind : MappingKind {};
struct ThreadMappingKind : MappingKind {};
} // namespace

static DiagnosedSilenceableFailure
definiteFailureHelper(std::optional<TransformOpInterface> transformOp,
                      Operation *target, const Twine &message) {
  if (transformOp.has_value())
    return transformOp->emitDefiniteFailure() << message;
  return emitDefiniteFailure(target, message);
}

/// Check if given mapping attributes are one of the desired attributes
template <typename MappingKindType>
static DiagnosedSilenceableFailure
checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
                           scf::ForallOp forallOp) {
  if (!forallOp.getMapping().has_value()) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute");
  }

  bool hasBlockMapping = llvm::any_of(forallOp.getMapping().value(),
                                      llvm::IsaPred<GPUBlockMappingAttr>);
  bool hasWarpgroupMapping = llvm::any_of(
      forallOp.getMapping().value(), llvm::IsaPred<GPUWarpgroupMappingAttr>);
  bool hasWarpMapping = llvm::any_of(forallOp.getMapping().value(),
                                     llvm::IsaPred<GPUWarpMappingAttr>);
  bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
                                       llvm::IsaPred<GPUThreadMappingAttr>);
  int64_t countMappingTypes = 0;
  countMappingTypes += hasBlockMapping ? 1 : 0;
  countMappingTypes += hasWarpgroupMapping ? 1 : 0;
  countMappingTypes += hasWarpMapping ? 1 : 0;
  countMappingTypes += hasThreadMapping ? 1 : 0;
  if (countMappingTypes > 1) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix different mapping types, use nesting");
  }
  if (std::is_same<MappingKindType, BlockMappingKind>::value &&
      !hasBlockMapping) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "scf.forall op requires a mapping attribute of kind 'block'");
  }
  if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
      !hasThreadMapping && !hasWarpMapping && !hasWarpgroupMapping) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall op requires a mapping attribute "
                                 "of kind 'thread' or 'warp'");
  }

  DenseSet<Attribute> seen;
  for (Attribute map : forallOp.getMapping()->getValue()) {
    if (seen.contains(map)) {
      return definiteFailureHelper(
          transformOp, forallOp,
          "duplicate attribute, cannot map different loops "
          "to the same mapping id");
    }
    seen.insert(map);
  }

  auto isLinear = [](Attribute a) {
    return cast<DeviceMappingAttrInterface>(a).isLinearMapping();
  };
  if (llvm::any_of(forallOp.getMapping()->getValue(), isLinear) &&
      !llvm::all_of(forallOp.getMapping()->getValue(), isLinear)) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "cannot mix linear and non-linear mapping modes");
  }

  return DiagnosedSilenceableFailure::success();
}

template <typename MappingKindType>
static DiagnosedSilenceableFailure
verifyGpuMapping(std::optional<TransformOpInterface> transformOp,
                 scf::ForallOp forallOp) {
  // Check the types of the mapping attributes match.
  DiagnosedSilenceableFailure typeRes =
      checkMappingAttributeTypes<MappingKindType>(transformOp, forallOp);
  if (!typeRes.succeeded())
    return typeRes;

  // Perform other non-types verifications.
  if (!forallOp.isNormalized())
    return definiteFailureHelper(transformOp, forallOp,
                                 "unsupported non-normalized loops");
  if (forallOp.getNumResults() > 0)
    return definiteFailureHelper(transformOp, forallOp,
                                 "only bufferized scf.forall can be mapped");
  bool useLinearMapping = cast<DeviceMappingAttrInterface>(
                              forallOp.getMapping()->getValue().front())
                              .isLinearMapping();
  // TODO: This would be more natural with support for Optional<EnumParameter>
  // in GPUDeviceMappingAttr.
  int64_t maxNumMappingsSupported =
      useLinearMapping ? (getMaxEnumValForMappingId() -
                          static_cast<uint64_t>(MappingId::DimZ))
                       : 3;
  if (forallOp.getRank() > maxNumMappingsSupported) {
    return definiteFailureHelper(transformOp, forallOp,
                                 "scf.forall with rank > ")
           << maxNumMappingsSupported
           << " does not lower for the specified mapping attribute type";
  }
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  return DiagnosedSilenceableFailure::success();
}

/// Struct to return the result of the rewrite of a forall operation.
struct ForallRewriteResult {
  SmallVector<int64_t> mappingSizes;
  SmallVector<Value> mappingIds;
};

/// Helper to replace ids of dimensions known to be 1 by 0 to simplify the IR.
template <typename OpTy, typename OperationOrBlock>
static void
replaceUnitMappingIdsHelper(RewriterBase &rewriter, Location loc,
                            OperationOrBlock *parent, Value replacement,
                            ArrayRef<int64_t> availableMappingSizes) {
  parent->walk([&](OpTy idOp) {
    if (availableMappingSizes[static_cast<int64_t>(idOp.getDimension())] == 1)
      rewriter.replaceAllUsesWith(idOp.getResult(), replacement);
  });
}

static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    scf::ForallOp forallOp, ArrayRef<int64_t> availableMappingSizes,
    ForallRewriteResult &result, const GpuIdBuilder &gpuIdBuilder) {
  LDBG("--start rewriteOneForallCommonImpl");

  // Step 1. Complete the mapping to a full mapping (with 1s) if necessary.
  auto numParallelIterations =
      getConstantIntValues(forallOp.getMixedUpperBound());
  assert(forallOp.isNormalized() && numParallelIterations.has_value() &&
         "requires statically sized, normalized forall op");
  SmallVector<int64_t> tmpMappingSizes = numParallelIterations.value();
  SetVector<Attribute> forallMappingAttrs;
  forallMappingAttrs.insert_range(forallOp.getMapping()->getValue());
  auto comparator = [](Attribute a, Attribute b) -> bool {
    return cast<DeviceMappingAttrInterface>(a).getMappingId() <
           cast<DeviceMappingAttrInterface>(b).getMappingId();
  };

  // Step 1.b. In the linear case, compute the max mapping to avoid needlessly
  // mapping all dimensions. In the 3-D mapping case we need to map all
  // dimensions.
  DeviceMappingAttrInterface maxMapping = cast<DeviceMappingAttrInterface>(
      *llvm::max_element(forallMappingAttrs, comparator));
  DeviceMappingAttrInterface maxLinearMapping;
  if (maxMapping.isLinearMapping())
    maxLinearMapping = maxMapping;
  for (auto attr : gpuIdBuilder.mappingAttributes) {
    // If attr overflows, just skip.
    if (maxLinearMapping && comparator(maxLinearMapping, attr))
      continue;
    // Try to insert. If element was already present, just continue.
    if (!forallMappingAttrs.insert(attr))
      continue;
    // Otherwise, we have a new insertion without a size -> use size 1.
    tmpMappingSizes.push_back(1);
  }
  LLVM_DEBUG(
      llvm::interleaveComma(
          tmpMappingSizes,
          DBGS() << "----tmpMappingSizes extracted from scf.forall op: ");
      llvm::dbgs() << "\n");

  // Step 2. sort the values by the corresponding DeviceMappingAttrInterface.
  SmallVector<int64_t> forallMappingSizes = getValuesSortedByKey(
      forallMappingAttrs.getArrayRef(), tmpMappingSizes, comparator);
  LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes,
                                   DBGS() << "----forallMappingSizes: ");
             llvm::dbgs() << "\n"; llvm::interleaveComma(
                 forallMappingAttrs, DBGS() << "----forallMappingAttrs: ");
             llvm::dbgs() << "\n");

  // Step 3. Generate the mappingIdOps using the provided generator.
  Location loc = forallOp.getLoc();
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(forallOp);
  SmallVector<int64_t> originalBasis(availableMappingSizes);
  bool originalBasisWasProvided = !originalBasis.empty();
  if (!originalBasisWasProvided) {
    originalBasis = forallMappingSizes;
    while (originalBasis.size() < 3)
      originalBasis.push_back(1);
  }

  IdBuilderResult builderResult =
      gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);

  // Step 4. Map the induction variables to the mappingIdOps, this may involve
  // a permutation.
  SmallVector<Value> mappingIdOps = builderResult.mappingIdOps;
  IRMapping bvm;
  for (auto [iv, dim] : llvm::zip_equal(
           forallOp.getInductionVars(),
           forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
    Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
    bvm.map(iv, peIdOp);
  }

  // Step 5. If the originalBasis is already known, create conditionals to
  // predicate the region. Otherwise, the current forall determines the
  // originalBasis and no predication occurs.
  Value predicate;
  if (originalBasisWasProvided) {
    SmallVector<int64_t> activeMappingSizes = builderResult.activeMappingSizes;
    SmallVector<int64_t> availableMappingSizes =
        builderResult.availableMappingSizes;
    SmallVector<Value> activeIdOps = builderResult.activeIdOps;
    // clang-format off
    LLVM_DEBUG(
        llvm::interleaveComma(
          activeMappingSizes, DBGS() << "----activeMappingSizes: ");
        llvm::dbgs() << "\n";
        llvm::interleaveComma(
          availableMappingSizes, DBGS() << "----availableMappingSizes: ");
        llvm::dbgs() << "\n";
        llvm::interleaveComma(activeIdOps, DBGS() << "----activeIdOps: ");
        llvm::dbgs() << "\n");
    // clang-format on
    for (auto [activeId, activeMappingSize, availableMappingSize] :
         llvm::zip_equal(activeIdOps, activeMappingSizes,
                         availableMappingSizes)) {
      if (activeMappingSize > availableMappingSize) {
        return definiteFailureHelper(
            transformOp, forallOp,
            "Trying to map to fewer GPU threads than loop iterations but "
            "overprovisioning is not yet supported. "
            "Try additional tiling of the before mapping or map to more "
            "threads.");
      }
      if (activeMappingSize == availableMappingSize)
        continue;
      Value idx =
          rewriter.create<arith::ConstantIndexOp>(loc, activeMappingSize);
      Value tmpPredicate = rewriter.create<arith::CmpIOp>(
          loc, arith::CmpIPredicate::ult, activeId, idx);
      LDBG("----predicate: " << tmpPredicate);
      predicate = predicate ? rewriter.create<arith::AndIOp>(loc, predicate,
                                                             tmpPredicate)
                            : tmpPredicate;
    }
  }

  // Step 6. Move the body of forallOp.
  // Erase the terminator first, it will not be used.
  rewriter.eraseOp(forallOp.getTerminator());
  Block *targetBlock;
  Block::iterator insertionPoint;
  if (predicate) {
    // Step 6.a. If predicated, move at the beginning.
    auto ifOp = rewriter.create<scf::IfOp>(loc, predicate,
                                           /*withElseRegion=*/false);
    targetBlock = ifOp.thenBlock();
    insertionPoint = ifOp.thenBlock()->begin();
  } else {
    // Step 6.b. Otherwise, move inline just at the rewriter insertion
    // point.
    targetBlock = forallOp->getBlock();
    insertionPoint = rewriter.getInsertionPoint();
  }
  Block &sourceBlock = forallOp.getRegion().front();
  targetBlock->getOperations().splice(insertionPoint,
                                      sourceBlock.getOperations());

  // Step 7. RAUW indices.
  for (Value loopIndex : forallOp.getInductionVars()) {
    Value threadIdx = bvm.lookup(loopIndex);
    rewriter.replaceAllUsesWith(loopIndex, threadIdx);
  }

  // Step 8. Erase old op.
  rewriter.eraseOp(forallOp);

  LLVM_DEBUG(llvm::interleaveComma(forallMappingSizes,
                                   DBGS() << "----result forallMappingSizes: ");
             llvm::dbgs() << "\n"; llvm::interleaveComma(
                 mappingIdOps, DBGS() << "----result mappingIdOps: ");
             llvm::dbgs() << "\n");

  result = ForallRewriteResult{forallMappingSizes, mappingIdOps};
  return DiagnosedSilenceableFailure::success();
}

//===----------------------------------------------------------------------===//
// MapForallToBlocks
//===----------------------------------------------------------------------===//

DiagnosedSilenceableFailure mlir::transform::gpu::mapForallToBlocksImpl(
    RewriterBase &rewriter, TransformOpInterface transformOp,
    scf::ForallOp forallOp, SmallVectorImpl<int64_t> &gridDims,
    const GpuIdBuilder &gpuIdBuilder) {
  LDBG("Start mapForallToBlocksImpl");

  {
    // GPU-specific verifications. There is no better place to anchor
    // those right now: the ForallOp is target-independent and the transform
    // op does not apply to individual ForallOp.
    DiagnosedSilenceableFailure diag =
        verifyGpuMapping<BlockMappingKind>(transformOp, forallOp);
    if (!diag.succeeded())
      return diag;
  }

  Location loc = forallOp.getLoc();
  Block *parentBlock = forallOp->getBlock();
  Value zero;
  {
    // Create an early zero index value for replacements and immediately reset
    // the insertion point.
    OpBuilder::InsertionGuard guard(rewriter);
    rewriter.setInsertionPointToStart(parentBlock);
    zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
  }

  ForallRewriteResult rewriteResult;
  DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
      rewriter, transformOp, forallOp,
      /*availableMappingSizes=*/gridDims, rewriteResult, gpuIdBuilder);

  // Return if anything goes wrong, use silenceable failure as a match
  // failure.
  if (!diag.succeeded())
    return diag;

  // If gridDims was not provided already, set it from the return.
  if (gridDims.empty()) {
    gridDims = rewriteResult.mappingSizes;
    while (gridDims.size() < 3)
      gridDims.push_back(1);
  }
  assert(gridDims.size() == 3 && "Need 3-D gridDims");

  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
  // Here, the result of mapping determines the available mapping sizes.
  replaceUnitMappingIdsHelper<BlockDimOp>(rewriter, loc, parentBlock, zero,
                                          rewriteResult.mappingSizes);

  return DiagnosedSilenceableFailure::success();
}

DiagnosedSilenceableFailure
mlir::transform::gpu::findTopLevelForallOp(Operation *target,
                                           scf::ForallOp &topLevelForallOp,
                                           TransformOpInterface transformOp) {
  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
    if (forallOp->getParentOfType<scf::ForallOp>())
      return WalkResult::advance();
    if (topLevelForallOp)
      // TODO: Handle multiple forall if they are independent.
      return WalkResult::interrupt();
    topLevelForallOp = forallOp;
    return WalkResult::advance();
  });

  if (walkResult.wasInterrupted() || !topLevelForallOp)
    return transformOp.emitSilenceableError()
           << "could not find a unique topLevel scf.forall";
  return DiagnosedSilenceableFailure::success();
}

DiagnosedSilenceableFailure transform::MapForallToBlocks::applyToOne(
    transform::TransformRewriter &rewriter, Operation *target,
    ApplyToEachResultList &results, transform::TransformState &state) {
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  if (!getGenerateGpuLaunch() && !gpuLaunch) {
    DiagnosedSilenceableFailure diag =
        emitSilenceableError()
        << "Given target is not gpu.launch, set `generate_gpu_launch` "
           "attribute";
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
    return diag;
  }

  scf::ForallOp topLevelForallOp;
  DiagnosedSilenceableFailure diag = mlir::transform::gpu::findTopLevelForallOp(
      target, topLevelForallOp, transformOp);
  if (!diag.succeeded()) {
    diag.attachNote(target->getLoc()) << "when applied to this payload op";
    return diag;
  }
  assert(topLevelForallOp && "expect an scf.forall");

  SmallVector<int64_t> gridDims{getGridDims()};
  if (!getGenerateGpuLaunch() && gridDims.size() != 3)
    return transformOp.emitDefiniteFailure("transform require size-3 mapping");

  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(topLevelForallOp);

  // Generate gpu launch here and move the forall inside
  if (getGenerateGpuLaunch()) {
    DiagnosedSilenceableFailure diag =
        createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
    if (!diag.succeeded())
      return diag;

    rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
    Operation *newForallOp = rewriter.clone(*topLevelForallOp);
    rewriter.eraseOp(topLevelForallOp);
    topLevelForallOp = cast<scf::ForallOp>(newForallOp);
  }

  // The BlockIdBuilder adapts to whatever is thrown at it.
  bool useLinearMapping = false;
  if (topLevelForallOp.getMapping()) {
    auto mappingAttr = cast<DeviceMappingAttrInterface>(
        topLevelForallOp.getMapping()->getValue().front());
    useLinearMapping = mappingAttr.isLinearMapping();
  }
  GpuBlockIdBuilder gpuBlockIdBuilder(getContext(), useLinearMapping);

  diag = mlir::transform::gpu::mapForallToBlocksImpl(
      rewriter, transformOp, topLevelForallOp, gridDims, gpuBlockIdBuilder);
  if (!diag.succeeded())
    return diag;

  // Set the GPU launch configuration for the grid dims late, this is
  // subject to IR inspection.
  diag = alterGpuLaunch(rewriter, gpuLaunch,
                        cast<TransformOpInterface>(getOperation()), gridDims[0],
                        gridDims[1], gridDims[2]);

  results.push_back(gpuLaunch);
  return diag;
}

LogicalResult transform::MapForallToBlocks::verify() {
  if (!getGridDims().empty() && getGridDims().size() != 3) {
    return emitOpError() << "transform requires empty or size-3 grid_dims";
  }
  return success();
}

//===----------------------------------------------------------------------===//
// MapNestedForallToThreads
//===----------------------------------------------------------------------===//

static DiagnosedSilenceableFailure checkMappingSpec(
    std::optional<TransformOpInterface> transformOp, scf::ForallOp forallOp,
    ArrayRef<int64_t> numParallelIterations, ArrayRef<int64_t> blockOrGridSizes,
    int factor, bool useLinearMapping = false) {
  if (!useLinearMapping && blockOrGridSizes.front() % factor != 0) {
    auto diag = definiteFailureHelper(
        transformOp, forallOp,
        Twine("3-D mapping: size of threadIdx.x must be a multiple of ") +
            std::to_string(factor));
    return diag;
  }
  if (computeProduct(numParallelIterations) * factor >
      computeProduct(blockOrGridSizes)) {
    auto diag = definiteFailureHelper(
        transformOp, forallOp,
        Twine("the number of required parallel resources (blocks or "
              "threads) ") +
            std::to_string(computeProduct(numParallelIterations) * factor) +
            std::string(" overflows the number of available resources ") +
            std::to_string(computeProduct(blockOrGridSizes)));
    return diag;
  }
  return DiagnosedSilenceableFailure::success();
}

static DiagnosedSilenceableFailure
getThreadIdBuilder(std::optional<TransformOpInterface> transformOp,
                   scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes,
                   int64_t warpSize, GpuIdBuilder &gpuIdBuilder) {
  auto mappingAttr = cast<DeviceMappingAttrInterface>(
      forallOp.getMapping()->getValue().front());
  bool useLinearMapping = mappingAttr.isLinearMapping();

  // Sanity checks that may result in runtime verification errors.
  auto numParallelIterations =
      getConstantIntValues((forallOp.getMixedUpperBound()));
  if (!forallOp.isNormalized() || !numParallelIterations.has_value()) {
    return definiteFailureHelper(
        transformOp, forallOp,
        "requires statically sized, normalized forall op");
  }
  int64_t factor = 1;
  if (isa<GPUWarpgroupMappingAttr>(mappingAttr)) {
    factor = GpuWarpgroupIdBuilder::kNumWarpsPerGroup * warpSize;
  } else if (isa<GPUWarpMappingAttr>(mappingAttr)) {
    factor = warpSize;
  }
  DiagnosedSilenceableFailure diag =
      checkMappingSpec(transformOp, forallOp, numParallelIterations.value(),
                       blockSizes, factor, useLinearMapping);
  if (!diag.succeeded())
    return diag;

  // Start mapping.
  MLIRContext *ctx = forallOp.getContext();
  gpuIdBuilder =
      TypeSwitch<DeviceMappingAttrInterface, GpuIdBuilder>(mappingAttr)
          .Case([&](GPUWarpgroupMappingAttr) {
            return GpuWarpgroupIdBuilder(ctx, warpSize, useLinearMapping);
          })
          .Case([&](GPUWarpMappingAttr) {
            return GpuWarpIdBuilder(ctx, warpSize, useLinearMapping);
          })
          .Case([&](GPUThreadMappingAttr) {
            return GpuThreadIdBuilder(ctx, useLinearMapping);
          })
          .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder {
            llvm_unreachable("unknown mapping attribute");
          });
  return DiagnosedSilenceableFailure::success();
}

DiagnosedSilenceableFailure mlir::transform::gpu::mapOneForallToThreadsImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    scf::ForallOp forallOp, ArrayRef<int64_t> blockSizes, int64_t warpSize,
    bool syncAfterDistribute) {

  {
    // GPU-specific verifications. There is no better place to anchor
    // those right now: the ForallOp is target-independent and the transform
    // op does not apply to individual ForallOp.
    DiagnosedSilenceableFailure diag =
        verifyGpuMapping<ThreadMappingKind>(transformOp, forallOp);
    if (!diag.succeeded())
      return diag;
  }

  GpuIdBuilder gpuIdBuilder;
  {
    // Try to construct the id builder, if it fails, return.
    DiagnosedSilenceableFailure diag = getThreadIdBuilder(
        transformOp, forallOp, blockSizes, warpSize, gpuIdBuilder);
    if (!diag.succeeded())
      return diag;
  }

  Location loc = forallOp.getLoc();
  OpBuilder::InsertionGuard g(rewriter);
  // Insert after to allow for syncthreads after `forall` is erased.
  rewriter.setInsertionPointAfter(forallOp);
  ForallRewriteResult rewriteResult;
  DiagnosedSilenceableFailure diag = rewriteOneForallCommonImpl(
      rewriter, transformOp, forallOp, blockSizes, rewriteResult, gpuIdBuilder);
  if (!diag.succeeded())
    return diag;
  // Add a syncthreads if needed. TODO: warpsync
  if (syncAfterDistribute)
    rewriter.create<BarrierOp>(loc);

  return DiagnosedSilenceableFailure::success();
}

DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForallToThreadsImpl(
    RewriterBase &rewriter, std::optional<TransformOpInterface> transformOp,
    Operation *target, ArrayRef<int64_t> blockDims, int64_t warpSize,
    bool syncAfterDistribute) {
  LDBG("Start mapNestedForallToThreadsImpl");
  if (blockDims.size() != 3) {
    return definiteFailureHelper(transformOp, target,
                                 "requires size-3 thread mapping");
  }

  // Create an early zero index value for replacements.
  Location loc = target->getLoc();
  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
  DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
  WalkResult walkResult = target->walk([&](scf::ForallOp forallOp) {
    diag = mlir::transform::gpu::mapOneForallToThreadsImpl(
        rewriter, transformOp, forallOp, blockDims, warpSize,
        syncAfterDistribute);
    if (diag.isDefiniteFailure())
      return WalkResult::interrupt();
    if (diag.succeeded())
      return WalkResult::skip();
    return WalkResult::advance();
  });
  if (walkResult.wasInterrupted())
    return diag;

  // Replace ids of dimensions known to be 1 by 0 to simplify the IR.
  // Here, the result of mapping determines the available mapping sizes.
  replaceUnitMappingIdsHelper<ThreadIdOp>(rewriter, loc, target, zero,
                                          blockDims);

  return DiagnosedSilenceableFailure::success();
}

DiagnosedSilenceableFailure transform::MapNestedForallToThreads::applyToOne(
    transform::TransformRewriter &rewriter, Operation *target,
    ApplyToEachResultList &results, TransformState &state) {
  LaunchOp gpuLaunch = dyn_cast<LaunchOp>(target);
  auto transformOp = cast<TransformOpInterface>(getOperation());

  // Basic high-level verifications.
  if (!gpuLaunch)
    return emitSilenceableError() << "Given target is not a gpu.launch";

  // Mapping to block ids.
  SmallVector<int64_t> blockDims{getBlockDims()};
  DiagnosedSilenceableFailure diag =
      checkGpuLimits(transformOp, std::nullopt, std::nullopt, std::nullopt,
                     blockDims[0], blockDims[1], blockDims[2]);
  if (diag.isSilenceableFailure()) {
    diag.attachNote(getLoc()) << getBlockDimsAttrName() << " is too large";
    return diag;
  }

  // Set the GPU launch configuration for the block dims early, this is not
  // subject to IR inspection.
  diag = alterGpuLaunch(rewriter, gpuLaunch, transformOp, std::nullopt,
                        std::nullopt, std::nullopt, blockDims[0], blockDims[1],
                        blockDims[2]);

  rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
  diag =
      mapNestedForallToThreadsImpl(rewriter, transformOp, gpuLaunch, blockDims,
                                   getWarpSize(), getSyncAfterDistribute());

  results.push_back(gpuLaunch.getOperation());
  return diag;
}

//===----------------------------------------------------------------------===//
// Transform op registration
//===----------------------------------------------------------------------===//

namespace {
/// Registers new ops and declares PDL as dependent dialect since the
/// additional ops are using PDL types for operands and results.
class GPUTransformDialectExtension
    : public transform::TransformDialectExtension<
          GPUTransformDialectExtension> {
public:
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(GPUTransformDialectExtension)

  GPUTransformDialectExtension() {
    declareGeneratedDialect<scf::SCFDialect>();
    declareGeneratedDialect<arith::ArithDialect>();
    declareGeneratedDialect<GPUDialect>();
    registerTransformOps<
#define GET_OP_LIST
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"
        >();
  }
};
} // namespace

#define GET_OP_CLASSES
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.cpp.inc"

void mlir::gpu::registerTransformDialectExtension(DialectRegistry &registry) {
  registry.addExtensions<GPUTransformDialectExtension>();
}