//===- ACCLoopTiling.cpp - Tile ACC Loops ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the OpenACC loop tiling transformation for acc.loop
// operations that have the tile clause (OpenACC 3.4 spec, section 2.9.8).
//
// Overview:
// ---------
// The tile clause specifies that the iterations of the associated loops should
// be divided into tiles (rectangular blocks). This pass transforms a single
// or nested acc.loop with tile clauses into a structure of "tile loops"
// (iterating over tiles) containing "element loops" (iterating within tiles).
//
// For example, tiling a 2-level nested loop with tile(T1, T2) produces:
//
//   // Before tiling:
//   acc.loop tile(T1, T2) control(%i, %j) = (lb1, lb2) to (ub1, ub2)
//       step (s1, s2)
//
//   // After tiling:
//   acc.loop control(%i) = (lb1) to (ub1) step (s1*T1) { // tile loop 1
//     acc.loop control(%j) = (lb2) to (ub2) step (s2*T2) { // tile loop 2
//       acc.loop control(%ii) = (%i) to (min(ub1, %i+s1*T1)) step (s1) {
//         // element 1
//         acc.loop control(%jj) = (%j) to (min(ub2, %j+s2*T2)) step (s2) {
//           // element 2
//           // loop body using %ii, %jj
//         }
//       }
//     }
//   }
//
// Gang/worker/vector attributes are distributed as follows:
// - gang: applied to tile loops
// - vector: applied to element loops
// - worker: removed from inner loops
//
// Unknown Tile Sizes:
// -------------------
// The OpenACC tile(*) syntax indicates an implementation-defined tile size.
// In the IR, this is represented as -1. The pass resolves these to the
// default tile size (configurable via pass option).
//
// Requirements:
// -------------
// 1. The pass uses the OpenACCSupport analysis for remark and NYI (not yet
//    implemented) emission.
Custom implementations can be registered via // setImplementation() to provide pipeline-specific handling. // //===----------------------------------------------------------------------===// #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h" #include "mlir/Dialect/OpenACC/OpenACC.h" #include "mlir/Dialect/OpenACC/OpenACCUtilsTiling.h" #include "mlir/Dialect/OpenACC/Transforms/Passes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Debug.h" namespace mlir { namespace acc { #define GEN_PASS_DEF_ACCLOOPTILING #include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc" } // namespace acc } // namespace mlir #define DEBUG_TYPE "acc-loop-tile" namespace { using namespace mlir; struct ACCLoopTilingImpl : public OpRewritePattern { ACCLoopTilingImpl(MLIRContext *context, int32_t defaultTileSize, acc::OpenACCSupport &accSupport) : OpRewritePattern(context), defaultTileSize(defaultTileSize), accSupport(accSupport) {} // Check that tile size types are not narrower than IV types. // We only check when both types are IntegerType. For IndexType, the width // is target-dependent and the casting utility will handle it correctly. 
LogicalResult checkTileSizeTypes(acc::LoopOp loop, ArrayRef tileSizes) const { auto ivTypes = loop.getBody().getArgumentTypes(); for (size_t i = 0; i < tileSizes.size() && i < ivTypes.size(); ++i) { Type tileType = tileSizes[i].getType(); Type ivType = ivTypes[i]; // Skip unknown tile sizes (will be created with correct type) auto constVal = getConstantIntValue(tileSizes[i]); if (constVal && *constVal < 0) continue; // Only compare when both are integer types (not index) auto tileIntType = dyn_cast(tileType); auto ivIntType = dyn_cast(ivType); if (tileIntType && ivIntType) { if (tileIntType.getWidth() > ivIntType.getWidth()) { accSupport.emitNYI(loop.getLoc(), "tile size type (i" + std::to_string(tileIntType.getWidth()) + ") is wider than loop IV type (i" + std::to_string(ivIntType.getWidth()) + ")"); return failure(); } } } return success(); } void emitTilingRemarks(acc::LoopOp loop, ArrayRef tileSizes) const { // Emit remarks for loop tiling accSupport.emitRemark( loop, [&]() { auto getTileSizeStr = [&](Value v) -> std::string { std::string name = accSupport.getVariableName(v); // Use "*" for unknown tile sizes (represented as -1 or empty) if (name.empty() || name == "-1") return "*"; return name; }; SmallVector tileStrs; for (Value v : tileSizes) tileStrs.push_back(getTileSizeStr(v)); return "Tiling " + std::to_string(tileSizes.size()) + "-level loop nest with tile(" + llvm::join(tileStrs, ",") + ")"; }, DEBUG_TYPE); // Emit remarks for unknown tile sizes that will be resolved to default // TODO: Need to base the default tile size on some heuristics. 
for (Value tileSize : tileSizes) { std::optional val = getConstantIntValue(tileSize); if (val && *val < 0) { accSupport.emitRemark( loop, [&]() { return "Picking default tile size " + std::to_string(defaultTileSize) + " for unknown tile size '*'"; }, DEBUG_TYPE); } } } LogicalResult matchAndRewrite(acc::LoopOp origLoop, PatternRewriter &rewriter) const override { if (origLoop.getTileValues().empty()) return success(); SmallVector tileSizes(origLoop.getTileValues().begin(), origLoop.getTileValues().end()); unsigned tileCount = tileSizes.size(); unsigned collapseCount = origLoop.getCollapseValue().value_or(1); // Sanity check tile size types if (failed(checkTileSizeTypes(origLoop, tileSizes))) return failure(); // Emit remarks for loop tiling. This is emitted before the original loop // is modified. However, it assumes that tiling will not fail. emitTilingRemarks(origLoop, tileSizes); LLVM_DEBUG(llvm::dbgs() << "\nBefore tiling:\n" << *origLoop << "\n"); // Clear tile operands from origLoop rewriter.startOpModification(origLoop); origLoop.getTileOperandsMutable().clear(); origLoop.removeTileOperandsSegmentsAttr(); origLoop.removeTileOperandsDeviceTypeAttr(); rewriter.finalizeOpModification(origLoop); SmallVector loopsToTile; if (collapseCount < tileCount) { // Uncollapse tile loops before tiling if necessary loopsToTile = acc::uncollapseLoops(origLoop, tileCount, collapseCount, rewriter); rewriter.replaceOp(origLoop, loopsToTile[0]); LLVM_DEBUG(llvm::dbgs() << "\nAfter uncollapsing:\n" << *loopsToTile[0] << "\n"); } else { loopsToTile.push_back(origLoop); } // loopsToTile is a vector of perfectly nested loops. The outermost loop // may have multiple IVs but inner loops can only have one IV. // The utility handles unknown tile sizes (*) by using `defaultTileSize`. 
acc::tileACCLoops(loopsToTile, tileSizes, defaultTileSize, rewriter); LLVM_DEBUG(llvm::dbgs() << "\nAfter tiling:\n " << *loopsToTile[0] << "\n"); return success(); } private: int32_t defaultTileSize; acc::OpenACCSupport &accSupport; }; class ACCLoopTiling : public acc::impl::ACCLoopTilingBase { public: using ACCLoopTilingBase::ACCLoopTilingBase; void runOnOperation() override { func::FuncOp funcOp = getOperation(); MLIRContext *context = funcOp.getContext(); acc::OpenACCSupport &accSupport = getAnalysis(); RewritePatternSet patterns(context); patterns.insert(context, defaultTileSize, accSupport); GreedyRewriteConfig grc; grc.setUseTopDownTraversal(true); grc.setMaxIterations(1); (void)applyPatternsGreedily(funcOp, std::move(patterns), grc); } }; } // namespace