The OpenACC remark emission utilities previously only accepted Twine for message construction. However, complex remarks often require additional logic to build messages, such as resolving variable names. This results in unnecessary work when remarks are disabled. Add an overload that accepts a lambda for message generation, which is only invoked when remark emission is enabled. Update ACCLoopTiling to use this lazy API for tile size reporting. Additionally, getVariableName now returns numeric strings for constant integer values. This is also being used by ACCLoopTiling along with the lazy remark update.
230 lines
8.5 KiB
C++
230 lines
8.5 KiB
C++
//===- ACCLoopTiling.cpp - Tile ACC Loops ---------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This pass implements the OpenACC loop tiling transformation for acc.loop
|
|
// operations that have the tile clause (OpenACC 3.4 spec, section 2.9.8).
|
|
//
|
|
// Overview:
|
|
// ---------
|
|
// The tile clause specifies that the iterations of the associated loops should
|
|
// be divided into tiles (rectangular blocks). This pass transforms a single
|
|
// or nested acc.loop with tile clauses into a structure of "tile loops"
|
|
// (iterating over tiles) containing "element loops" (iterating within tiles).
|
|
//
|
|
// For example, tiling a 2-level nested loop with tile(T1, T2) produces:
|
|
//
|
|
// // Before tiling:
|
|
// acc.loop tile(T1, T2) control(%i, %j) = (lb1, lb2) to (ub1, ub2) step (s1,
|
|
// s2)
|
|
//
|
|
// // After tiling:
|
|
// acc.loop control(%i) = (lb1) to (ub1) step (s1*T1) { // tile loop 1
|
|
// acc.loop control(%j) = (lb2) to (ub2) step (s2*T2) { // tile loop 2
|
|
// acc.loop control(%ii) = (%i) to (min(ub1, %i+s1*T1)) step (s1) { //
|
|
// element 1
|
|
// acc.loop control(%jj) = (%j) to (min(ub2, %j+s2*T2)) step (s2) { //
|
|
// element 2
|
|
// // loop body using %ii, %jj
|
|
// }
|
|
// }
|
|
// }
|
|
// }
|
|
//
|
|
// Gang/worker/vector attributes are distributed as follows:
|
|
// - gang: applied to tile loops
|
|
// - vector: applied to element loops
|
|
// - worker: removed from inner loops
|
|
//
|
|
// Unknown Tile Sizes:
|
|
// -------------------
|
|
// The OpenACC tile(*) syntax indicates an implementation-defined tile size.
|
|
// In the IR, this is represented as -1. The pass resolves these to the
|
|
// default tile size (configurable via pass option).
|
|
//
|
|
// Requirements:
|
|
// -------------
|
|
// 1. The pass uses the OpenACCSupport analysis for remark and NYI (not yet
|
|
// implemented) emission. Custom implementations can be registered via
|
|
// setImplementation() to provide pipeline-specific handling.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "mlir/Dialect/Func/IR/FuncOps.h"
|
|
#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
|
|
#include "mlir/Dialect/OpenACC/OpenACC.h"
|
|
#include "mlir/Dialect/OpenACC/OpenACCUtilsTiling.h"
|
|
#include "mlir/Dialect/OpenACC/Transforms/Passes.h"
|
|
#include "mlir/IR/BuiltinAttributes.h"
|
|
#include "mlir/IR/PatternMatch.h"
|
|
#include "mlir/Support/LLVM.h"
|
|
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
|
#include "llvm/ADT/StringExtras.h"
|
|
#include "llvm/Support/Debug.h"
|
|
|
|
namespace mlir {
|
|
namespace acc {
|
|
#define GEN_PASS_DEF_ACCLOOPTILING
|
|
#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
|
|
} // namespace acc
|
|
} // namespace mlir
|
|
|
|
#define DEBUG_TYPE "acc-loop-tile"
|
|
|
|
namespace {
|
|
using namespace mlir;
|
|
|
|
/// Rewrite pattern that tiles an `acc.loop` carrying tile operands.
///
/// The pattern validates the tile size types, reports what it is about to do
/// through the OpenACCSupport analysis, strips the tile operands from the
/// original loop, uncollapses the loop nest when the tile count exceeds the
/// collapse count, and finally delegates the actual transformation to
/// `acc::tileACCLoops`.
struct ACCLoopTilingImpl : public OpRewritePattern<acc::LoopOp> {
  /// \param context         MLIR context the pattern is registered in.
  /// \param defaultTileSize Tile size substituted for unknown (`*`) sizes.
  /// \param accSupport      Analysis used to emit remarks and NYI diagnostics;
  ///                        held by reference, so it must outlive the pattern.
  ACCLoopTilingImpl(MLIRContext *context, int32_t defaultTileSize,
                    acc::OpenACCSupport &accSupport)
      : OpRewritePattern<acc::LoopOp>(context),
        defaultTileSize(defaultTileSize), accSupport(accSupport) {}

  /// Check that no tile size has a wider integer type than its loop IV.
  ///
  /// The comparison is only made when both types are IntegerType. For
  /// IndexType, the width is target-dependent and the casting utility will
  /// handle it correctly. Tile sizes that are negative constants (the
  /// representation of an unknown `*` size) are skipped because they will be
  /// created with the correct type.
  ///
  /// \returns failure() after emitting an NYI diagnostic on the first
  ///          offending tile size, success() otherwise.
  LogicalResult checkTileSizeTypes(acc::LoopOp loop,
                                   ArrayRef<Value> tileSizes) const {
    auto ivTypes = loop.getBody().getArgumentTypes();
    // Only positions that have both a tile size and an IV are compared.
    for (size_t i = 0; i < tileSizes.size() && i < ivTypes.size(); ++i) {
      Type tileType = tileSizes[i].getType();
      Type ivType = ivTypes[i];

      // Skip unknown tile sizes (will be created with correct type)
      auto constVal = getConstantIntValue(tileSizes[i]);
      if (constVal && *constVal < 0)
        continue;

      // Only compare when both are integer types (not index)
      auto tileIntType = dyn_cast<IntegerType>(tileType);
      auto ivIntType = dyn_cast<IntegerType>(ivType);
      if (tileIntType && ivIntType) {
        if (tileIntType.getWidth() > ivIntType.getWidth()) {
          accSupport.emitNYI(loop.getLoc(),
                             "tile size type (i" +
                                 std::to_string(tileIntType.getWidth()) +
                                 ") is wider than loop IV type (i" +
                                 std::to_string(ivIntType.getWidth()) + ")");
          return failure();
        }
      }
    }
    return success();
  }

  /// Emit the remarks describing the tiling about to be applied to `loop`.
  ///
  /// One remark summarizes the nest depth and the tile sizes, and one extra
  /// remark is emitted per unknown (`*`) tile size to report the default that
  /// is picked. Messages are produced by lambdas handed to
  /// `OpenACCSupport::emitRemark`, so the name lookups and string building
  /// live inside the callbacks rather than on the caller's path.
  void emitTilingRemarks(acc::LoopOp loop, ArrayRef<Value> tileSizes) const {
    // Emit remarks for loop tiling
    accSupport.emitRemark(
        loop,
        [&]() {
          auto getTileSizeStr = [&](Value v) -> std::string {
            std::string name = accSupport.getVariableName(v);
            // Use "*" for unknown tile sizes (represented as -1 or empty)
            if (name.empty() || name == "-1")
              return "*";
            return name;
          };
          SmallVector<std::string> tileStrs;
          for (Value v : tileSizes)
            tileStrs.push_back(getTileSizeStr(v));
          return "Tiling " + std::to_string(tileSizes.size()) +
                 "-level loop nest with tile(" + llvm::join(tileStrs, ",") +
                 ")";
        },
        DEBUG_TYPE);

    // Emit remarks for unknown tile sizes that will be resolved to default
    // TODO: Need to base the default tile size on some heuristics.
    for (Value tileSize : tileSizes) {
      std::optional<int64_t> val = getConstantIntValue(tileSize);
      if (val && *val < 0) {
        accSupport.emitRemark(
            loop,
            [&]() {
              return "Picking default tile size " +
                     std::to_string(defaultTileSize) +
                     " for unknown tile size '*'";
            },
            DEBUG_TYPE);
      }
    }
  }

  /// Tile `origLoop` if it carries tile operands.
  ///
  /// \returns failure() when the loop has no tile values (nothing to do, IR
  ///          untouched) or when the tile size type check fails; success()
  ///          once the loop has been rewritten.
  LogicalResult matchAndRewrite(acc::LoopOp origLoop,
                                PatternRewriter &rewriter) const override {

    // A loop without tile operands is not a match. The pattern contract
    // requires failure() whenever the IR is left unmodified; reporting
    // success here would make the driver believe a rewrite happened.
    if (origLoop.getTileValues().empty())
      return failure();

    SmallVector<Value> tileSizes(origLoop.getTileValues().begin(),
                                 origLoop.getTileValues().end());
    unsigned tileCount = tileSizes.size();
    unsigned collapseCount = origLoop.getCollapseValue().value_or(1);

    // Sanity check tile size types
    if (failed(checkTileSizeTypes(origLoop, tileSizes)))
      return failure();

    // Emit remarks for loop tiling. This is emitted before the original loop
    // is modified. However, it assumes that tiling will not fail.
    emitTilingRemarks(origLoop, tileSizes);

    LLVM_DEBUG(llvm::dbgs() << "\nBefore tiling:\n" << *origLoop << "\n");

    // Clear tile operands from origLoop. The values were copied into
    // `tileSizes` above, so they remain available for the tiling utility.
    rewriter.startOpModification(origLoop);
    origLoop.getTileOperandsMutable().clear();
    origLoop.removeTileOperandsSegmentsAttr();
    origLoop.removeTileOperandsDeviceTypeAttr();
    rewriter.finalizeOpModification(origLoop);

    SmallVector<acc::LoopOp> loopsToTile;
    if (collapseCount < tileCount) {
      // Uncollapse tile loops before tiling if necessary
      loopsToTile =
          acc::uncollapseLoops(origLoop, tileCount, collapseCount, rewriter);
      rewriter.replaceOp(origLoop, loopsToTile[0]);
      LLVM_DEBUG(llvm::dbgs() << "\nAfter uncollapsing:\n"
                              << *loopsToTile[0] << "\n");
    } else {
      loopsToTile.push_back(origLoop);
    }

    // loopsToTile is a vector of perfectly nested loops. The outermost loop
    // may have multiple IVs but inner loops can only have one IV.
    // The utility handles unknown tile sizes (*) by using `defaultTileSize`.
    acc::tileACCLoops(loopsToTile, tileSizes, defaultTileSize, rewriter);

    LLVM_DEBUG(llvm::dbgs() << "\nAfter tiling:\n " << *loopsToTile[0] << "\n");
    return success();
  }

private:
  /// Tile size used in place of unknown (`*`) tile sizes.
  int32_t defaultTileSize;
  /// Analysis providing remark / NYI emission and variable name lookup.
  acc::OpenACCSupport &accSupport;
};
|
|
|
|
/// Function-level pass that applies ACCLoopTilingImpl to every `acc.loop`
/// in the function using a single top-down sweep of the greedy driver.
class ACCLoopTiling : public acc::impl::ACCLoopTilingBase<ACCLoopTiling> {
public:
  using ACCLoopTilingBase<ACCLoopTiling>::ACCLoopTilingBase;

  void runOnOperation() override {
    func::FuncOp func = getOperation();
    MLIRContext *ctx = func.getContext();

    // Visit loops outermost-first and limit the driver to one iteration.
    GreedyRewriteConfig config;
    config.setUseTopDownTraversal(true).setMaxIterations(1);

    // The pattern reports remarks and NYI diagnostics through this analysis.
    acc::OpenACCSupport &support = getAnalysis<acc::OpenACCSupport>();

    RewritePatternSet tilingPatterns(ctx);
    tilingPatterns.insert<ACCLoopTilingImpl>(ctx, defaultTileSize, support);

    // The driver's result is intentionally discarded.
    (void)applyPatternsGreedily(func, std::move(tilingPatterns), config);
  }
};
|
|
|
|
} // namespace
|