This change does a bunch of renaming to clear up confusions in these files. In particular, this change: * Renames variables and methods to clarify the "dim"/"lvl" distinction, and changes them to use the `Dimension`/`Level` types as appropriate. * Introduces new typedefs * `ExprId`, `LatPointId`, `LatSetId`: to clarify the interning design of the Merger. * `LoopId`, `LoopOrd`: to clarify the distinction between arbitrary names for loop-variables, vs numeric identifiers based on the actual order of loop generation. * `TensorId` * (Future CLs will change these from typedefs to structs/classes, so that the typechecker can help avoid mixups.) * Updates documentation to match the new terminology * Adds additional assertions * Adds `const` to local variables along the way Reviewed By: aartbik Differential Revision: https://reviews.llvm.org/D145756
413 lines
17 KiB
C++
413 lines
17 KiB
C++
//===- LoopEmitter.h --------------------------------------------*- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_
|
|
#define MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_
|
|
|
|
#include <vector>
|
|
|
|
#include "mlir/Dialect/SparseTensor/IR/Enums.h"
|
|
#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
|
|
#include "mlir/Dialect/SparseTensor/Utils/Merger.h"
|
|
#include "mlir/IR/PatternMatch.h"
|
|
|
|
namespace mlir {
|
|
namespace sparse_tensor {
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
/// The position of a loop in the loop-stack, or the position of a
|
|
/// `LoopId` in a topologically-sorted list of `LoopId`s.
|
|
///
|
|
/// Although this type may have the same cardinality as `LoopId`, it must
|
|
/// not be confused with that type. The `LoopId` type is used by the `Merger`
|
|
/// as a unique identifier for loop-variables, regardless of the ordering
|
|
/// of those loops. Whereas the `LoopOrd` type is used by the `LoopEmitter`
|
|
/// (and `CodegenEnv`) to refer to the actual order in which loops are
|
|
/// generated.
|
|
///
|
|
/// TODO: further explicate the correspondences between these various
|
|
/// types. In particular, since the `$dim` argument to `linalg::IndexOp`
|
|
/// is a De Bruijn index, it seems like that should correspond to `LoopOrd`,
|
|
/// and yet the `Merger` has that correspond with `LoopId` instead.
|
|
/// In addition `LoopEmitter::genAffine` has `AffineDimExpr::position`
|
|
/// correspond to `LoopId`, however it is unclear what the providence
|
|
/// of those `AffineDimExpr` is.
|
|
//
|
|
// TODO: use a struct/class rather than a typedef, so that we can actually
|
|
// typecheck this to avoid mixups in the code.
|
|
using LoopOrd = unsigned;
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// SparseTensorLoopEmiter class, manages sparse tensors and helps to
|
|
// generate loop structure to (co)-iterate sparse tensors.
|
|
//
|
|
// An example usage:
|
|
// To generate the following loops over T1<?x?> and T2<?x?>
|
|
//
|
|
// for i in TENSOR_1_0 {
|
|
// for j : TENSOR_2_0 {
|
|
// for k : TENSOR_1_1 {}
|
|
// for k : TENSOR_2_1 {}
|
|
// }
|
|
// }
|
|
//
|
|
// One can use
|
|
//
|
|
// LoopEmiter loopEmiter({T1, T1});
|
|
// loopEmiter.initializeLoopEmit();
|
|
// loopEmiter.enterLoopOverTensorAtLvl(T1, 0);
|
|
// loopEmiter.enterLoopOverTensorAtLvl(T2, 0);
|
|
// loopEmiter.enterLoopOverTensorAtLvl(T1, 1);
|
|
// loopEmiter.exitCurrentLoop();
|
|
// loopEmiter.enterLoopOverTensorAtLvl(T2, 1);
|
|
// loopEmiter.exitCurrentLoop(); // exit k
|
|
// loopEmiter.exitCurrentLoop(); // exit j
|
|
// loopEmiter.exitCurrentLoop(); // exit i
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
class LoopEmitter {
|
|
public:
|
|
/// Optional callback function to setup dense output tensors when
|
|
/// initializing the loop emitter (e.g., to fill a dense output with zeros).
|
|
using OutputUpdater = function_ref<Value(OpBuilder &builder, Location loc,
|
|
Value memref, Value tensor)>;
|
|
|
|
LoopEmitter() = default;
|
|
|
|
/// Takes an array of input tensors, which the generated loops will
|
|
/// iterate over. Each tensor is given a `TensorId` (numerically equal
|
|
/// to the position of that tensor `Value` in the array). Setting
|
|
/// `isSparseOut` indicates that the sparse output tensor is empty,
|
|
/// so the loop emitter will generate loops over it according to the
|
|
/// level-sizes. The `topSort` array specifies the actual order in
|
|
/// which loops are generated, thus providing a mapping from `LoopOrd`
|
|
/// to `LoopId`.
|
|
void initialize(ValueRange tensors, StringAttr loopTag = nullptr,
|
|
bool hasOutput = false, bool isSparseOut = false,
|
|
ArrayRef<LoopId> topSort = {});
|
|
|
|
explicit LoopEmitter(ValueRange tensors, StringAttr loopTag = nullptr,
|
|
bool hasOutput = false, bool isSparseOut = false,
|
|
ArrayRef<LoopId> topSort = {});
|
|
|
|
/// Starts a loop emitting session by generating all the buffers needed
|
|
/// for iterating over the tensors.
|
|
void initializeLoopEmit(OpBuilder &builder, Location loc,
|
|
OutputUpdater updater = nullptr);
|
|
|
|
/// Generates code to compute an affine expression whose variables are
|
|
/// `LoopId`s (i.e., `a.cast<AffineDimExpr>().getPosition()` is a valid
|
|
/// `LoopId`).
|
|
Value genAffine(OpBuilder &builder, Location loc, AffineExpr a);
|
|
|
|
/// Enters a new loop sequence, the loops within the same sequence starts
|
|
/// from the break points of previous loop instead of starting over from 0.
|
|
/// e.g.,
|
|
/// {
|
|
/// // loop sequence start.
|
|
/// p0 = while(xxx)
|
|
/// ...
|
|
/// break p0
|
|
///
|
|
/// // Starts loop from p0
|
|
/// for (i = p0; i < end; i++)
|
|
/// ...
|
|
/// // loop sequence end.
|
|
/// }
|
|
void enterNewLoopSeq(OpBuilder &builder, Location loc,
|
|
ArrayRef<TensorId> tids, ArrayRef<Level> lvls);
|
|
|
|
/// Exits the current loop sequence, this will reset universal index to 0.
|
|
void exitCurrentLoopSeq() {
|
|
assert(loopSeqStack.size() == loopStack.size() + 1);
|
|
loopSeqStack.pop_back();
|
|
}
|
|
|
|
// TODO: Get rid of `lvls` in the argument list? Track the level we
|
|
// are currently at internally. Then it would be enterNextLvlForTensor.
|
|
// Still need a way to specify the lvl for non-annotated tensors though,
|
|
// as those can be accessed out of order.
|
|
//
|
|
/// Emits loop over tensor_tid_lvl, it assumes that loops between
|
|
/// tensor_tid_[0, lvl - 1] have already been generated.
|
|
/// The function will also perform in-place update on the `reduc` vector to
|
|
/// return the reduction variable used inside the generated loop.
|
|
Operation *enterLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
|
|
ArrayRef<TensorId> tids,
|
|
ArrayRef<Level> lvls,
|
|
MutableArrayRef<Value> reduc = {},
|
|
bool isParallel = false);
|
|
|
|
Operation *enterFilterLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
|
|
TensorId tid, Level lvl,
|
|
AffineExpr affine,
|
|
MutableArrayRef<Value> reduc = {});
|
|
|
|
void genDenseAffineAddress(OpBuilder &builder, Location loc, TensorId tid,
|
|
Level lvl, AffineExpr lvlExpr);
|
|
|
|
/// Emits a co-iteration loop over a set of tensors.
|
|
Operation *enterCoIterationOverTensorsAtLvls(
|
|
OpBuilder &builder, Location loc, ArrayRef<TensorId> tids,
|
|
ArrayRef<Level> lvls, bool needsUniv, MutableArrayRef<Value> reduc = {});
|
|
|
|
void exitCurrentLoop(RewriterBase &rewriter, Location loc,
|
|
MutableArrayRef<Value> reduc = {});
|
|
|
|
/// Fills the out-parameter with the loop induction variables for all
|
|
/// loops in the current loop-stack. The variables are given in the
|
|
/// same order as the loop-stack, hence `ivs` should be indexed into
|
|
/// by `LoopOrd` (not `LoopId`).
|
|
void getLoopIVs(SmallVectorImpl<Value> &ivs) const {
|
|
ivs.clear();
|
|
ivs.reserve(getCurrentDepth());
|
|
for (auto &l : loopStack)
|
|
ivs.push_back(l.iv);
|
|
}
|
|
|
|
/// Gets the current depth of the loop-stack. The result is given
|
|
/// the type `LoopOrd` for the same reason as one-past-the-end iterators.
|
|
LoopOrd getCurrentDepth() const { return loopStack.size(); }
|
|
|
|
/// Gets loop induction variable for the given `LoopOrd`.
|
|
Value getLoopIV(LoopOrd n) const {
|
|
return n < getCurrentDepth() ? loopStack[n].iv : Value();
|
|
}
|
|
|
|
///
|
|
/// Getters.
|
|
///
|
|
const std::vector<std::vector<Value>> &getPosits() const { return posits; };
|
|
const std::vector<std::vector<Value>> &getCoords() const { return coords; };
|
|
const std::vector<std::vector<Value>> &getHighs() const { return highs; };
|
|
const std::vector<std::vector<Value>> &getPositionBuffers() const {
|
|
return positionsBuffers;
|
|
};
|
|
const std::vector<std::vector<Value>> &getCoordinateBuffers() const {
|
|
return coordinatesBuffers;
|
|
};
|
|
const std::vector<Value> &getValBuffer() const { return valBuffer; };
|
|
|
|
constexpr static llvm::StringLiteral getLoopEmitterLoopAttrName() {
|
|
return llvm::StringLiteral("Emitted from");
|
|
}
|
|
|
|
private:
|
|
struct LoopInfo {
|
|
LoopInfo(ArrayRef<TensorId> tids, ArrayRef<Level> lvls, Operation *loop,
|
|
Block *userBlock, Value iv, StringAttr loopTag)
|
|
: tids(tids), lvls(lvls), loop(loop), userCodeBlock(userBlock), iv(iv) {
|
|
// Attached a special tag to loop emitter generated loop.
|
|
if (loopTag)
|
|
loop->setAttr(LoopEmitter::getLoopEmitterLoopAttrName(), loopTag);
|
|
}
|
|
// TODO: maybe use a vector<pair> for tid and lvl?
|
|
// (Better yet, compress them together a la `TensorLoopId`.)
|
|
// The set of tensors that the loop is operating on
|
|
const llvm::SmallVector<TensorId> tids;
|
|
// The corresponding levels for the tensors
|
|
const llvm::SmallVector<Level> lvls;
|
|
const Operation *loop; // the loop operation
|
|
Block *const userCodeBlock; // the block holding users' generated code.
|
|
const Value iv; // the induction variable for the loop
|
|
};
|
|
|
|
/// Linearizes address for dense level (i.e., p = (i * d0) + j).
|
|
Value genAddress(OpBuilder &builder, Location loc, TensorId tid, Level lvl,
|
|
Value iv);
|
|
|
|
/// Generates the segment high for a non-unique level (to fast forward
|
|
/// duplicated coordinates). That is, it generates the code:
|
|
///
|
|
/// crd = coordinates_tid_lvl[pos]
|
|
/// while (pos < pHi && coordinates_tid_lvl[pos] == crd)
|
|
/// pos++;
|
|
/// <return pos>;
|
|
Value genSegmentHigh(OpBuilder &builder, Location loc, TensorId tid,
|
|
Level lvl, Value pos, Value pHi);
|
|
|
|
/// Generates instructions to compute the coordinate of tensors[tid][lvl]
|
|
/// under the current loop context. The final argument is the
|
|
/// collapsed-output level, whereas this function handles converting
|
|
/// that to the uncollapsed-input level
|
|
Value genSparseCrd(OpBuilder &builder, Location loc, TensorId tid,
|
|
Level dstLvl);
|
|
|
|
/// Generates a predicate to determine whether the tranformed coordinates are
|
|
/// in the given slice.
|
|
/// Returns std::pair<Transformed coordinates, Predicate>
|
|
std::pair<Value, Value> genSliceLegitPredicate(OpBuilder &builder,
|
|
Location loc, Value crd,
|
|
TensorId tid, Level lvl);
|
|
|
|
TensorId getNumTensors() const { return tensors.size(); }
|
|
|
|
bool isOutputTensor(TensorId tid) const {
|
|
return hasOutput && tid == static_cast<TensorId>(getNumTensors() - 1);
|
|
}
|
|
|
|
bool isSparseOutput(TensorId tid) const {
|
|
return isOutputTensor(tid) && isSparseOut;
|
|
}
|
|
|
|
/// Prepares loop for iterating over `tensor[lvl]`, under the assumption
|
|
/// that `tensor[0...lvl-1]` loops have already been set up.
|
|
void prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
|
|
TensorId tid, Level lvl);
|
|
|
|
/// Emits extra locals, since the locals might not be in simplified lattices
|
|
/// point used to generate the loops, but are still required to generate
|
|
/// expressions.
|
|
void emitExtraLocalsForTensorsAtDenseLvls(OpBuilder &builder, Location loc,
|
|
ArrayRef<TensorId> tids,
|
|
ArrayRef<Level> lvls);
|
|
|
|
/// Exits a for loop, returns the reduction results, e.g.,
|
|
/// For sequential for loops:
|
|
/// %ret = for () {
|
|
/// ...
|
|
/// %val = addi %args, %c
|
|
/// yield %val
|
|
/// }
|
|
/// For parallel loops, the following generated code by users:
|
|
/// %ret = parallel () init(%args) {
|
|
/// ...
|
|
/// %val = op %args, %c
|
|
/// }
|
|
/// will be transformed into
|
|
/// %ret = parallel () init(%args) {
|
|
/// ...
|
|
/// scf.reduce(%c) bb0(%0, %1){
|
|
/// %val = op %0, %1
|
|
/// scf.reduce.return %val
|
|
/// }
|
|
/// }
|
|
/// NOTE: only one instruction will be moved into reduce block,
|
|
/// transformation will fail if multiple instructions are used to compute
|
|
/// the reduction value. Return %ret to user, while %val is provided by
|
|
/// users (`reduc`).
|
|
void exitForLoop(RewriterBase &rewriter, Location loc,
|
|
MutableArrayRef<Value> reduc);
|
|
|
|
/// Exits a while loop, returns the reduction results.
|
|
void exitCoIterationLoop(OpBuilder &builder, Location loc,
|
|
MutableArrayRef<Value> reduc);
|
|
|
|
//
|
|
// View-based-reshape methods.
|
|
//
|
|
|
|
/// Get the collapse reassociation for `tensors[tid][dstLvl]`.
|
|
/// For unreshaped operands, the reassociation is simply an identity
|
|
/// transformation.
|
|
///
|
|
/// NOTE: the result uses `Level` rather than the `int64_t` of
|
|
/// `ReassociationIndices`, since the former gives clarity to what
|
|
/// the values actually mean.
|
|
///
|
|
/// TODO: why not do this computation when we first store the reassoc,
|
|
/// instead of doing it every time we look it up?
|
|
SmallVector<Level, 2> getCollapseReassociation(TensorId tid, Level dstLvl) {
|
|
assert(tid < getNumTensors() && "Invalid TensorId");
|
|
assert(collapseReassoc.size() == getNumTensors());
|
|
if (const auto reassoc = collapseReassoc[tid]) {
|
|
// TODO: store the dstLvlRank in the LoopEmitter so that we can
|
|
// check `dstLvl < dstLvlRank` at the top; and only here need to
|
|
// assert that `reassoc.size() == dstLvlRank`.
|
|
assert(dstLvl < reassoc.size() && "Level is out-of-bounds");
|
|
const auto srcLvls = reassoc[dstLvl].cast<ArrayAttr>();
|
|
return llvm::to_vector<2>(
|
|
llvm::map_range(srcLvls, [&](Attribute srcLvl) -> Level {
|
|
// TODO: replace this with the converter for `LevelAttr`.
|
|
return srcLvl.cast<IntegerAttr>().getValue().getZExtValue();
|
|
}));
|
|
}
|
|
return {dstLvl};
|
|
}
|
|
|
|
/// A optional string attribute that should be attached to the loop
|
|
/// generated by loop emitter, it might help following passes to identify
|
|
/// loops that operates on sparse tensors more easily.
|
|
StringAttr loopTag;
|
|
/// Whether the loop emitter needs to treat the last tensor as the output
|
|
/// tensor.
|
|
bool hasOutput;
|
|
bool isSparseOut;
|
|
|
|
//
|
|
// Fields which have `numTensor` many entries.
|
|
//
|
|
// TODO: switch to an AOS style to avoid any possible mismatches.
|
|
//
|
|
|
|
/// Input and (optional) output tensors.
|
|
std::vector<Value> tensors;
|
|
/// Level-types for each `(TensorId, Level)` pair.
|
|
std::vector<std::vector<DimLevelType>> lvlTypes;
|
|
// Sparse iteration information for each `(TensorId, Level)` pair.
|
|
// These arrays are updated to remain current within the current loop.
|
|
// TODO: Clarify which of these are indexed by dstLvl vs srcLvl.
|
|
//
|
|
/// The collection of positions for a given element (one such collection
|
|
/// for each tensor). This is the position analogue of the "coords"
|
|
/// naming convention.
|
|
///
|
|
/// FIXME: [CLARIFY_POSITS_LVL] It's unclear which levels are used
|
|
/// to index the `posits` array. On the one hand `genSparseCrd`
|
|
/// uses dstLvl; on the other hand `enterLoopOverTensorAtLvl`,
|
|
/// `prepareLoopOverTensorAtLvl`, and `enterCoIterationOverTensorsAtLvls`
|
|
/// uses srcLvl. So which is it?
|
|
std::vector<std::vector<Value>> posits;
|
|
/// The collection of coordinates for a given element (one such
|
|
/// collection for each tensor).
|
|
std::vector<std::vector<Value>> coords;
|
|
// The segment upper bound for non-uniques level after de-duplication.
|
|
std::vector<std::vector<Value>> segHi;
|
|
std::vector<std::vector<Value>> highs;
|
|
std::vector<std::vector<Value>> lvlSizes;
|
|
std::vector<std::vector<Value>> positionsBuffers; // to_positions
|
|
std::vector<std::vector<Value>> coordinatesBuffers; // to_coordinates
|
|
std::vector<Value> valBuffer; // to_value
|
|
|
|
/// Whether the sparse input is a slice.
|
|
std::vector<bool> isSparseSlices;
|
|
/// Values related to slices.
|
|
std::vector<std::vector<Value>> sliceOffsets;
|
|
std::vector<std::vector<Value>> sliceStrides;
|
|
|
|
/// Collapse Reassociations related to a specific tensor
|
|
// TODO: support expand.
|
|
std::vector<ArrayAttr> collapseReassoc;
|
|
|
|
/// TODO: not yet used, it should track the current level for each tensor
|
|
/// to help eliminate `lvls` paramters from above APIs.
|
|
/// std::vector<Level> curLvl;
|
|
|
|
//
|
|
// Fields which have at most `numLoops` many entries.
|
|
//
|
|
|
|
/// Loop Stack, stores the information of all the nested loops that are
|
|
/// alive.
|
|
std::vector<LoopInfo> loopStack;
|
|
|
|
/// Loop Sequence Stack, stores the universal index for the current loop
|
|
/// sequence.
|
|
std::vector<Value> loopSeqStack;
|
|
|
|
/// Maps `LoopId` (used by `AffineDimExpr`) to `LoopOrd` (in the `loopStack`).
|
|
/// TODO: We should probably use a callback function here to make it more
|
|
/// general.
|
|
std::vector<LoopOrd> loopIdToOrd;
|
|
};
|
|
|
|
} // namespace sparse_tensor
|
|
} // namespace mlir
|
|
|
|
#endif // MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_
|