//===- LoopEmitter.h --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_
#define MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_

#include <cassert>
#include <utility>
#include <vector>

#include "mlir/Dialect/SparseTensor/IR/Enums.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
#include "mlir/Dialect/SparseTensor/Utils/Merger.h"
#include "mlir/IR/PatternMatch.h"

namespace mlir {
namespace sparse_tensor {

//===----------------------------------------------------------------------===//
/// The position of a loop in the loop-stack, or the position of a
/// `LoopId` in a topologically-sorted list of `LoopId`s.
///
/// Although this type may have the same cardinality as `LoopId`, it must
/// not be confused with that type.  The `LoopId` type is used by the `Merger`
/// as a unique identifier for loop-variables, regardless of the ordering
/// of those loops.  Whereas the `LoopOrd` type is used by the `LoopEmitter`
/// (and `CodegenEnv`) to refer to the actual order in which loops are
/// generated.
///
/// TODO: further explicate the correspondences between these various
/// types.  In particular, since the `$dim` argument to `linalg::IndexOp`
/// is a De Bruijn index, it seems like that should correspond to `LoopOrd`,
/// and yet the `Merger` has that correspond with `LoopId` instead.
/// In addition `LoopEmitter::genAffine` has `AffineDimExpr::position`
/// correspond to `LoopId`, however it is unclear what the provenance
/// of those `AffineDimExpr` is.
//
// TODO: use a struct/class rather than a typedef, so that we can actually
// typecheck this to avoid mixups in the code.
using LoopOrd = unsigned; //===----------------------------------------------------------------------===// // SparseTensorLoopEmiter class, manages sparse tensors and helps to // generate loop structure to (co)-iterate sparse tensors. // // An example usage: // To generate the following loops over T1 and T2 // // for i in TENSOR_1_0 { // for j : TENSOR_2_0 { // for k : TENSOR_1_1 {} // for k : TENSOR_2_1 {} // } // } // // One can use // // LoopEmiter loopEmiter({T1, T1}); // loopEmiter.initializeLoopEmit(); // loopEmiter.enterLoopOverTensorAtLvl(T1, 0); // loopEmiter.enterLoopOverTensorAtLvl(T2, 0); // loopEmiter.enterLoopOverTensorAtLvl(T1, 1); // loopEmiter.exitCurrentLoop(); // loopEmiter.enterLoopOverTensorAtLvl(T2, 1); // loopEmiter.exitCurrentLoop(); // exit k // loopEmiter.exitCurrentLoop(); // exit j // loopEmiter.exitCurrentLoop(); // exit i //===----------------------------------------------------------------------===// class LoopEmitter { public: /// Optional callback function to setup dense output tensors when /// initializing the loop emitter (e.g., to fill a dense output with zeros). using OutputUpdater = function_ref; LoopEmitter() = default; /// Takes an array of input tensors, which the generated loops will /// iterate over. Each tensor is given a `TensorId` (numerically equal /// to the position of that tensor `Value` in the array). Setting /// `isSparseOut` indicates that the sparse output tensor is empty, /// so the loop emitter will generate loops over it according to the /// level-sizes. The `topSort` array specifies the actual order in /// which loops are generated, thus providing a mapping from `LoopOrd` /// to `LoopId`. 
void initialize(ValueRange tensors, StringAttr loopTag = nullptr, bool hasOutput = false, bool isSparseOut = false, ArrayRef topSort = {}); explicit LoopEmitter(ValueRange tensors, StringAttr loopTag = nullptr, bool hasOutput = false, bool isSparseOut = false, ArrayRef topSort = {}); /// Starts a loop emitting session by generating all the buffers needed /// for iterating over the tensors. void initializeLoopEmit(OpBuilder &builder, Location loc, OutputUpdater updater = nullptr); /// Generates code to compute an affine expression whose variables are /// `LoopId`s (i.e., `a.cast().getPosition()` is a valid /// `LoopId`). Value genAffine(OpBuilder &builder, Location loc, AffineExpr a); /// Enters a new loop sequence, the loops within the same sequence starts /// from the break points of previous loop instead of starting over from 0. /// e.g., /// { /// // loop sequence start. /// p0 = while(xxx) /// ... /// break p0 /// /// // Starts loop from p0 /// for (i = p0; i < end; i++) /// ... /// // loop sequence end. /// } void enterNewLoopSeq(OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef lvls); /// Exits the current loop sequence, this will reset universal index to 0. void exitCurrentLoopSeq() { assert(loopSeqStack.size() == loopStack.size() + 1); loopSeqStack.pop_back(); } // TODO: Get rid of `lvls` in the argument list? Track the level we // are currently at internally. Then it would be enterNextLvlForTensor. // Still need a way to specify the lvl for non-annotated tensors though, // as those can be accessed out of order. // /// Emits loop over tensor_tid_lvl, it assumes that loops between /// tensor_tid_[0, lvl - 1] have already been generated. /// The function will also perform in-place update on the `reduc` vector to /// return the reduction variable used inside the generated loop. 
Operation *enterLoopOverTensorAtLvl(OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef lvls, MutableArrayRef reduc = {}, bool isParallel = false); Operation *enterFilterLoopOverTensorAtLvl(OpBuilder &builder, Location loc, TensorId tid, Level lvl, AffineExpr affine, MutableArrayRef reduc = {}); void genDenseAffineAddress(OpBuilder &builder, Location loc, TensorId tid, Level lvl, AffineExpr lvlExpr); /// Emits a co-iteration loop over a set of tensors. Operation *enterCoIterationOverTensorsAtLvls( OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef lvls, bool needsUniv, MutableArrayRef reduc = {}); void exitCurrentLoop(RewriterBase &rewriter, Location loc, MutableArrayRef reduc = {}); /// Fills the out-parameter with the loop induction variables for all /// loops in the current loop-stack. The variables are given in the /// same order as the loop-stack, hence `ivs` should be indexed into /// by `LoopOrd` (not `LoopId`). void getLoopIVs(SmallVectorImpl &ivs) const { ivs.clear(); ivs.reserve(getCurrentDepth()); for (auto &l : loopStack) ivs.push_back(l.iv); } /// Gets the current depth of the loop-stack. The result is given /// the type `LoopOrd` for the same reason as one-past-the-end iterators. LoopOrd getCurrentDepth() const { return loopStack.size(); } /// Gets loop induction variable for the given `LoopOrd`. Value getLoopIV(LoopOrd n) const { return n < getCurrentDepth() ? loopStack[n].iv : Value(); } /// /// Getters. 
/// const std::vector> &getPosits() const { return posits; }; const std::vector> &getCoords() const { return coords; }; const std::vector> &getHighs() const { return highs; }; const std::vector> &getPositionBuffers() const { return positionsBuffers; }; const std::vector> &getCoordinateBuffers() const { return coordinatesBuffers; }; const std::vector &getValBuffer() const { return valBuffer; }; constexpr static llvm::StringLiteral getLoopEmitterLoopAttrName() { return llvm::StringLiteral("Emitted from"); } private: struct LoopInfo { LoopInfo(ArrayRef tids, ArrayRef lvls, Operation *loop, Block *userBlock, Value iv, StringAttr loopTag) : tids(tids), lvls(lvls), loop(loop), userCodeBlock(userBlock), iv(iv) { // Attached a special tag to loop emitter generated loop. if (loopTag) loop->setAttr(LoopEmitter::getLoopEmitterLoopAttrName(), loopTag); } // TODO: maybe use a vector for tid and lvl? // (Better yet, compress them together a la `TensorLoopId`.) // The set of tensors that the loop is operating on const llvm::SmallVector tids; // The corresponding levels for the tensors const llvm::SmallVector lvls; const Operation *loop; // the loop operation Block *const userCodeBlock; // the block holding users' generated code. const Value iv; // the induction variable for the loop }; /// Linearizes address for dense level (i.e., p = (i * d0) + j). Value genAddress(OpBuilder &builder, Location loc, TensorId tid, Level lvl, Value iv); /// Generates the segment high for a non-unique level (to fast forward /// duplicated coordinates). That is, it generates the code: /// /// crd = coordinates_tid_lvl[pos] /// while (pos < pHi && coordinates_tid_lvl[pos] == crd) /// pos++; /// ; Value genSegmentHigh(OpBuilder &builder, Location loc, TensorId tid, Level lvl, Value pos, Value pHi); /// Generates instructions to compute the coordinate of tensors[tid][lvl] /// under the current loop context. 
The final argument is the /// collapsed-output level, whereas this function handles converting /// that to the uncollapsed-input level Value genSparseCrd(OpBuilder &builder, Location loc, TensorId tid, Level dstLvl); /// Generates a predicate to determine whether the tranformed coordinates are /// in the given slice. /// Returns std::pair std::pair genSliceLegitPredicate(OpBuilder &builder, Location loc, Value crd, TensorId tid, Level lvl); TensorId getNumTensors() const { return tensors.size(); } bool isOutputTensor(TensorId tid) const { return hasOutput && tid == static_cast(getNumTensors() - 1); } bool isSparseOutput(TensorId tid) const { return isOutputTensor(tid) && isSparseOut; } /// Prepares loop for iterating over `tensor[lvl]`, under the assumption /// that `tensor[0...lvl-1]` loops have already been set up. void prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc, TensorId tid, Level lvl); /// Emits extra locals, since the locals might not be in simplified lattices /// point used to generate the loops, but are still required to generate /// expressions. void emitExtraLocalsForTensorsAtDenseLvls(OpBuilder &builder, Location loc, ArrayRef tids, ArrayRef lvls); /// Exits a for loop, returns the reduction results, e.g., /// For sequential for loops: /// %ret = for () { /// ... /// %val = addi %args, %c /// yield %val /// } /// For parallel loops, the following generated code by users: /// %ret = parallel () init(%args) { /// ... /// %val = op %args, %c /// } /// will be transformed into /// %ret = parallel () init(%args) { /// ... /// scf.reduce(%c) bb0(%0, %1){ /// %val = op %0, %1 /// scf.reduce.return %val /// } /// } /// NOTE: only one instruction will be moved into reduce block, /// transformation will fail if multiple instructions are used to compute /// the reduction value. Return %ret to user, while %val is provided by /// users (`reduc`). 
void exitForLoop(RewriterBase &rewriter, Location loc, MutableArrayRef reduc); /// Exits a while loop, returns the reduction results. void exitCoIterationLoop(OpBuilder &builder, Location loc, MutableArrayRef reduc); // // View-based-reshape methods. // /// Get the collapse reassociation for `tensors[tid][dstLvl]`. /// For unreshaped operands, the reassociation is simply an identity /// transformation. /// /// NOTE: the result uses `Level` rather than the `int64_t` of /// `ReassociationIndices`, since the former gives clarity to what /// the values actually mean. /// /// TODO: why not do this computation when we first store the reassoc, /// instead of doing it every time we look it up? SmallVector getCollapseReassociation(TensorId tid, Level dstLvl) { assert(tid < getNumTensors() && "Invalid TensorId"); assert(collapseReassoc.size() == getNumTensors()); if (const auto reassoc = collapseReassoc[tid]) { // TODO: store the dstLvlRank in the LoopEmitter so that we can // check `dstLvl < dstLvlRank` at the top; and only here need to // assert that `reassoc.size() == dstLvlRank`. assert(dstLvl < reassoc.size() && "Level is out-of-bounds"); const auto srcLvls = reassoc[dstLvl].cast(); return llvm::to_vector<2>( llvm::map_range(srcLvls, [&](Attribute srcLvl) -> Level { // TODO: replace this with the converter for `LevelAttr`. return srcLvl.cast().getValue().getZExtValue(); })); } return {dstLvl}; } /// A optional string attribute that should be attached to the loop /// generated by loop emitter, it might help following passes to identify /// loops that operates on sparse tensors more easily. StringAttr loopTag; /// Whether the loop emitter needs to treat the last tensor as the output /// tensor. bool hasOutput; bool isSparseOut; // // Fields which have `numTensor` many entries. // // TODO: switch to an AOS style to avoid any possible mismatches. // /// Input and (optional) output tensors. std::vector tensors; /// Level-types for each `(TensorId, Level)` pair. 
std::vector> lvlTypes; // Sparse iteration information for each `(TensorId, Level)` pair. // These arrays are updated to remain current within the current loop. // TODO: Clarify which of these are indexed by dstLvl vs srcLvl. // /// The collection of positions for a given element (one such collection /// for each tensor). This is the position analogue of the "coords" /// naming convention. /// /// FIXME: [CLARIFY_POSITS_LVL] It's unclear which levels are used /// to index the `posits` array. On the one hand `genSparseCrd` /// uses dstLvl; on the other hand `enterLoopOverTensorAtLvl`, /// `prepareLoopOverTensorAtLvl`, and `enterCoIterationOverTensorsAtLvls` /// uses srcLvl. So which is it? std::vector> posits; /// The collection of coordinates for a given element (one such /// collection for each tensor). std::vector> coords; // The segment upper bound for non-uniques level after de-duplication. std::vector> segHi; std::vector> highs; std::vector> lvlSizes; std::vector> positionsBuffers; // to_positions std::vector> coordinatesBuffers; // to_coordinates std::vector valBuffer; // to_value /// Whether the sparse input is a slice. std::vector isSparseSlices; /// Values related to slices. std::vector> sliceOffsets; std::vector> sliceStrides; /// Collapse Reassociations related to a specific tensor // TODO: support expand. std::vector collapseReassoc; /// TODO: not yet used, it should track the current level for each tensor /// to help eliminate `lvls` paramters from above APIs. /// std::vector curLvl; // // Fields which have at most `numLoops` many entries. // /// Loop Stack, stores the information of all the nested loops that are /// alive. std::vector loopStack; /// Loop Sequence Stack, stores the universal index for the current loop /// sequence. std::vector loopSeqStack; /// Maps `LoopId` (used by `AffineDimExpr`) to `LoopOrd` (in the `loopStack`). /// TODO: We should probably use a callback function here to make it more /// general. 
std::vector loopIdToOrd; }; } // namespace sparse_tensor } // namespace mlir #endif // MLIR_DIALECT_SPARSETENSOR_TRANSFORMS_SPARSETENSORLOOPEMITTER_H_