//===- LoopEmitter.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "LoopEmitter.h"
#include "CodegenUtils.h"

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"

using namespace mlir;
using namespace mlir::sparse_tensor;

//===----------------------------------------------------------------------===//
// File local shorthand macros
//===----------------------------------------------------------------------===//

#define CMPI(p, l, r)                                                          \
  (builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::p, (l), (r))       \
       .getResult())

#define C_IDX(v) (constantIndex(builder, loc, (v)))
#define YIELD(vs) (builder.create<scf::YieldOp>(loc, (vs)))
#define ADDI(lhs, rhs) (builder.create<arith::AddIOp>(loc, (lhs), (rhs)))
#define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs)))
#define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs)))
#define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs)))
#define REMUI(lhs, rhs) (builder.create<arith::RemUIOp>(loc, (lhs), (rhs)))
#define DIVUI(lhs, rhs) (builder.create<arith::DivUIOp>(loc, (lhs), (rhs)))
#define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r)))
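// NOTE: these macros assume that an OpBuilder named `builder` and a Location
// named `loc` are in scope at every use site. For example, given some index
// Value `iv` already in scope,
//   Value next = ADDI(iv, C_IDX(1));
// builds an index constant and an `arith.addi` at `loc` using `builder`.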
//===----------------------------------------------------------------------===//
// Debugging utils
//===----------------------------------------------------------------------===//

#ifndef NDEBUG
LLVM_ATTRIBUTE_UNUSED static void dumpIndexMemRef(OpBuilder &builder,
                                                  Location loc, Value memref) {
  memref = builder.create<memref::CastOp>(
      loc, UnrankedMemRefType::get(builder.getIndexType(), 0), memref);
  createFuncCall(builder, loc, "printMemrefInd", TypeRange{},
                 ValueRange{memref}, EmitCInterface::On);
}
#endif

//===----------------------------------------------------------------------===//
// File local helper functions.
//===----------------------------------------------------------------------===//

// For index reduction loops, since the tensors are sliced into non-contiguous
// fragments, we need a triple [pLo, pHi, pPtr], in which the pair (pLo, pHi)
// specifies the range of the fragment, and pPtr specifies the index of the
// corresponding fragment in the child level (i.e., a pointer to the sliced
// position array).
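//
// The two helpers below recover the dynamic offset and stride of a tensor
// slice view at a given level; e.g., a slice with offset 1 and stride 2
// visits coordinates 1, 3, 5, ... of the underlying tensor storage.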
static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor,
                            Level lvl) {
  auto enc = getSparseTensorEncoding(tensor.getType());
  return createOrFoldSliceOffsetOp(builder, loc, tensor, toDim(enc, lvl));
}

static Value genSliceStride(OpBuilder &builder, Location loc, Value tensor,
                            Level lvl) {
  auto enc = getSparseTensorEncoding(tensor.getType());
  return createOrFoldSliceStrideOp(builder, loc, tensor, toDim(enc, lvl));
}

//===----------------------------------------------------------------------===//
// Sparse tensor loop emitter class implementations
//===----------------------------------------------------------------------===//

LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput,
                         bool isSparseOut, unsigned numLoops,
                         DependentLvlGetter dimGetter,
                         SparseEmitStrategy emitStrategy) {
  initialize(tensors, loopTag, hasOutput, isSparseOut, numLoops, dimGetter,
             emitStrategy);
}

void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
                             bool isSparseOut, unsigned numLoops,
                             DependentLvlGetter dimGetter,
                             SparseEmitStrategy emitStrategy) {
  // First initialize the top-level type of the fields.
  this->loopTag = loopTag;
  this->hasOutput = hasOutput;
  this->isSparseOut = isSparseOut;
  this->emitStrategy = emitStrategy;

  const unsigned numManifestTensors = ts.size();
  const unsigned synTensorId = numManifestTensors;
  const unsigned numTensors = numManifestTensors + 1;
  // tensors array (len == numManifestTensors).
  this->tensors.assign(ts.begin(), ts.end());
  // Arrays with len == numTensors.
  this->valBuffer.assign(numTensors, nullptr);
  this->lvls.resize(numTensors);
  this->iters.resize(numTensors);

  // Reserve space for the loop stacks up front; entries are pushed/popped as
  // loops and loop sequences are entered and exited.
  this->loopStack.reserve(numLoops);
  this->loopSeqStack.reserve(numLoops);

  // Index-reduction related fields.
  this->dependentLvlMap.assign(
      numTensors, std::vector<std::vector<std::pair<TensorLevel, unsigned>>>());
  this->sliceMeta.assign(
      numTensors, std::vector<std::vector<std::pair<Value, unsigned>>>());
  this->levelReducedDep.assign(numTensors, std::vector<unsigned>());

  // Initialize nested types of `TensorId`-indexed fields.
  for (TensorId tid = 0; tid < numTensors; tid++) {
    Level lvlRank;
    if (tid == synTensorId) {
      // The synthetic tensor (conceptually) is an all-dense tensor with rank
      // equal to the total number of loops (each level can potentially be
      // mapped to one of the loops being generated).
      lvlRank = numLoops;
    } else {
      const Value t = tensors[tid];
      // A scalar or zero-ranked tensor.
      if (isZeroRankedTensorOrScalar(t.getType()))
        continue;

      auto rtp = getRankedTensorType(t);
      const SparseTensorType stt(rtp);
      lvlRank = stt.getLvlRank();
    }

    lvls[tid].resize(lvlRank);
    iters[tid].resize(lvlRank);
    loopHighs.assign(numLoops, nullptr);

    // Slice-driven loops related initialization.
    levelReducedDep[tid].assign(lvlRank, 0);
    dependentLvlMap[tid].assign(
        lvlRank, std::vector<std::pair<TensorLevel, unsigned>>());
    sliceMeta[tid].assign(lvlRank, std::vector<std::pair<Value, unsigned>>());
    if (dimGetter && !isSynTensor(tid)) {
      for (Level l = 0; l < lvlRank; l++) {
        std::vector<std::pair<LoopId, unsigned>> deps = dimGetter(tid, l);
        // Sort the dependent loops by loop order.
        std::sort(deps.begin(), deps.end(),
                  [](auto &lhs, auto &rhs) { return lhs.first < rhs.first; });

        dependentLvlMap[tid][l] = std::move(deps);
        unsigned depends = dependentLvlMap[tid][l].size();
        if (depends == 0)
          continue;
        sliceMeta[tid][l].reserve(depends);
      }
    }
  }
}
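// Makes a simple iterator over tensor `t` at level `l`. If the tensor is a
// slice view, the plain level iterator is additionally wrapped so that it
// translates between slice coordinates and the coordinates of the underlying
// storage, using the offset/stride recovered by the helpers above.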
std::unique_ptr<SparseIterator>
LoopEmitter::makeLevelIterator(OpBuilder &builder, Location loc, TensorId t,
                               Level l) {
  auto it = makeSimpleIterator(*lvls[t][l], emitStrategy);
  auto stt = getSparseTensorType(tensors[t]);
  if (stt.hasEncoding() && stt.getEncoding().isSlice()) {
    Value offset = genSliceOffset(builder, loc, tensors[t], l);
    Value stride = genSliceStride(builder, loc, tensors[t], l);
    auto slicedIt = makeSlicedLevelIterator(
        std::move(it), offset, stride, lvls[t][l]->getSize(), emitStrategy);
    return slicedIt;
  }
  return it;
}

void LoopEmitter::initializeLoopEmit(
    OpBuilder &builder, Location loc, LoopEmitter::OutputUpdater updater,
    LoopEmitter::SynTensorBoundSetter synSetter) {
  // For the synthetic tensor, set the high bound of each loop by calling the
  // callback.
  if (synSetter) {
    TensorId synId = getSynTensorId();
    for (unsigned i = 0, e = loopHighs.size(); i < e; i++) {
      Value sz = loopHighs[i] = synSetter(builder, loc, i);
      auto [stl, it] = makeSynLevelAndIterator(sz, synId, i, emitStrategy);
      lvls[synId][i] = std::move(stl);
      iters[synId][i].emplace_back(std::move(it));
    }
  }

  // For every manifest tensor:
  // * get the values buffer.
  // * For every level:
  //   * get the positions and coordinates buffers
  //   * get/compute the level-size, which is also used as the upper-bound
  //     on positions.
  for (TensorId t = 0, numTensors = getNumManifestTensors(); t < numTensors;
       t++) {
    const Value tensor = tensors[t];
    const auto rtp = dyn_cast<RankedTensorType>(tensor.getType());
    if (!rtp)
      // Skips only scalars; zero-ranked tensors still need to be bufferized
      // and (probably) filled with zeros by users.
      continue;
    // FIXME: the definition of `lvlRank` looks more like a dim-rank;
    // but the variable is used as a level everywhere below, which
    // suggests there may be some dim/lvl confusion going on here.
    auto stt = getSparseTensorType(tensor);
    const Level lvlRank = stt.getLvlRank();
    const auto shape = rtp.getShape();

    SmallVector<Value> lvlSzs;
    for (Level l = 0; l < stt.getLvlRank(); l++) {
      if (stt.hasEncoding())
        lvlSzs.push_back(builder.create<LvlOp>(loc, tensor, l));
      else
        lvlSzs.push_back(builder.create<tensor::DimOp>(loc, tensor, l));
    }

    // Scan all levels of the current tensor.
    for (Level l = 0; l < lvlRank; l++) {
      // Find the upper bound in the current level.
      lvls[t][l] = makeSparseTensorLevel(builder, loc, tensor, t, l);
      if (!dependentLvlMap[t][l].empty())
        continue;

      auto it = makeLevelIterator(builder, loc, t, l);
      iters[t][l].emplace_back(std::move(it));
    }

    // Perform the required bufferization. Dense inputs materialize from the
    // input tensors. Sparse inputs use sparse primitives to obtain the
    // values.
    // Delegates extra output initialization to clients.
    bool isOutput = isOutputTensor(t);
    Type elementType = stt.getElementType();
    if (!stt.hasEncoding()) {
      // Non-annotated dense tensors.
      BaseMemRefType denseTp = MemRefType::get(shape, elementType);

      // TODO: if we unconditionally use a fully dynamic layout here, it
      // breaks some vectorization passes which require a static stride of 1.
      // Is it possible to call the vectorization pass after bufferization?
      if (llvm::isa_and_nonnull<tensor::ExtractSliceOp>(tensor.getDefiningOp()))
        denseTp = bufferization::getMemRefTypeWithFullyDynamicLayout(rtp);

      Value denseVal =
          builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
      // Dense outputs need special handling.
      if (isOutput && updater)
        denseVal = updater(builder, loc, denseVal, tensor);

      valBuffer[t] = denseVal;
    } else {
      // Annotated sparse tensors.
      // We also need the value buffer for all-dense annotated "sparse"
      // tensors.
      valBuffer[t] = genToValues(builder, loc, tensor);
    }
    // NOTE: we can also prepare for the 0th level here in advance. This would
    // hoist some loop preparation out of tensor iteration, but would also
    // (undesirably) hoist the code outside of if-conditions.
  }
  // TODO: avoid treating the subsection iterator as a special case.
  initSubSectIterator(builder, loc);
}
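// Sets up the iterators for levels that have non-trivial (affine) dependences
// on the loop indices. Each such level gets a chain of subsection iterators:
// roughly, a "non-empty subsection" iterator while more dependent loops remain
// to be reduced, and a "traversal" iterator for the innermost dependent loop,
// following the order in which the dependent loops are reduced.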
void LoopEmitter::initSubSectIterator(OpBuilder &builder, Location loc) {
  Value c0 = C_IDX(0);
  for (TensorId t = 0, e = tensors.size(); t < e; t++) {
    auto rtp = dyn_cast<RankedTensorType>(tensors[t].getType());
    if (!rtp)
      continue;

    Level lvlRank = SparseTensorType(rtp).getLvlRank();

    // Compute the dependency reduction order.
    auto remDepStack = dependentLvlMap;
    std::vector<std::tuple<LoopId, TensorId, Level>> depRedOrder;
    for (Level lvl = 0; lvl < lvlRank; lvl++) {
      // Reverse queue into a stack.
      std::reverse(remDepStack[t][lvl].begin(), remDepStack[t][lvl].end());
      for (auto [loop, coeff] : dependentLvlMap[t][lvl])
        depRedOrder.emplace_back(std::make_tuple(loop, t, lvl));
    }

    if (depRedOrder.empty())
      continue;

    std::sort(depRedOrder.begin(), depRedOrder.end(),
              [](auto &l, auto &r) { return std::get<0>(l) < std::get<0>(r); });

    SmallVector<SparseIterator *> lastIter(tensors.size(), nullptr);
    for (auto [loop, t, lvl] : depRedOrder) {
      std::pair<LoopId, unsigned> curDep = remDepStack[t][lvl].back();
      assert(curDep.first == loop);
      remDepStack[t][lvl].pop_back();

      auto lvlIt = makeLevelIterator(builder, loc, t, lvl);
      const SparseIterator *parent = lastIter[t];
      if (!parent && lvl > 0) {
        if (dependentLvlMap[t][lvl - 1].empty()) {
          parent = iters[t][lvl - 1].back().get();
        }
      }

      std::unique_ptr<SparseIterator> it;
      if (!remDepStack[t][lvl].empty()) {
        // Compute the subsection size.
        Value size = c0;
        for (auto [loop, stride] : remDepStack[t][lvl]) {
          Value idxMax = SUBI(loopHighs[loop], C_IDX(1));
          size = ADDI(size, ADDI(MULI(idxMax, C_IDX(stride)), C_IDX(1)));
        }
        it = makeNonEmptySubSectIterator(builder, loc, parent, loopHighs[loop],
                                         std::move(lvlIt), size, curDep.second,
                                         emitStrategy);
      } else {
        const SparseIterator &subSectIter = *iters[t][lvl].back();
        it = makeTraverseSubSectIterator(builder, loc, subSectIter, *parent,
                                         std::move(lvlIt), loopHighs[loop],
                                         curDep.second, emitStrategy);
      }
      lastIter[t] = it.get();
      iters[t][lvl].emplace_back(std::move(it));
    }
  }
}
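// Splits the iterators of `tidLvls` into random-accessible (dense-like) ones
// and sparse ones; the sparse iterators are additionally sorted by iterator
// kind (see the ordering noted in the body) when generating loop conditions.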
void LoopEmitter::categorizeIterators(
    ArrayRef<TensorLevel> tidLvls, SmallVectorImpl<SparseIterator *> &raIters,
    SmallVectorImpl<SparseIterator *> &spIters) {
  // Finds out the tensor level that we should use to generate loops. Among all
  // the tensor levels, there is at most one sparse tensor level.
  for (auto [t, l] : unpackTensorLevelRange(tidLvls)) {
    SparseIterator *it = &getCurIterator(t, l);
    if (it->randomAccessible())
      raIters.push_back(it);
    else
      spIters.push_back(it);
  }

  std::stable_sort(spIters.begin(), spIters.end(), [](auto lhs, auto rhs) {
    // AffineUnRed > Affine > Slice > Trivial
    return static_cast<uint8_t>(lhs->kind) > static_cast<uint8_t>(rhs->kind);
  });
}

void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
                                  ArrayRef<TensorLevel> tidLvls) {
  // TODO: sort
  assert(loopSeqStack.size() == loopStack.size());
  // Prepares for all the tensors used in the current loop sequence.

  for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
    levelReducedDep[tid][lvl]++;
    prepareLoopOverTensorAtLvl(builder, loc, tid, lvl);
  }

  // Universal Index starts from 0.
  loopSeqStack.emplace_back(C_IDX(0), tidLvls.vec());
}

void LoopEmitter::exitCurrentLoopSeq(OpBuilder &builder, Location loc) {
  assert(loopSeqStack.size() == loopStack.size() + 1);

  // Depending on whether the slice is resolved or not at the current loop
  // sequence, end it in different ways.
  for (auto [tid, lvl] : unpackTensorLevelRange(loopSeqStack.back().second))
    levelReducedDep[tid][lvl]--;

  loopSeqStack.pop_back();
}

Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) {
  switch (a.getKind()) {
  case AffineExprKind::DimId: {
    // FIXME: since the one callsite in Sparsification passes in a
    // level-expression, the `getPosition` must in fact be a `Dimension`.
    // However, elsewhere we have been led to expect that `loopIdToOrd`
    // should be indexed by `LoopId`...
    const auto loopId = cast<AffineDimExpr>(a).getPosition();
    return loopStack[loopId].iv;
  }
  case AffineExprKind::Add: {
    auto binOp = cast<AffineBinaryOpExpr>(a);
    return ADDI(genAffine(builder, loc, binOp.getLHS()),
                genAffine(builder, loc, binOp.getRHS()));
  }
  case AffineExprKind::Mul: {
    auto binOp = cast<AffineBinaryOpExpr>(a);
    return MULI(genAffine(builder, loc, binOp.getLHS()),
                genAffine(builder, loc, binOp.getRHS()));
  }
  case AffineExprKind::Constant: {
    int64_t c = cast<AffineConstantExpr>(a).getValue();
    return C_IDX(c);
  }
  default:
    llvm_unreachable("unexpected affine subscript");
  }
}
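// Emits a scf.for (or scf.parallel) loop that iterates over a single level.
// Roughly, for a non-random-accessible (e.g., compressed) level this produces
//   scf.for %pos = %pLo to %pHi step %c1 iter_args(%red = ...) {
//     %crd = <deref iterator at %pos>   // user code sees %crd
//     ...
//   }
// while for a random-accessible (dense) level the induction variable is the
// coordinate itself and the iterator is simply located at it.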
std::pair<Operation *, Value> LoopEmitter::emitForLoopOverTensorAtLvl(
    OpBuilder &builder, Location loc, SparseIterator &iter,
    MutableArrayRef<Value> reduc, bool isParallel) {

  // TODO: support dynamic slices.
  // Uses the first dimension here to build the loop bound (which is also the
  // biggest range).

  Value step = C_IDX(1);
  auto [lo, hi] = iter.genForCond(builder, loc);
  Operation *loop = nullptr;
  Value iv;
  if (isParallel) {
    scf::ParallelOp parOp =
        builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(parOp.getBody());
    assert(parOp.getNumReductions() == reduc.size());
    iv = parOp.getInductionVars()[0];

    // In-place update on the reduction variable vector.
    // Note that the init vals are not the actual reduction variables but are
    // instead used as a "special handle" to (temporarily) represent them. The
    // expression on the init vals will be moved into scf.reduce and replaced
    // with the block arguments when exiting the loop (see exitForLoop). This
    // is needed as we cannot build the actual reduction block and get the
    // actual reduction variable before users fill the parallel loop body.
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = parOp.getInitVals()[i];
    loop = parOp;
  } else {
    scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
    builder.setInsertionPointToStart(forOp.getBody());
    iv = forOp.getInductionVar();

    // In-place update on the reduction variable vector.
    assert(forOp.getNumRegionIterArgs() == reduc.size());
    for (int i = 0, e = reduc.size(); i < e; i++)
      reduc[i] = forOp.getRegionIterArg(i);
    loop = forOp;
  }
  assert(loop && iv);

  Value crd = iv;
  if (!iter.randomAccessible()) {
    iter.linkNewScope(iv);
    crd = iter.deref(builder, loc);
  } else {
    iter.locate(builder, loc, iv);
  }

  return {loop, crd};
}
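// Emits a scf.while loop that co-iterates over all the given sparse
// iterators. The carried values are, in order: the cursor values of each
// sparse iterator, the user reduction values, and (optionally) the universal
// index. Roughly:
//   scf.while (%cursors..., %reducs..., [%univ]) : (...) -> (...) {
//     %cond = <conjunction of "iterator not exhausted" tests>
//     scf.condition(%cond) %cursors..., %reducs..., [%univ]
//   } do {
//   ^bb0(%cursors..., %reducs..., [%univ]):
//     <deref each iterator; the loop body continues at the minimum coordinate>
//   }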
std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
    OpBuilder &builder, Location loc, ArrayRef<SparseIterator *> spIters,
    MutableArrayRef<Value> reduc, bool needsUniv) {
  // NOTE: the slice-driven tensor-related reduction variable must
  // appear before normal tensors.

  // The set of induction variables for the while loop.
  SmallVector<Value> ivs;

  // Construct the while-loop with a parameter for each coordinate.
  for (SparseIterator *it : spIters) {
    ValueRange itVals = it->getCursor();
    ivs.append(itVals.begin(), itVals.end());
  }

  // The position where the user-supplied reduction variables start.
  ivs.append(reduc.begin(), reduc.end());
  // Update universal index.
  if (needsUniv)
    ivs.push_back(loopSeqStack.back().first);

  // Ensures all operands are valid.
  assert(llvm::all_of(ivs, [](Value v) { return v != nullptr; }));
  TypeRange types = ValueRange(ivs).getTypes();
  auto whileOp = builder.create<scf::WhileOp>(loc, types, ivs);

  SmallVector<Location> locs(types.size(), loc);
  Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
  Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);

  // Generates loop conditions.
  builder.setInsertionPointToStart(before);
  ValueRange bArgs = before->getArguments();
  Value whileCond = nullptr; // bool values for loop condition.

  for (SparseIterator *it : spIters) {
    auto [cond, remArgs] = it->genWhileCond(builder, loc, bArgs);
    whileCond = !whileCond ? cond : ANDI(whileCond, cond);
    bArgs = remArgs;
  }
  // The remaining block arguments are user-provided reduction values and an
  // optional universal index. Make sure their sizes match.
  assert(bArgs.size() == reduc.size() + (needsUniv ? 1 : 0));
  builder.create<scf::ConditionOp>(loc, whileCond, before->getArguments());

  // Generates loop body.
  builder.setInsertionPointToStart(after);
  ValueRange aArgs = after->getArguments();
  // Since some LoopCondKind might need extra checks to filter out invalid
  // iterations, we maintain another array to hold the iteration arguments to
  // yield if the checks fail.
  SmallVector<Value> nextArgs(aArgs.begin(), aArgs.end());

  for (SparseIterator *it : spIters) {
    aArgs = it->linkNewScope(aArgs);
    // Dereference the iterator to cache the coordinate.
    it->deref(builder, loc);
  }

  // In-place update on reduction variable.
  assert(aArgs.size() == reduc.size() + (needsUniv ? 1 : 0));
  for (unsigned i = 0, e = reduc.size(); i < e; i++)
    reduc[i] = aArgs[i];

  Value min;
  // Finds the minimum coordinate.
  if (!needsUniv) {
    for (SparseIterator *it : spIters) {
      if (min) {
        Value cmp = CMPI(ult, it->getCrd(), min);
        min = SELECT(cmp, it->getCrd(), min);
      } else {
        min = it->getCrd();
      }
    }
  } else {
    // Otherwise, the universal index is the minimal coordinate.
    min = whileOp.getAfterArguments().back();
  }

  return {whileOp, min};
}
bool LoopEmitter::shouldIteratedByForLoop(ArrayRef<SparseIterator *> spIters) {
  // If we need to co-iterate over two or more sparse tensors, we need a
  // while loop.
  if (spIters.size() > 1)
    return false;

  if (spIters.size() == 1)
    return spIters.front()->iteratableByFor();

  return true;
}
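// Emits the loop that co-iterates over the given tensor levels: a single
// scf.for (or scf.parallel) when at most one sparse iterator is involved and
// no universal index is needed, and a scf.while over all sparse iterators
// otherwise. Random-accessible (dense) iterators are then located at the
// resulting coordinate/induction value.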
Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls(
    OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls,
    MutableArrayRef<Value> reduc, bool tryParallel, bool needsUniv) {

  // TODO: support multiple returns on parallel for?
  tryParallel = tryParallel && reduc.size() <= 1;

  SmallVector<SparseIterator *> raIters;
  SmallVector<SparseIterator *> spIters;
  categorizeIterators(tidLvls, raIters, spIters);

  // Only when there is at least one sparse condition do we really need the
  // universal index.
  // TODO: Maybe we should instead require the merger to pass in a valid value
  // in the first place instead of adjusting it in LoopEmitter?
  needsUniv = !spIters.empty() && needsUniv;
  // The TensorLevel used for loop conditions.
  // If there is any sparse level, we need to use the sparse condition.
  // If all levels are dense, we can pick an arbitrary one (a dense
  // slice-driven loop can be generated using a simple ForOp as well).
  Operation *l = nullptr;
  Value iv = nullptr;
  SmallVector<TensorLevel> tls;

  // Generates loops differently depending on whether we need a slice-driven
  // loop or a simple level traversal loop.
  if (shouldIteratedByForLoop(spIters) && !needsUniv) {
    assert(spIters.size() <= 1);
    SparseIterator &it = spIters.empty() ? *raIters.front() : *spIters.front();
    std::tie(l, iv) =
        emitForLoopOverTensorAtLvl(builder, loc, it, reduc, tryParallel);
    tls.push_back(makeTensorLevel(it.tid, it.lvl));
  } else {
    for (auto *it : spIters) {
      tls.push_back(makeTensorLevel(it->tid, it->lvl));
    }

    if (needsUniv)
      for (auto *it : raIters)
        tls.push_back(makeTensorLevel(it->tid, it->lvl));

    std::tie(l, iv) =
        emitWhileLoopOverTensorsAtLvls(builder, loc, spIters, reduc, needsUniv);
  }

  // Enter dense tensor levels.
  for (SparseIterator *it : raIters)
    it->locate(builder, loc, iv);

  // NOTE: we can also prepare for the next dim here in advance.
  // Pushes the loop into the stack.
  loopStack.emplace_back(tls, l, builder.getInsertionBlock(), iv, loopTag);
  return l;
}
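// Initializes the iterator of `tidLvl` (which must be trivial and
// random-accessible) and positions it at the coordinate obtained by
// evaluating `lvlExpr` over the loop induction variables currently in scope.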
void LoopEmitter::locateLvlAtAffineAddress(OpBuilder &builder, Location loc,
                                           TensorLevel tidLvl,
                                           AffineExpr lvlExpr) {
  auto [tid, lvl] = unpackTensorLevel(tidLvl);

  const SparseIterator *parent =
      lvl == 0 ? nullptr : iters[tid][lvl - 1].back().get();
  auto &it = getCurIterator(tid, lvl);
  it.genInit(builder, loc, parent);

  assert(it.kind == IterKind::kTrivial && it.randomAccessible());
  Value lvlCrd = genAffine(builder, loc, lvlExpr);
  it.locate(builder, loc, lvlCrd);
}

void LoopEmitter::prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
                                             TensorId tid, Level lvl) {
  // If this is the first level, there is no parent iterator for the current
  // iterator.
  // If the current iterator is a subsection-based iterator, the parent
  // iterator is remembered by the iterator itself.
  bool hasParent = lvl == 0 || !dependentLvlMap[tid][lvl].empty();

  const SparseIterator *parent =
      hasParent ? nullptr : iters[tid][lvl - 1].back().get();
  auto &it = getCurIterator(tid, lvl);
  it.genInit(builder, loc, parent);

  // Locates the randomly accessible iterator at position 0.
  if (it.randomAccessible())
    it.locate(builder, loc, C_IDX(0));
}
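// For the parallel-loop case below, the scalar reduction that the user built
// on the "init value" handle is moved into an scf.reduce region. Roughly, a
// body expression such as
//   %red = arith.addf %init, %curVal
// becomes
//   scf.reduce(%curVal : f64) {
//   ^bb0(%lhs: f64, %rhs: f64):
//     %0 = arith.addf %lhs, %rhs
//     scf.reduce.return %0 : f64
//   }
// (the concrete op and element type depend on the user-provided expression).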
void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
                              MutableArrayRef<Value> reduc) {
  const LoopInfo &loopInfo = loopStack.back();
  if (auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop)) {
    if (!reduc.empty()) {
      assert(reduc.size() == forOp.getNumResults());
      rewriter.create<scf::YieldOp>(loc, reduc);
    }
    // Exit the loop.
    rewriter.setInsertionPointAfter(forOp);
    // In-place update reduction variables.
    for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
      reduc[i] = forOp.getResult(i);
  } else {
    auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
    if (!reduc.empty()) {
      assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
      Operation *redExp = reduc.front().getDefiningOp();
      // The reduction expression should have no uses.
      assert(redExp->getUses().empty());
      // This must be a binary operation.
      // NOTE: It is the user's responsibility to ensure the operation is
      // commutative.
      assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);

      Value redVal = parOp.getInitVals().front();
      Value curVal;
      if (redExp->getOperand(0) == redVal)
        curVal = redExp->getOperand(1);
      else if (redExp->getOperand(1) == redVal)
        curVal = redExp->getOperand(0);
      // One of the operands must be the init value (which is also the
      // previous reduction value).
      assert(curVal);
#ifndef NDEBUG
      // The reduction expression should be the only user of the reduction val
      // inside the parallel for.
      unsigned numUsers = 0;
      for (Operation *op : redVal.getUsers()) {
        if (op->getParentOp() == parOp)
          numUsers++;
      }
      assert(numUsers == 1);
#endif // NDEBUG

      rewriter.setInsertionPointAfter(redExp);
      auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
      // Attach to the reduction op.
      Block *redBlock = &redOp.getReductions().front().front();
      rewriter.setInsertionPointToEnd(redBlock);
      Operation *newRed = rewriter.clone(*redExp);
      // Replaces arguments of the reduction expression by using the block
      // arguments from scf.reduce.
      rewriter.modifyOpInPlace(
          newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
      // Erases the out-dated reduction expression.
      rewriter.eraseOp(redExp);
      rewriter.setInsertionPointToEnd(redBlock);
      rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
    }
    rewriter.setInsertionPointAfter(parOp);
    // In-place update reduction variables.
    for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
      reduc[i] = parOp.getResult(i);
  }
}
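// Exits a scf.while loop: forwards every sparse iterator whose coordinate
// matches the current induction value, relocates the dense iterators to the
// universal index, and yields, in order, the iterator cursors, the user
// reduction values, and (if present) the incremented universal index.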
void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc,
                                MutableArrayRef<Value> reduc) {
  const LoopInfo &loopInfo = loopStack.back();
  auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop);
  Value iv = loopInfo.iv;
  Value one = C_IDX(1);

  // Finalize the induction. Note that the induction could be performed
  // in the individual if-branches to avoid re-evaluating the conditions.
  // However, that would result in a rather elaborate forest of yield
  // instructions during code generation. Moreover, performing the induction
  // after the if-statements more closely resembles code generated by TACO.
  SmallVector<Value> operands;
  ValueRange whileRes = whileOp.getResults();

  for (auto [tid, lvl] : unpackTensorLevelRange(loopInfo.tidLvls)) {
    SparseIterator &it = getCurIterator(tid, lvl);
    if (!it.randomAccessible()) {
      // Forward the sparse iterator.
      Value cmp = CMPI(eq, it.getCrd(), iv);
      it.forwardIf(builder, loc, cmp);
      operands.append(it.getCursor().begin(), it.getCursor().end());
      // const Value newPos = whileOp->getResult(o++);
      // Following loops continue iteration from the break point of the
      // current while loop.
      whileRes = it.linkNewScope(whileRes);
    } else {
      // Make sure the randomly accessible (dense) iterator is set to the
      // right position according to the universal index.
      Value uniIdx = whileOp.getResults().back();
      it.locate(builder, loc, uniIdx);
    }
  }

  // Reduction value from users.
  for (auto &i : reduc) {
    operands.push_back(i);
    // Update user reduction variables.
    i = whileRes.front();
    whileRes = whileRes.drop_front();
  }

  // An (optional) universal index.
  if (operands.size() < whileOp.getNumResults()) {
    assert(operands.size() + 1 == whileOp.getNumResults());
    // The last one is the universal index.
    operands.push_back(ADDI(iv, one));
    // Update the loop starting point of the current loop sequence.
    loopSeqStack.back().first = whileOp->getResults().back();
  }

  if (!operands.empty())
    YIELD(operands);

  builder.setInsertionPointAfter(whileOp);
}
void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
                                  MutableArrayRef<Value> reduc) {
  // Clean up the values; this helps us discover potential bugs at an earlier
  // stage (instead of silently using a wrong value).
  const LoopInfo &loopInfo = loopStack.back();

  // Sets the insertion point to the right position.
  rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
  if (!loopInfo.userCodeBlock->empty() &&
      llvm::isa<scf::YieldOp>(&loopInfo.userCodeBlock->back())) {
    // scf::While/For inserts an implicit yield op when there are no loop
    // iter args. In this case, we need to insert the code before the yield.
    assert(loopInfo.userCodeBlock->back().getNumResults() == 0);
    rewriter.setInsertionPoint(&loopInfo.userCodeBlock->back());
  }

  if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
    exitWhileLoop(rewriter, loc, reduc);
  } else {
    exitForLoop(rewriter, loc, reduc);
  }

  assert(loopStack.size() == loopSeqStack.size());
  loopStack.pop_back();
}
#undef CMPI
#undef C_IDX
#undef YIELD
#undef ADDI
#undef ANDI
#undef SUBI
#undef MULI
#undef REMUI
#undef DIVUI
#undef SELECT