wren romano 76647fce13 [mlir][sparse] Combining dimOrdering+higherOrdering fields into dimToLvl
This is a major step along the way towards the new STEA design.  While a great deal of this patch is simple renaming, there are several significant changes as well.  I've done my best to ensure that this patch retains the previous behavior and error conditions, even though those are at odds with the eventual intended semantics of the `dimToLvl` mapping.  Since the majority of the compiler does not yet support non-permutations, I've also added explicit assertions in places that previously assumed, implicitly, that they were dealing with permutations.

Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D151505
2023-05-30 15:19:50 -07:00
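
For context, after this change the encoding expresses the mapping as a single affine map. An illustrative sketch (syntax approximate for this point in the development; not taken verbatim from the patch):

    #CSC = #sparse_tensor.encoding<{
      lvlTypes = [ "dense", "compressed" ],
      dimToLvl = affine_map<(i, j) -> (j, i)>
    }>

Here the former dimOrdering and higherOrdering fields are subsumed by the single dimToLvl map.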


//===- LoopEmitter.cpp ----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LoopEmitter.h"
#include "CodegenUtils.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
using namespace mlir;
using namespace mlir::sparse_tensor;
//===----------------------------------------------------------------------===//
// File local shorthand macros
//===----------------------------------------------------------------------===//
#define CMPI(p, l, r) \
(builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::p, (l), (r)) \
.getResult())
#define C_IDX(v) (constantIndex(builder, loc, (v)))
#define YIELD(vs) (builder.create<scf::YieldOp>(loc, (vs)))
#define ADDI(lhs, rhs) (builder.create<arith::AddIOp>(loc, (lhs), (rhs)))
#define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs)))
#define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs)))
#define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs)))
#define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r)))
//===----------------------------------------------------------------------===//
// File local helper functions.
//===----------------------------------------------------------------------===//
static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor,
Level lvl) {
auto enc = getSparseTensorEncoding(tensor.getType());
// FIXME: `toOrigDim` is deprecated
return createOrFoldSliceOffsetOp(builder, loc, tensor, toOrigDim(enc, lvl));
}
static Value genSliceStride(OpBuilder &builder, Location loc, Value tensor,
Level lvl) {
auto enc = getSparseTensorEncoding(tensor.getType());
// FIXME: `toOrigDim` is deprecated
return createOrFoldSliceStrideOp(builder, loc, tensor, toOrigDim(enc, lvl));
}
/// Converts a coordinate relative to the slice to the coordinate relative
/// to the underlying tensor.
// FIXME: that description says "sliceCrd -> tensorCrd"; but the function
// name suggests it should be "tensorCrd -> sliceCrd".
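// Illustrative example (not from the original source): with offset = 1 and
// stride = 3, the slice coordinate 2 maps to tensor coordinate 2 * 3 + 1 = 7.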
static Value toSliceCrd(OpBuilder &builder, Location loc, Value crd,
Value offset, Value stride, Value tensor, Level lvl) {
// tensorCrd = sliceCrd * stride + offset
return ADDI(MULI(crd, stride), offset);
}
/// Generates code to compute the *absolute* offset of the slice based on the
/// provided minimum coordinate in the slice.
/// E.g., when reducing d0 + d1 + d2, we need two slices to fully reduce the
/// expression, i.e., s1 = slice(T, d0), s2 = slice(s1, d1). The *absolute*
/// offset is the offset computed relative to the initial tensor T.
///
/// When isNonEmpty == false, the computed offset is meaningless and should
/// not be used during runtime; in that case the method currently generates
/// code that returns 0.
///
/// offset = isNonEmpty && minCrd >= size ? minCrd - size + 1 : 0;
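///
/// Illustrative example: with minCrd = 5, size = 4, and a non-empty slice,
/// the generated code computes offset = 5 - 4 + 1 = 2; for an empty slice
/// (or when minCrd < size) it yields 0.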
static Value offsetFromMinCoord(OpBuilder &builder, Location loc, Value minCrd,
Value size, Value isNonEmpty) {
Value geSize = CMPI(uge, minCrd, size);
Value pred = ANDI(isNonEmpty, geSize);
// Computes minCrd - size + 1
Value mms = SUBI(ADDI(minCrd, C_IDX(1)), size);
// This is the absolute offset relative to the underlying tensor.
return SELECT(pred, mms, C_IDX(0));
}
/// Converts a coordinate relative to the underlying tensor to the coordinate
/// relative to the slice, returning an extra remainder value.
// FIXME: that description says "tensorCrd -> sliceCrd"; but the function
// name suggests it should be "sliceCrd -> tensorCrd".
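// Illustrative example (not from the original source): with offset = 1 and
// stride = 3, tensor coordinate 7 maps back to slice coordinate
// (7 - 1) / 3 = 2 with remainder 0.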
static std::pair<Value, Value> fromSliceCrd(OpBuilder &builder, Location loc,
Value crd, Value offset,
Value stride, Value tensor,
Level lvl) {
// sliceCrd = (tensorCrd - offset) / stride
crd = SUBI(crd, offset);
Value rem = builder.create<arith::RemUIOp>(loc, crd, stride);
crd = builder.create<arith::DivUIOp>(loc, crd, stride);
return std::make_pair(crd, rem);
}
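// Generates a predicate stating that a tensor coordinate is a legitimate
// member of the slice. Illustrative example (values hypothetical): for a
// slice with offset = 1, stride = 3, and size 4, tensor coordinate 7
// translates to slice coordinate 2 and passes all three checks below:
//   7 >= 1  &&  2 < 4  &&  (7 - 1) % 3 == 0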
std::pair<Value, Value>
LoopEmitter::genSliceLegitPredicate(OpBuilder &builder, Location loc, Value crd,
TensorId tid, Level lvl) {
assert(isSparseSlices[tid]);
Value slice = tensors[tid];
Value offset = sliceOffsets[tid][lvl];
Value stride = sliceStrides[tid][lvl];
auto enc = getSparseTensorEncoding(slice.getType());
const auto [newCrd, crdRem] =
fromSliceCrd(builder, loc, crd, offset, stride, slice, lvl);
SmallVector<Value, 3> conds; // at most 3 conditions
// First, coord >= offset (skip the check if offset is known to be 0).
if (auto staticOffset = enc.getStaticLvlSliceOffset(lvl);
!(staticOffset.has_value() && *staticOffset == 0)) {
auto geOffset = CMPI(uge, crd, offset);
conds.push_back(geOffset);
}
// Second, coord_in_slice < length
auto ltLength = CMPI(ult, newCrd, lvlSizes[tid][lvl]);
conds.push_back(ltLength);
// Third, rem == 0 (skip the check if stride is known to be 1).
if (auto staticStride = enc.getStaticLvlSliceStride(lvl);
!(staticStride.has_value() && *staticStride == 1)) {
auto fitStride = CMPI(eq, crdRem, C_IDX(0));
conds.push_back(fitStride);
}
// Must meet all conditions to be a valid coordinate in the slice.
auto pred = conds.front();
for (auto cond : ValueRange(conds).drop_front())
pred = ANDI(pred, cond);
return {newCrd, pred};
}
//===----------------------------------------------------------------------===//
// Sparse tensor loop emitter class implementations
//===----------------------------------------------------------------------===//
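// Illustrative example for the address linearization below: for a level with
// highs[tid][lvl] == 10, parent position 3, and coordinate 4, the generated
// address is 10 * 3 + 4 = 34.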
Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, TensorId tid,
Level lvl, Value crd) {
Value pos = lvl == 0 ? C_IDX(0) : posits[tid][lvl - 1];
Value mul = MULI(highs[tid][lvl], pos);
if (isSparseSlices[tid])
crd = toSliceCrd(builder, loc, crd, sliceOffsets[tid][lvl],
sliceStrides[tid][lvl], tensors[tid], lvl);
Value add = ADDI(mul, crd);
return add;
}
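// Scans for the end of the current segment of duplicate coordinates.
// Illustrative sketch of the emitted code:
//
//   pos = pLo;
//   while (pos < pHi && coordinates[pos] == coordinates[pLo])
//     pos++;
//   return pos; // the segment high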
Value LoopEmitter::genSegmentHigh(OpBuilder &builder, Location loc,
TensorId tid, Level lvl, Value pLo,
Value pHi) {
const auto coordinates = coordinatesBuffers[tid][lvl];
const auto sameCrd = genIndexLoad(builder, loc, coordinates, pLo);
auto whileOp = builder.create<scf::WhileOp>(
loc, builder.getIndexType(), pLo,
/*beforeBuilder=*/
[pHi, coordinates, sameCrd](OpBuilder &builder, Location loc,
ValueRange ivs) {
const auto pos = ivs[0];
Value inBound = builder.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::ult, pos, pHi);
auto ifInBound =
builder.create<scf::IfOp>(loc, builder.getI1Type(), inBound, true);
{
OpBuilder::InsertionGuard guard(builder);
// Load the next coordinates only when inbound (to avoid OOB
// accesses).
builder.setInsertionPointToStart(ifInBound.thenBlock());
Value crd = genIndexLoad(builder, loc, coordinates, pos);
Value isSameCrd = builder.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::eq, crd, sameCrd);
YIELD(isSameCrd);
// Else, the position is out of bounds; yield false to terminate the
// loop.
builder.setInsertionPointToStart(ifInBound.elseBlock());
YIELD(constantI1(builder, loc, false));
}
builder.create<scf::ConditionOp>(loc, ifInBound.getResults()[0], ivs);
},
/*afterBuilder=*/
[](OpBuilder &builder, Location loc, ValueRange ivs) {
// pos ++
Value nextPos = ADDI(ivs[0], C_IDX(1));
YIELD(nextPos);
});
// Return the segment high.
return whileOp.getResult(0);
}
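// Illustrative example for the linearization below: for a collapsed pair of
// levels with lvlSizes[reassoc[1]] == 5 and loaded coordinates 2 and 3, the
// resulting crd is (0 + 2) * 5 + 3 = 13.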
Value LoopEmitter::genSparseCrd(OpBuilder &builder, Location loc, TensorId tid,
Level dstLvl) {
Value crd = C_IDX(0);
const auto reassoc = getCollapseReassociation(tid, dstLvl);
const unsigned reassocSize = reassoc.size();
for (unsigned i = 0; i < reassocSize; i++) {
const Level srcLvl = reassoc[i];
// A load on the coordinates array yields the coordinate.
const Value mem = coordinatesBuffers[tid][srcLvl];
/// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
const Value pos = posits[tid][dstLvl];
const Value off = genIndexLoad(builder, loc, mem, pos);
// Linearize the coordinates within the same collapse reassociation.
crd = ADDI(crd, off);
if (i != reassocSize - 1) {
crd = MULI(crd, this->lvlSizes[tid][reassoc[i + 1]]);
}
}
return crd;
}
LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput,
bool isSparseOut, ArrayRef<LoopId> topSort,
DependentLvlGetter dimGetter) {
initialize(tensors, loopTag, hasOutput, isSparseOut, topSort, dimGetter);
}
void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
bool isSparseOut, ArrayRef<LoopId> topSort,
DependentLvlGetter dimGetter) {
// First initialize the top-level type of the fields.
this->loopTag = loopTag;
this->hasOutput = hasOutput;
this->isSparseOut = isSparseOut;
const unsigned numTensors = ts.size();
this->tensors.assign(ts.begin(), ts.end());
this->lvlTypes.assign(numTensors, std::vector<DimLevelType>());
this->lvlSizes.assign(numTensors, std::vector<Value>());
this->highs.assign(numTensors, std::vector<Value>());
this->segHi.assign(numTensors, std::vector<Value>());
this->posits.assign(numTensors, std::vector<Value>());
this->coords.assign(numTensors, std::vector<Value>());
this->positionsBuffers.assign(numTensors, std::vector<Value>());
this->coordinatesBuffers.assign(numTensors, std::vector<Value>());
this->valBuffer.assign(numTensors, nullptr);
this->collapseReassoc.assign(numTensors, nullptr);
this->isSparseSlices.assign(numTensors, false);
this->sliceOffsets.assign(numTensors, std::vector<Value>());
this->sliceStrides.assign(numTensors, std::vector<Value>());
const LoopOrd numLoops = topSort.size();
// These zeros will be overwritten below, but we need to initialize
// them to something since we'll need random-access assignment.
this->loopIdToOrd.assign(numLoops, 0);
this->loopStack.reserve(numLoops);
this->loopSeqStack.reserve(numLoops);
// Index-reduction related fields.
this->dependentLvlMap.assign(
numTensors, std::vector<std::vector<std::pair<TensorId, Level>>>());
this->slicePosBuffer.assign(numTensors, std::vector<std::vector<Value>>());
this->sliceSizes.assign(numTensors, std::vector<std::vector<Value>>());
this->sliceStack.assign(numTensors, std::vector<SliceInfo>());
this->levelReducedDep.assign(numTensors, std::vector<unsigned>());
// Initialize nested types of `TensorId`-indexed fields.
for (TensorId tid = 0; tid < numTensors; tid++) {
const Value t = tensors[tid];
// A scalar or zero-ranked tensor.
if (isZeroRankedTensorOrScalar(t.getType()))
continue;
auto rtp = getRankedTensorType(t);
if (auto reshape = t.getDefiningOp<tensor::CollapseShapeOp>();
isUniqueCOOType(rtp) && reshape) {
// TODO: Support more kinds of sparse tensors.
// FIXME: We should instead lower reshape operations on sparse tensors to
// view change.
collapseReassoc[tid] = reshape.getReassociation();
rtp = reshape.getSrcType();
// Overwrites the tensor with the source tensor of the reshape operation.
tensors[tid] = reshape.getSrc();
}
const SparseTensorType stt(rtp);
const Level lvlRank = stt.getLvlRank();
// We always treat the sparse output tensor as dense so that we always
// iterate it based on the level sizes.
if (stt.hasEncoding() && !(isOutputTensor(tid) && isSparseOut)) {
const auto enc = stt.getEncoding();
isSparseSlices[tid] = enc.isSlice();
for (auto lvlTp : enc.getLvlTypes())
lvlTypes[tid].push_back(lvlTp);
} else {
lvlTypes[tid].assign(lvlRank, DimLevelType::Dense);
}
// Initialize using empty value.
lvlSizes[tid].assign(lvlRank, Value());
highs[tid].assign(lvlRank, Value());
segHi[tid].assign(lvlRank, Value());
posits[tid].assign(lvlRank, Value());
coords[tid].assign(lvlRank, Value());
positionsBuffers[tid].assign(lvlRank, Value());
coordinatesBuffers[tid].assign(lvlRank, Value());
sliceOffsets[tid].assign(lvlRank, Value());
sliceStrides[tid].assign(lvlRank, Value());
// Slice-driven loops related initialization.
levelReducedDep[tid].assign(lvlRank, 0);
dependentLvlMap[tid].assign(lvlRank,
std::vector<std::pair<TensorId, Level>>());
slicePosBuffer[tid].assign(lvlRank, std::vector<Value>());
sliceSizes[tid].assign(lvlRank, std::vector<Value>());
sliceStack[tid].emplace_back(/*minCrd=*/Value(),
/*offset=*/Value(), /*isNonEmpty*/ Value(),
std::nullopt, 0);
if (dimGetter) {
auto reassoc = collapseReassoc[tid];
Level dstRank = reassoc ? reassoc.size() : lvlRank;
for (Level l = 0; l < dstRank; l++) {
dependentLvlMap[tid][l] = dimGetter(tid, l);
unsigned depends = dependentLvlMap[tid][l].size();
if (depends == 0)
continue;
// TODO: View-base collapse and dependent index reduction are not
// compatible right now.
assert(!reassoc);
// We need `depends - 1` slices to fully reduce the affine expression.
sliceSizes[tid][l].assign(depends - 1, nullptr);
slicePosBuffer[tid][l].assign(depends - 1, nullptr);
}
}
}
// Construct the inverse of the `topSort` from the sparsifier.
// This is needed to map `AffineDimExpr`s back to the `LoopOrd`
// used in loop emitter.
// FIXME: This map should be maintained outside loop emitter.
for (LoopOrd n = 0; n < numLoops; n++)
loopIdToOrd[topSort[n]] = n;
}
void LoopEmitter::initializeLoopEmit(OpBuilder &builder, Location loc,
LoopEmitter::OutputUpdater updater) {
// For every tensor:
// * get the values buffer.
// * For every level:
// * get the positions and coordinates buffers
// * get/compute the level-size, which is also used as the upper-bound
// on positions.
for (TensorId t = 0, numTensors = getNumTensors(); t < numTensors; t++) {
const Value tensor = tensors[t];
const auto rtp = dyn_cast<RankedTensorType>(tensor.getType());
if (!rtp)
// Skip only scalars; zero-ranked tensors still need to be bufferized and
// (probably) filled with zeros by users.
continue;
// FIXME: the definition of `lvlRank` looks more like a dim-rank;
// but the variable is used as a level everywhere below, which
// suggests there may be some dim/lvl confusion going on here.
const Level lvlRank = rtp.getRank();
const auto shape = rtp.getShape();
const auto enc = getSparseTensorEncoding(rtp);
const Level cooStart = enc ? getCOOStart(enc) : lvlRank;
// Scan all levels of current tensor.
for (Level l = 0; l < lvlRank; l++) {
// This should be called only once at beginning.
assert(!positionsBuffers[t][l] && !coordinatesBuffers[t][l] &&
!highs[t][l]);
const auto lvlTp = lvlTypes[t][l];
// Handle sparse storage schemes.
if (isCompressedDLT(lvlTp) || isCompressedWithHiDLT(lvlTp)) {
// Generate sparse primitives to obtain positions and coordinates.
positionsBuffers[t][l] = genToPositions(builder, loc, tensor, l);
coordinatesBuffers[t][l] =
genToCoordinates(builder, loc, tensor, l, cooStart);
} else if (isSingletonDLT(lvlTp)) {
// Singleton level, fetch coordinates.
coordinatesBuffers[t][l] =
genToCoordinates(builder, loc, tensor, l, cooStart);
} else {
// Dense level, nothing to fetch.
assert(isDenseDLT(lvlTp));
}
// FIXME: `toOrigDim` is deprecated. For now this relies on the
// 1:1 mapping between levels and dimensions, since nowhere else
// in the code supports non-permutations yet either.
Value lvlSz = mlir::linalg::createOrFoldDimOp(builder, loc, tensor,
toOrigDim(enc, l));
// Find upper bound in current dimension.
highs[t][l] = lvlSizes[t][l] = lvlSz;
if (isSparseSlices[t]) {
sliceOffsets[t][l] = genSliceOffset(builder, loc, tensors[t], l);
sliceStrides[t][l] = genSliceStride(builder, loc, tensors[t], l);
}
}
// Perform the required bufferization. Dense inputs materialize
// from the input tensors. Sparse inputs use sparse primitives to obtain the
// values.
// Delegates extra output initialization to clients.
bool isOutput = isOutputTensor(t);
Type elementType = rtp.getElementType();
if (!enc) {
// Non-annotated dense tensors.
BaseMemRefType denseTp = MemRefType::get(shape, elementType);
// TODO: if we unconditionally use a fully dynamic layout here, it breaks
// some vectorization passes, which require a static stride of 1.
// Is it possible to call the vectorization pass after bufferization?
if (llvm::isa_and_nonnull<tensor::ExtractSliceOp>(tensor.getDefiningOp()))
denseTp = bufferization::getMemRefTypeWithFullyDynamicLayout(rtp);
Value denseVal =
builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
// Dense outputs need special handling.
if (isOutput && updater)
denseVal = updater(builder, loc, denseVal, tensor);
valBuffer[t] = denseVal;
} else {
// Annotated sparse tensors.
// We also need the value buffer for all-dense annotated "sparse" tensors.
valBuffer[t] = genToValues(builder, loc, tensor);
}
// NOTE: we could also prepare for level 0 here in advance. This would hoist
// some loop preparation out of tensor iteration, but would also (undesirably)
// hoist the code outside of if-conditions.
}
Type indexType = builder.getIndexType();
Value c0 = constantZero(builder, loc, indexType);
for (TensorId t = 0, e = tensors.size(); t < e; t++) {
auto rtp = dyn_cast<RankedTensorType>(tensors[t].getType());
if (!rtp)
continue;
Level lvlRank = SparseTensorType(rtp).getLvlRank();
for (Level lvl = 0; lvl < lvlRank; lvl++) {
if (!dependentLvlMap[t][lvl].empty()) {
ArrayRef<std::pair<TensorId, Level>> depLvls = dependentLvlMap[t][lvl];
// Needs at least two operands to form a non-trivial affine expression.
assert(depLvls.size() > 1);
Value size = c0;
for (unsigned e = depLvls.size() - 1; e >= 1; e--) {
auto [dt, dd] = depLvls[e];
size = ADDI(size, lvlSizes[dt][dd]);
sliceSizes[t][lvl][e - 1] = size;
}
}
}
}
localInsertPos = builder.getInsertionPoint()->getPrevNode();
}
void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
ArrayRef<TensorLevel> tidLvls) {
// TODO: sort
assert(loopSeqStack.size() == loopStack.size());
// Prepares for all the tensors used in the current loop sequence.
std::vector<std::tuple<TensorId, Level, bool>> slicedTids;
for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
if (!dependentLvlMap[tid][lvl].empty()) {
bool fullyRed = genSliceBegin(builder, loc, tid, lvl);
slicedTids.emplace_back(tid, lvl, fullyRed);
} else {
prepareLoopOverTensorAtLvl(builder, loc, tid, lvl);
}
}
// Universal Index starts from 0.
loopSeqStack.emplace_back(C_IDX(0), std::move(slicedTids));
}
void LoopEmitter::exitCurrentLoopSeq(OpBuilder &builder, Location loc) {
assert(loopSeqStack.size() == loopStack.size() + 1);
const auto &slicedTids = loopSeqStack.back().second;
// Depending on whether the slice is resolved at the current loop sequence,
// we end it in different ways.
for (auto [tid, lvl, res] : slicedTids) {
if (!res) {
// If this is an unresolved-slice-driven loop, pop out the slice.
assert(sliceStack[tid].back().slicedOnLvl == lvl);
sliceStack[tid].pop_back();
} else {
if (!isDenseDLT(lvlTypes[tid][lvl])) {
// Else this is a resolved slice; advance the position similar to TACO.
Value c1 = C_IDX(1), c2 = C_IDX(2);
// pIdx += 2: we finished the current lvl, so advance the pointer index of
// the previous level by two to skip the [pLo, pHi] of the current level.
Value sPtrBuf = slicePosBuffer[tid][lvl].back();
Value curP = genIndexLoad(builder, loc, sPtrBuf, c1);
// TODO: we could probably use an SSA value for it.
Value nexP = ADDI(curP, c2);
builder.create<memref::StoreOp>(loc, nexP, sPtrBuf, c1);
}
}
}
loopSeqStack.pop_back();
}
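// Illustrative example: the affine expression (d0 * 3 + d1) is emitted as
// addi(muli(iv0, c3), iv1), where iv0 and iv1 are the induction variables of
// the loops bound to d0 and d1.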
Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) {
switch (a.getKind()) {
case AffineExprKind::DimId: {
// FIXME: since the one callsite in Sparsification passes in a
// level-expression, the `getPosition` must in fact be a `Dimension`.
// However, elsewhere we have been led to expect that `loopIdToOrd`
// should be indexed by `LoopId`...
const auto loopId = a.cast<AffineDimExpr>().getPosition();
assert(loopId < loopIdToOrd.size());
return loopStack[loopIdToOrd[loopId]].iv;
}
case AffineExprKind::Add: {
auto binOp = a.cast<AffineBinaryOpExpr>();
return ADDI(genAffine(builder, loc, binOp.getLHS()),
genAffine(builder, loc, binOp.getRHS()));
}
case AffineExprKind::Mul: {
auto binOp = a.cast<AffineBinaryOpExpr>();
return MULI(genAffine(builder, loc, binOp.getLHS()),
genAffine(builder, loc, binOp.getRHS()));
}
case AffineExprKind::Constant: {
int64_t c = a.cast<AffineConstantExpr>().getValue();
return C_IDX(c);
}
default:
llvm_unreachable("unexpected affine subscript");
}
}
Operation *LoopEmitter::emitForLoopOverTensorAtLvl(
OpBuilder &builder, Location loc, TensorId tid, Level dstLvl, Value lo,
Value hi, MutableArrayRef<Value> reduc, bool isParallel) {
bool isSparseCond = isCompressedDLT(lvlTypes[tid][dstLvl]) ||
isCompressedWithHiDLT(lvlTypes[tid][dstLvl]) ||
isSingletonDLT(lvlTypes[tid][dstLvl]);
const auto reassoc = getCollapseReassociation(tid, dstLvl);
// TODO: support dynamic slices.
// Uses the first dimension here to build the loop bound (which is also the
// biggest range).
const Level srcLvl = reassoc.front();
Value step = C_IDX(1);
Operation *loop = nullptr;
Value iv;
if (isParallel) {
assert(collapseReassoc[tid] == nullptr);
scf::ParallelOp parOp =
builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
builder.setInsertionPointToStart(parOp.getBody());
assert(parOp.getNumReductions() == reduc.size());
iv = parOp.getInductionVars()[0];
// In-place update on the reduction variable vector.
// Note that the init vals are not the actual reduction variables but are
// instead used as a "special handle" to (temporarily) represent them. The
// expressions on the init vals will be moved into scf.reduce and replaced
// with the block arguments when exiting the loop (see exitForLoop). This is
// needed as we cannot build the actual reduction block and get the actual
// reduction variable before users fill the parallel loop body.
for (int i = 0, e = reduc.size(); i < e; i++)
reduc[i] = parOp.getInitVals()[i];
loop = parOp;
} else {
scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
builder.setInsertionPointToStart(forOp.getBody());
iv = forOp.getInductionVar();
// In-place update on the reduction variable vector.
assert(forOp.getNumRegionIterArgs() == reduc.size());
for (int i = 0, e = reduc.size(); i < e; i++)
reduc[i] = forOp.getRegionIterArg(i);
loop = forOp;
}
assert(loop && iv);
Value crd;
if (isSparseCond) {
assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
// For COO, the position is the same across consecutive levels.
/// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
llvm::for_each(reassoc,
[this, tid, iv](Level srcLvl) { posits[tid][srcLvl] = iv; });
crd = genSparseCrd(builder, loc, tid, dstLvl);
} else {
// Dense tensor: the coordinate is the induction variable.
crd = iv;
}
if (isSparseSlices[tid] && isSparseCond) {
// For sparse level slices, we need to filter out invalid coordinates that
// are not included in the slice.
SmallVector<Type> types;
for (Value red : reduc)
types.push_back(red.getType());
auto [trans, pred] = genSliceLegitPredicate(builder, loc, crd, tid, srcLvl);
bool hasReduc = !types.empty();
scf::IfOp ifOp = builder.create<scf::IfOp>(loc, types, pred,
/*else*/ hasReduc);
if (hasReduc) {
// scf.for (a) -> v
// %s = scf.if (a) -> v
// user-generated code.
// else
// yield a
// yield %s
YIELD(ifOp.getResults());
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
// On mismatch.
YIELD(reduc);
}
// Set the insertion point to matched branch.
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
crd = trans;
}
assert(crd);
coords[tid][dstLvl] = crd;
return loop;
}
Operation *LoopEmitter::emitWhileLoopOverSliceAtSparseLvl(
OpBuilder &builder, Location loc, Value pLo, Value pHi, Value offset,
Value sliceSize, TensorId tid, Level lvl, MutableArrayRef<Value> reduc) {
// TODO: we should generalize the method to support iteration over
// normal slices as well, to allow early breaks.
Operation *insertPoint = nullptr;
Operation *loop =
genSliceLvlTraverseLoop(
builder, loc, pLo, pHi, offset, sliceSize, tid, lvl, reduc,
/*genYield=*/false, // unaware of the yield values from user yet
[this, tid, lvl, reduc, offset,
&insertPoint](OpBuilder &builder, Location loc, Value iv,
MutableArrayRef<Value> innerReduc) {
assert(innerReduc.size() == reduc.size());
// Updates users' reduction variables in place.
for (unsigned i = 0, e = reduc.size(); i < e; i++)
reduc[i] = innerReduc[i];
// Loads the coordinates.
Value absC =
genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], iv);
// We need to subtract the offset to get the relative coordinate.
// TODO: how to assert relC >=0 during runtime?
insertPoint = builder.create<arith::SubIOp>(loc, absC, offset);
posits[tid][lvl] = iv;
coords[tid][lvl] = insertPoint->getResult(0);
})
.first;
// Sets the insertion point inside the loop body.
builder.setInsertionPointAfter(insertPoint);
return loop;
}
Operation *LoopEmitter::enterLoopOverTensorAtLvl(OpBuilder &builder,
Location loc,
ArrayRef<TensorLevel> tidLvls,
MutableArrayRef<Value> reduc,
bool isParallel) {
// TODO: support multiple return on parallel for?
assert(!isParallel || reduc.size() <= 1);
bool isSparseCond = false, isSparseSliceCond = false;
auto [tid, lvl] = unpackTensorLevel(tidLvls.front());
// Finds the tensor level that we should use to generate loops. Among all
// the tensor levels, there is at most one sparse tensor level.
for (auto [t, l] : unpackTensorLevelRange(tidLvls)) {
assert(lvlTypes[t].size() > l); // Must be a valid tid, dim pair
assert(!coords[t][l] || // We cannot re-enter the same level
!dependentLvlMap[t][l].empty()); // unless it is a slice-driver loop
auto lvlType = lvlTypes[t][l];
// Must be a recognizable DLT.
assert(isDenseDLT(lvlType) || isCompressedDLT(lvlType) ||
isCompressedWithHiDLT(lvlType) || isSingletonDLT(lvlType));
// This is a slice-driven loop on sparse level.
if (!dependentLvlMap[t][l].empty() && !isDenseDLT(lvlType)) {
assert(!isSparseSliceCond && !isSparseCond);
isSparseSliceCond = true;
tid = t;
lvl = l;
continue;
}
bool isSparse = isCompressedDLT(lvlType) || isSingletonDLT(lvlType) ||
isCompressedWithHiDLT(lvlType);
// We can have at most one sparse input; otherwise, a while loop is
// required to co-iterate multiple sparse tensors.
assert(!isSparseCond || !isSparse);
assert(!isSparseSliceCond || !isSparseCond);
if (isSparse) {
tid = t;
lvl = l;
}
isSparseCond = isSparseCond || isSparse;
}
DimLevelType lvlType = lvlTypes[tid][lvl];
// TODO: Dense slice driven loop can be generated using for loop as well.
assert(!isSparseSliceCond || !isDenseDLT(lvlType));
bool isDenseSliceCond =
isDenseDLT(lvlType) && !dependentLvlMap[tid][lvl].empty();
// If the slice is fully reduced, we can now use a TACO-based algorithm to
// iterate it.
Operation *l = nullptr;
// At most one tensor is used as the condition in a for loop;
SmallVector<TensorLevel, 1> condTidLvl;
// There might be multiple dense slice-driven tensors.
SmallVector<SliceLoopInfo> sliceDrivenInfo;
// Generates loops differently depending on whether we need a slice-driven
// loop or a simple level traversal loop.
if (isSparseSliceCond) {
bool fullyReduced = depFullyReduced(tid, lvl);
if (!fullyReduced) {
l = emitSliceDrivenLoopOverTensorAtLvl(builder, loc, tid, lvl, reduc);
} else {
// If the slice is fully reduced, we can now use a TACO-based algorithm to
// iterate it.
l = emitWhileLoopOverSliceAtSparseLvl(
builder, loc, posits[tid][lvl], highs[tid][lvl],
getFinalSliceOnLvl(tid, lvl).offset, sliceSizes[tid][lvl].back(), tid,
lvl, reduc);
}
levelReducedDep[tid][lvl]++;
sliceDrivenInfo.emplace_back(tid, lvl, fullyReduced);
} else {
Value lo = isSparseCond ? posits[tid][lvl] // current offset
: loopSeqStack.back().first; // universal index
Value hi = highs[tid][lvl];
if (isDenseSliceCond) {
bool fullyReduced = depFullyReduced(tid, lvl);
Value sliceSz = sliceSizes[tid][lvl][sliceStack[tid].back().depth - 1];
// Adjust the for-loop upper bound for a dense slice-driven loop.
if (fullyReduced) {
hi = sliceSz;
condTidLvl.push_back(makeTensorLevel(tid, lvl));
} else {
hi = SUBI(lvlSizes[tid][lvl], sliceSz);
hi = ADDI(hi, C_IDX(1));
}
} else {
condTidLvl.push_back(makeTensorLevel(tid, lvl));
}
l = emitForLoopOverTensorAtLvl(builder, loc, tid, lvl, lo, hi, reduc,
isParallel);
}
Value iv = coords[tid][lvl];
for (auto [t, l] : unpackTensorLevelRange(tidLvls)) {
// We only need to handle slice-driven loops on dense levels here.
// If it is a slice-driven loop on a sparse level, it needs a while loop to
// insert break statements, and it must have been handled correctly in L692.
if (!dependentLvlMap[t][l].empty() && isDenseDLT(lvlTypes[t][l])) {
// Pushes sliced levels to build correct LoopInfo.
bool fullyReduc = depFullyReduced(t, l);
SliceInfo &info = sliceStack[t].back();
if (fullyReduc) {
posits[t][l] = genAddress(builder, loc, t, l, ADDI(info.offset, iv));
} else {
// Puts sliced dense loop into LoopInfo so that LoopEmitter knows how to
// exit it.
sliceDrivenInfo.emplace_back(t, l, fullyReduc);
// Update the slice information as we enter the new loop.
assert(*info.slicedOnLvl == l);
info.minCrd = info.offset = iv;
info.isNonEmpty = constantI1(builder, loc, true);
levelReducedDep[t][l]++;
}
}
}
// NOTE: we can also prepare for next dim here in advance
// Pushes the loop into stack.
loopStack.emplace_back(condTidLvl, sliceDrivenInfo, l,
builder.getInsertionBlock(), iv, loopTag);
// Emit extra locals.
emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tidLvls);
return l;
}
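// Emits a filter loop, roughly of the following form (illustrative sketch):
//
//   for (pos = pLo; pos < pHi; pos++) {
//     crd = coordinates[pos];
//     if (crd == eval(affine)) {
//       // user-generated code
//     }
//   }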
Operation *LoopEmitter::enterFilterLoopOverTensorAtLvl(
OpBuilder &builder, Location loc, TensorId tid, Level lvl,
AffineExpr affine, MutableArrayRef<Value> reduc) {
assert(isValidLevel(tid, lvl));
assert(!affine.isa<AffineDimExpr>() && !isDenseDLT(lvlTypes[tid][lvl]));
// We can not re-enter the same level.
assert(!coords[tid][lvl]);
// TODO: We should instead use a whileOp for filter loop to allow early
// break when exceeding (for ordered levels).
// TODO: There are many other potential opportunities that we might apply in
// the future. E.g., we could use binary search to locate positions.
const Value step = C_IDX(1);
const Value pLo = posits[tid][lvl];
const Value pHi = highs[tid][lvl];
scf::ForOp forOp = builder.create<scf::ForOp>(loc, pLo, pHi, step, reduc);
// In-place update on the reduction variable vector.
assert(forOp.getNumRegionIterArgs() == reduc.size());
for (int i = 0, e = reduc.size(); i < e; i++)
reduc[i] = forOp.getRegionIterArg(i);
builder.setInsertionPointToStart(forOp.getBody());
// The induction variable gives the position.
const Value pos = forOp.getInductionVar();
posits[tid][lvl] = pos;
// Generating a load on the coordinates array yields the crd.
const Value mem = coordinatesBuffers[tid][lvl];
const Value crd = genIndexLoad(builder, loc, mem, pos);
coords[tid][lvl] = crd;
// Generate an if-condition to filter out coordinates that are not
// equal to the result of the affine expression.
Value expected = genAffine(builder, loc, affine);
auto pred = CMPI(eq, crd, expected);
SmallVector<Type> types;
for (Value red : reduc) {
types.push_back(red.getType());
}
bool hasReduc = !types.empty();
scf::IfOp ifOp =
builder.create<scf::IfOp>(loc, types, pred, /*else*/ hasReduc);
if (hasReduc) {
// scf.for (a) -> v
// %s = scf.if (a) -> v
// user-generated code.
// else
// yield a
// yield %s
YIELD(ifOp.getResults());
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
// On mismatch.
YIELD(reduc);
}
// Set the insert point to matched branch.
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
// NOTE: we can also prepare for next lvl here in advance
// Push the loop into stack
loopStack.emplace_back(ArrayRef<TensorLevel>(makeTensorLevel(tid, lvl)),
ArrayRef<SliceLoopInfo>(), forOp,
builder.getInsertionBlock(), coords[tid][lvl],
nullptr);
return forOp;
}
void LoopEmitter::genDenseAffineAddress(OpBuilder &builder, Location loc,
TensorLevel tidLvl,
AffineExpr lvlExpr) {
auto [tid, lvl] = unpackTensorLevel(tidLvl);
assert(isDenseDLT(lvlTypes[tid][lvl]));
// For dense levels, the level-coordinate also serves as the position.
Value lvlCrd = genAffine(builder, loc, lvlExpr);
posits[tid][lvl] = genAddress(builder, loc, tid, lvl, lvlCrd);
}
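// Emits a co-iterating while loop over the given sparse levels. Illustrative
// sketch for the simple case of two sparse inputs a and b (no slices, all
// levels unique); the position updates are emitted later in exitWhileLoop:
//
//   while (pos_a < hi_a && pos_b < hi_b) {
//     crd_a = coords_a[pos_a];
//     crd_b = coords_b[pos_b];
//     min = umin(crd_a, crd_b);
//     // user-generated code on min
//     pos_a += (crd_a == min) ? 1 : 0;
//     pos_b += (crd_b == min) ? 1 : 0;
//   }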
Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls(
OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls,
bool needsUniv, MutableArrayRef<Value> reduc) {
// NOTE: the slice driven tensor-related reduction variable must
// appear before normal tensors.
SmallVector<Type> types;
SmallVector<Value> operands;
// Construct the while-loop with a parameter for each coordinate.
const Type indexType = builder.getIndexType();
for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
// TODO: support coiteration with slice driven tensors.
const auto lvlTp = lvlTypes[tid][lvl];
assert(dependentLvlMap[tid][lvl].empty() && "TODO: not yet implemented");
if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
isCompressedWithHiDLT(lvlTp)) {
const auto reassoc = getCollapseReassociation(tid, lvl);
for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) {
// This is the segment high for each non-unique level.
types.push_back(indexType);
operands.push_back(C_IDX(0));
}
}
const auto pos = posits[tid][reassoc.front()];
assert(pos);
types.push_back(indexType);
operands.push_back(pos);
}
}
// The position where the user-supplied reduction variables start.
for (Value rec : reduc) {
types.push_back(rec.getType());
operands.push_back(rec);
}
if (needsUniv) {
types.push_back(indexType);
// Update universal index.
operands.push_back(loopSeqStack.back().first);
}
assert(types.size() == operands.size());
scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);
SmallVector<Location> locs(types.size(), loc);
Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);
// Build the "before" region, which effectively consists
// of a conjunction of "i < upper" tests on all induction variables.
builder.setInsertionPointToStart(&whileOp.getBefore().front());
Value cond;
unsigned o = 0;
for (auto [t, lvl] : unpackTensorLevelRange(tidLvls)) {
const TensorId tid = t; // Why can't `t` be captured by a lambda?
const auto lvlTp = lvlTypes[tid][lvl];
if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
isCompressedWithHiDLT(lvlTp)) {
const auto reassoc = getCollapseReassociation(tid, lvl);
assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) {
// Links the SSA chain for segHi.
segHi[tid][reassoc[i]] = after->getArgument(o++);
}
}
Value op1 = before->getArgument(o);
// We use the first level bound as the bound for the collapsed set of levels.
Value op2 = highs[tid][reassoc.front()];
Value opc = CMPI(ult, op1, op2);
cond = cond ? ANDI(cond, opc) : opc;
// Update positions
Value pos = after->getArgument(o++);
// For COO, the position is the same across consecutive levels.
/// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
llvm::for_each(reassoc, [this, tid, pos](Level srcLvl) {
posits[tid][srcLvl] = pos;
});
}
}
builder.create<scf::ConditionOp>(loc, cond, before->getArguments());
// Generates while body.
builder.setInsertionPointToStart(&whileOp.getAfter().front());
SmallVector<std::pair<Value, unsigned>> slicesPreds;
unsigned i = 0;
for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
// Prepares for next level.
const auto lvlTp = lvlTypes[tid][lvl];
if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
isCompressedWithHiDLT(lvlTp)) {
coords[tid][lvl] = genSparseCrd(builder, loc, tid, lvl);
if (isSparseSlices[tid]) {
auto [trans, pred] =
genSliceLegitPredicate(builder, loc, coords[tid][lvl], tid, lvl);
slicesPreds.emplace_back(pred, i);
// Update to the coordinate relative to the slice.
coords[tid][lvl] = trans;
}
i++;
}
}
if (!slicesPreds.empty()) {
// Skip invalid loop iterations when the slice coordinate is inapplicable.
SmallVector<Value> yields(after->getArguments());
// Generates a list of if statements
// pos = in_slice ? pos : pos + 1
// TODO: instead of always picking pos + 1, we should set pos = high to
// break to loop if the coordinates are larger than the slice size.
//
// This "idx" is the index into `llvm::zip(tids, lvls)`
for (auto [pred, idx] : slicesPreds) {
Value nextPos = ADDI(yields[idx], C_IDX(1));
yields[idx] = SELECT(pred, yields[idx], nextPos);
}
Value pred = slicesPreds.front().first;
for (int i = 1, e = slicesPreds.size(); i < e; i++) {
pred = ANDI(pred, slicesPreds[i].first);
}
auto ifOp = builder.create<scf::IfOp>(loc, types, pred, /*else*/ true);
ifOp->setAttr(getLoopEmitterLoopAttrName(),
StringAttr::get(builder.getContext(), "slice"));
YIELD(ifOp->getResults());
assert(types.size() == yields.size());
// If not all slices are legit
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
YIELD(yields);
// If all slices are legit, start the user generated code.
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
}
Value min;
// Finds the minimum coordinate
if (!needsUniv) {
for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
const auto lvlTp = lvlTypes[tid][lvl];
if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
isCompressedWithHiDLT(lvlTp)) {
const auto crd = coords[tid][lvl];
if (min) {
Value cmp = CMPI(ult, coords[tid][lvl], min);
min = SELECT(cmp, coords[tid][lvl], min);
} else {
min = crd;
}
}
}
} else {
assert(!min);
// Otherwise, the universal index is the minimum position.
min = after->getArguments().back();
}
// Sets up the loop stack.
loopStack.emplace_back(tidLvls, ArrayRef<SliceLoopInfo>(), whileOp,
builder.getInsertionBlock(), min, loopTag);
assert(loopStack.size() == loopSeqStack.size());
for (auto [tid, dstLvl] : unpackTensorLevelRange(tidLvls)) {
const auto reassoc = getCollapseReassociation(tid, dstLvl);
assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
// TODO: Refactors this into smaller functions.
// NOTE: This handles all the collapsed levels except the last one (which is
// why the loop ends at `reassoc.size() - 1`). Each iteration is advanced by
// the segment size of the last level, which does not always invalidate the
// segment size of the previous levels; thus we need to propagate the segment
// sizes across loop iterations and only forward them when needed.
//
// E.g., for a COO tensor with the following coordinates array.
// (0, 0, 1),
// (0, 0, 2),
// (1, 1, 1),
// segHi[lvl=0] = segHi[lvl=1] = 2
// segHi[lvl=2] = 1,
// the first iteration does not invalidate segHi[0] and segHi[1]
for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
const Level srcLvl = reassoc[i];
if (!isUniqueDLT(lvlTypes[tid][srcLvl])) {
const Value pos = posits[tid][srcLvl];
const auto oldSegHi = segHi[tid][srcLvl];
assert(oldSegHi);
Value newSegHi = builder.create<arith::CmpIOp>(
loc, arith::CmpIPredicate::uge, pos, oldSegHi);
auto ifNewSegHi = builder.create<scf::IfOp>(loc, builder.getIndexType(),
newSegHi, true);
{
OpBuilder::InsertionGuard guard(builder);
builder.setInsertionPointToStart(ifNewSegHi.thenBlock());
YIELD(genSegmentHigh(builder, loc, tid, srcLvl, pos,
highs[tid][srcLvl]));
// Else, reuse the same segment high.
builder.setInsertionPointToStart(ifNewSegHi.elseBlock());
YIELD(oldSegHi);
}
highs[tid][srcLvl + 1] = segHi[tid][srcLvl] = ifNewSegHi.getResult(0);
}
};
const auto srcLvl = reassoc.back();
if (!isUniqueDLT(lvlTypes[tid][srcLvl])) {
segHi[tid][srcLvl] = genSegmentHigh(
builder, loc, tid, srcLvl, posits[tid][srcLvl], highs[tid][srcLvl]);
}
}
// Emits extra locals
emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tidLvls);
// Updates reduction variables
assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 1 : 0));
// In-place update on reduction variable.
for (unsigned i = 0, e = reduc.size(); i < e; i++)
reduc[i] = after->getArgument(o + i);
return whileOp;
}
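// Illustrative sketch of the preparation below for an ordinary compressed
// level with parent position p:
//
//   posits[tid][lvl] = positions[p];     // pLo
//   highs[tid][lvl]  = positions[p + 1]; // pHi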
void LoopEmitter::prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
TensorId tid, Level dstLvl) {
assert(isValidLevel(tid, dstLvl));
const auto lvlTp = lvlTypes[tid][dstLvl];
if (isDenseDLT(lvlTp))
return;
const Value c0 = C_IDX(0);
const Value c1 = C_IDX(1);
for (const Level srcLvl : getCollapseReassociation(tid, dstLvl)) {
// Either the first level, or the previous level has been set.
/// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
assert(srcLvl == 0 || posits[tid][srcLvl - 1]);
if (isDenseDLT(lvlTp))
continue;
if (isCompressedDLT(lvlTp) || isCompressedWithHiDLT(lvlTp)) {
const Value mem = positionsBuffers[tid][srcLvl];
Value pLo = srcLvl == 0 ? c0 : posits[tid][srcLvl - 1];
if (isCompressedWithHiDLT(lvlTp))
pLo = builder.create<arith::MulIOp>(loc, pLo, C_IDX(2));
posits[tid][srcLvl] = genIndexLoad(builder, loc, mem, pLo);
const Value pHi = ADDI(pLo, c1);
highs[tid][srcLvl] = genIndexLoad(builder, loc, mem, pHi);
return;
}
if (isSingletonDLT(lvlTp)) {
const Value pLo = srcLvl == 0 ? c0 : posits[tid][srcLvl - 1];
posits[tid][srcLvl] = pLo;
// If we are coiterating non-unique levels, then use pHi=segHi;
// otherwise use pHi=pLo+1.
// NOTE: Just because the level is non-unique, that does not
// guarantee that segHi is defined: because we only generate segHi
// whenever coiterating, in order to improve code quality for the
// non-coiterating cases.
const auto parentSegHi = segHi[tid][srcLvl - 1];
highs[tid][srcLvl] =
(!isUniqueDLT(lvlTypes[tid][srcLvl - 1]) && parentSegHi)
? parentSegHi
: ADDI(pLo, c1);
return;
}
}
llvm_unreachable("Unrecognized level-type!");
}
void LoopEmitter::emitExtraLocalsForTensorsAtDenseLvls(
OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls) {
// Initialize dense positions. Note that we generate dense coordinates of the
// output tensor unconditionally, since they may not appear in the lattice,
// but may be needed for linearized codegen.
for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
if (isDenseDLT(lvlTypes[tid][lvl])) {
// Slice-driven dense levels should have been handled already.
if (!dependentLvlMap[tid][lvl].empty())
continue;
auto enc = getSparseTensorEncoding(tensors[tid].getType());
if (enc && !isSparseOutput(tid)) {
bool validPos = lvl == 0 || posits[tid][lvl - 1];
if (!validPos) {
// We might not find the pos for the sparse output tensor as it is
// unconditionally required by the sparsification.
assert(isOutputTensor(tid));
continue;
}
posits[tid][lvl] =
genAddress(builder, loc, tid, lvl, loopStack.back().iv);
// NOTE: we can also prepare for next lvl here in advance
}
}
}
}
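// Illustrative sketch of the parallel-reduction rewrite performed below
// (operation names hypothetical): a user-built expression such as
//   %r = arith.addf %initValHandle, %v
// is cloned into the scf.reduce region of the parallel loop as
//   scf.reduce(%v) {
//   ^bb0(%lhs, %rhs):
//     %r = arith.addf %lhs, %rhs
//     scf.reduce.return %r
//   }
// and the original expression is erased.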
void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
MutableArrayRef<Value> reduc) {
const LoopInfo &loopInfo = loopStack.back();
rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
for (auto [tid, lvl, reduced] : loopInfo.sliceDrivenInfo) {
SliceInfo &info = sliceStack[tid].back();
assert(isDenseDLT(lvlTypes[tid][lvl]));
assert(*info.slicedOnLvl == lvl && !reduced);
(void)reduced;
// Reset the slice pointers, as the resolved slices are invalidated after
// we move forward to the next slice.
invalidateSliceIterIdx(rewriter, loc, tid, lvl);
info.minCrd = info.offset = info.isNonEmpty = Value();
levelReducedDep[tid][lvl]--;
}
if (auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop)) {
if (!reduc.empty()) {
assert(reduc.size() == forOp.getNumResults());
rewriter.create<scf::YieldOp>(loc, reduc);
}
// Exit the loop.
rewriter.setInsertionPointAfter(forOp);
// In-place update reduction variables.
for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
reduc[i] = forOp.getResult(i);
} else {
auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
if (!reduc.empty()) {
assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
Operation *redExp = reduc.front().getDefiningOp();
// Reduction expression should have no use.
assert(redExp->getUses().empty());
// This must be a binary operation.
// NOTE: It is the user's responsibility to ensure the operation is
// commutative.
assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
Value redVal = parOp.getInitVals().front();
Value curVal;
if (redExp->getOperand(0) == redVal)
curVal = redExp->getOperand(1);
else if (redExp->getOperand(1) == redVal)
curVal = redExp->getOperand(0);
// One of the operands must be the init value (which is also the
// previous reduction value).
assert(curVal);
#ifndef NDEBUG
// The reduction expression should be the only user of the reduction val
// inside the parallel for.
unsigned numUsers = 0;
for (Operation *op : redVal.getUsers()) {
if (op->getParentOp() == parOp)
numUsers++;
}
assert(numUsers == 1);
#endif // NDEBUG
rewriter.setInsertionPointAfter(redExp);
auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
// Attach to the reduction op.
Block *redBlock = &redOp.getRegion().getBlocks().front();
rewriter.setInsertionPointToEnd(redBlock);
Operation *newRed = rewriter.clone(*redExp);
// Replaces arguments of the reduction expression by using the block
// arguments from scf.reduce.
rewriter.updateRootInPlace(
newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
// Erases the out-dated reduction expression.
rewriter.eraseOp(redExp);
rewriter.setInsertionPointToEnd(redBlock);
rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
}
rewriter.setInsertionPointAfter(parOp);
// In-place update reduction variables.
for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
reduc[i] = parOp.getResult(i);
}
// Finished iterating a tensor; clean up.
// We only do the clean-up for for-loops, since while-loops do not necessarily
// finish iterating over a sparse tensor.
for (auto [tid, lvl] : unpackTensorLevelRange(loopInfo.tidLvls)) {
// Reset to null.
coords[tid][lvl] = Value();
posits[tid][lvl] = Value();
// Dense level, high is fixed.
if (!isDenseDLT(lvlTypes[tid][lvl]))
highs[tid][lvl] = Value();
}
}
void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc,
MutableArrayRef<Value> reduc) {
const LoopInfo &loopInfo = loopStack.back();
auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop);
builder.setInsertionPointToEnd(loopInfo.userCodeBlock);
Value iv = loopInfo.iv;
// Finalize the induction. Note that the induction could be performed
// in the individual if-branches to avoid re-evaluating the conditions.
// However, that would result in a rather elaborate forest of yield
// instructions during code generation. Moreover, performing the induction
// after the if-statements more closely resembles code generated by TACO.
unsigned o = 0;
SmallVector<Value> operands;
unsigned delta = 0;
for (auto [tid, lvl, resolved] : loopInfo.sliceDrivenInfo) {
// TODO: handle dense.
assert(isCompressedDLT(lvlTypes[tid][lvl]));
levelReducedDep[tid][lvl]--;
if (!resolved) {
genSliceNextInduction(builder, loc, whileOp, tid, lvl, operands, o);
continue;
}
// TODO: We need to distinguish the coiteration loop from the slice-driven
// loop and from the fully reduced while op for iterating one slice.
// FIXME: since we didn't implement coiteration, this must be iteration
// over just a fully resolved slice.
assert(loopInfo.sliceDrivenInfo.size() == 1 && loopInfo.tidLvls.empty());
// The if guard to filter out out-range coordinates.
assert(llvm::isa<scf::IfOp>(builder.getInsertionBlock()->getParentOp()));
posits[tid][lvl] = whileOp->getResult(o++);
// FIXME: we are not using continue here since we do not support
// coiteration on slices. But it needs to be treated similarly to the
// universal index.
o++; // skip continue flag.
// Since we did not push two results from the whileOp, the size of the
// operands vector is smaller than the actual number of return values from
// the whileOp.
// This is because we actually generate the yield in the ifOp inside the
// whileOp, so that we only iterate over inbound coordinates within the
// slices.
delta += 2;
};
Value one = C_IDX(1);
for (auto [tid, dstLvl] : unpackTensorLevelRange(loopInfo.tidLvls)) {
const auto lvlTp = lvlTypes[tid][dstLvl];
if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
isCompressedWithHiDLT(lvlTp)) {
const auto reassoc = getCollapseReassociation(tid, dstLvl);
assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
const Level srcLvl = reassoc[i];
if (!isUniqueDLT(lvlTypes[tid][srcLvl])) {
operands.push_back(segHi[tid][srcLvl]);
o++;
}
}
const Value crd = coords[tid][dstLvl];
const Value pos = posits[tid][dstLvl];
Value cmp = CMPI(eq, crd, iv);
// If the loop contains a coiteration with a non-unique level, we
// fast-forward all the duplicated coordinates by setting the position to
// the segment high.
Value add = !isUniqueDLT(lvlTypes[tid][reassoc.back()])
? segHi[tid][reassoc.back()]
: ADDI(pos, one);
operands.push_back(SELECT(cmp, add, pos));
// Following loops continue iteration from the break point of the
// current while loop.
const Value newPos = whileOp->getResult(o++);
// We need to define a new local variable for `tid` to avoid
// warnings about "captured structured bindings are a C++20 extension".
// FIXME(wrengr): define a helper function to capture this idiom!
const TensorId newTid = tid;
llvm::for_each(reassoc, [this, newTid, newPos](Level srcLvl) {
posits[newTid][srcLvl] = newPos;
});
// The coordinate is invalid now.
coords[tid][dstLvl] = nullptr;
// The segment high is invalid now.
segHi[tid][dstLvl] = nullptr;
// highs remains unchanged.
}
}
// Reduction value from users.
for (auto &i : reduc) {
operands.push_back(i);
// In place update reduction variable.
i = whileOp->getResult(o++);
}
// An (optional) universal index.
if (operands.size() + delta < whileOp.getNumResults()) {
assert(operands.size() + delta + 1 == whileOp.getNumResults());
// The last one is the universal index.
operands.push_back(ADDI(iv, one));
// Update the loop starting point of the current loop sequence.
loopSeqStack.back().first = whileOp->getResult(o++);
}
assert(o == operands.size() + delta);
YIELD(operands);
builder.setInsertionPointAfter(whileOp);
}
void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
MutableArrayRef<Value> reduc) {
// Clean up the values; this helps us discover potential bugs at an
// earlier stage (instead of silently using a wrong value).
const LoopInfo &loopInfo = loopStack.back();
SmallVector<Value> red;
if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
exitWhileLoop(rewriter, loc, reduc);
} else {
exitForLoop(rewriter, loc, reduc);
}
assert(loopStack.size() == loopSeqStack.size());
loopStack.pop_back();
}
//===----------------------------------------------------------------------===//
// Slice-driven loop related methods.
//===----------------------------------------------------------------------===//
unsigned LoopEmitter::remDepOnLevel(TensorId tid, Level lvl) const {
unsigned totalDependencies = dependentLvlMap[tid][lvl].size();
if (totalDependencies != 0) {
assert(totalDependencies >= 2);
return totalDependencies - levelReducedDep[tid][lvl];
}
return totalDependencies;
}
const LoopEmitter::SliceInfo &LoopEmitter::getMostRecentSliceOnLvl(TensorId tid,
Level lvl) {
// Finds the most-recent slice using a reverse iteration.
for (auto it = sliceStack[tid].rbegin(), ie = sliceStack[tid].rend(); it < ie;
it++) {
if (it->slicedOnLvl == lvl) { // the level matched
return *it;
}
}
llvm_unreachable("Failed to find sliceInfo");
}
// Generates a while loop to iterate over a slice sparse level as follows.
//
// while(loopLo < loopHi) {
// if (coords[loopLo] < offset + size) {
// body_builder
// } else {
// break;
// }
// loopLo ++;
// }
std::pair<Operation *, ValueRange> LoopEmitter::genSliceLvlTraverseLoop(
OpBuilder &builder, Location loc, Value loopLo, Value loopHi, Value offset,
Value size, TensorId tid, Level lvl, ValueRange userReduc, bool genYield,
LoopBodyBuilder bodyBuilder) {
Value c1 = C_IDX(1);
Value sliceHi = ADDI(offset, sliceSizes[tid][lvl].back());
SmallVector<Value> reduc = {
loopLo, // loop lower bounds
constantI1(builder, loc, true), // continue
};
// Append user required reduction value.
reduc.append(userReduc.begin(), userReduc.end());
scf::WhileOp whileOp = builder.create<scf::WhileOp>(
loc, ValueRange(reduc).getTypes(), reduc,
/*beforeBuilder=*/
[loopHi](OpBuilder &builder, Location loc, ValueRange args) {
Value lo = args[0];
Value cont = args[1];
Value inBound = CMPI(ult, lo, loopHi);
Value cond = ANDI(cont, inBound);
// Continue if we have neither hit a break nor gone out of bounds.
builder.create<scf::ConditionOp>(loc, cond, args);
},
/*afterBuilder=*/
[this, c1, tid, lvl, sliceHi, genYield,
bodyBuilder](OpBuilder &builder, Location loc, ValueRange args) {
Value iv = args[0];
Value coord =
genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], iv);
Value cont = CMPI(ult, coord, sliceHi);
TypeRange types = args.drop_front(2).getTypes();
auto ifOp = builder.create<scf::IfOp>(loc, types, cont, true);
{
// 2 reduction variables are maintained by us.
SmallVector<Value> ifRet = args.drop_front(2);
assert(ifRet.size() == args.size() - 2);
OpBuilder::InsertionGuard guard(builder);
// If coord >= sliceHi.
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
YIELD(ifRet);
// If coord < sliceHi.
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
// Delegates to users' callback.
bodyBuilder(builder, loc, iv, ifRet);
if (genYield) {
builder.setInsertionPointToEnd(&ifOp.getThenRegion().front());
YIELD(ifRet);
}
}
// Marks this special ifOp so that sparsification does not finalize it.
ifOp->setAttr(getLoopEmitterLoopAttrName(),
StringAttr::get(builder.getContext(), "slice"));
// Insertion point restored to after ifOp.
SmallVector<Value> yields;
// Increase induction variable.
yields.push_back(ADDI(iv, c1));
yields.push_back(cont);
yields.append(ifOp.getResults().begin(), ifOp.getResults().end());
YIELD(yields);
});
builder.setInsertionPointAfter(whileOp);
return std::make_pair(whileOp, whileOp.getResults().drop_front(2));
}
// Generates a loop nest that traverses all the unresolved levels in between.
// TODO: it can only handle all-compressed tensors.
//
// for(int i = 0; i < slicePos.size(); i+=2) {
// loopLo = slicePos[i];
// loopHi = slicePos[i + 1];
//
// // Then the same loop generated by genSliceLvlTraverse above.
// while (loopLo < loopHI) {
// if (pos[loopLo] < sliceHi) {
// bodyBuilder();
// } else {
// break;
// }
// loopLo ++;
// }
// }
ValueRange LoopEmitter::genUnResolvedSliceTreeTraverse(
OpBuilder &builder, Location loc, TensorId tid,
ArrayRef<const SliceInfo *> unResLvls,
std::optional<std::pair<TensorId, Level>> firstResLvl, ValueRange userReduc,
LoopBodyBuilder bodyBuilder) {
Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
Value pos = c0;
OpBuilder::InsertPoint ip;
SmallVector<Value> innerArgs(userReduc.begin(), userReduc.end());
scf::ForOp outerMost = nullptr; // the outermost loop.
if (firstResLvl.has_value()) {
// Overwrite position when the first level is fully resolved.
pos = posits[firstResLvl->first][firstResLvl->second];
ip = builder.saveInsertionPoint();
} else {
const SliceInfo &frontSlice = *unResLvls.back();
Level firstLvl = *frontSlice.slicedOnLvl;
if (!lvlFullyResolved(tid, firstLvl)) {
if (isCompressedDLT(lvlTypes[tid][firstLvl])) {
unsigned depth = frontSlice.depth - 1;
Value offset = frontSlice.offset;
Value sPtrBuf = slicePosBuffer[tid][firstLvl][depth];
Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize
outerMost = builder.create<scf::ForOp>(
loc, c2, mSz, c2, innerArgs,
[this, c1, tid, firstLvl, offset, sPtrBuf, &ip, &pos,
&innerArgs](OpBuilder &builder, Location loc, Value iv,
ValueRange iterArgs) {
// generate traversal for each level.
Value loopLo = genIndexLoad(builder, loc, sPtrBuf, iv);
Value loopHi = genIndexLoad(builder, loc, sPtrBuf, ADDI(iv, c1));
ValueRange itArgs =
genSliceLvlTraverseLoop(
builder, loc, loopLo, loopHi, offset,
sliceSizes[tid][firstLvl].back(), tid, firstLvl, iterArgs,
false,
[&](OpBuilder &builder, Location, Value iv,
MutableArrayRef<Value> reduc) {
ip = builder.saveInsertionPoint();
pos = iv;
innerArgs.assign(reduc.begin(), reduc.end());
})
.second;
YIELD(itArgs);
});
} else if (isDenseDLT(lvlTypes[tid][firstLvl])) {
assert(firstLvl == 0); // This must be the first level.
Value lb = frontSlice.offset;
Value sliceSz =
sliceSizes[tid][*frontSlice.slicedOnLvl][frontSlice.depth - 1];
Value ub = ADDI(lb, sliceSz);
outerMost = builder.create<scf::ForOp>(
loc, lb, ub, c1, innerArgs,
[&](OpBuilder &builder, Location loc, Value iv,
ValueRange iterArgs) {
ip = builder.saveInsertionPoint();
pos = iv;
innerArgs.assign(iterArgs.begin(), iterArgs.end());
});
}
// We generated the loop for the first slice above, now remove it.
unResLvls = unResLvls.drop_back();
}
}
// Reset the insertion point into the loop body.
builder.restoreInsertionPoint(ip);
if (!unResLvls.empty()) {
// Fill in the dense slice levels in between.
SmallVector<Value> lbs, ubs, steps, lvlSzs;
for (const SliceInfo *slice : llvm::reverse(unResLvls)) {
Level sliceLvl = *slice->slicedOnLvl;
assert(isDenseDLT(lvlTypes[tid][sliceLvl]));
Value offset = slice->offset;
Value sliceSz = sliceSizes[tid][sliceLvl][slice->depth - 1];
lbs.push_back(offset);
ubs.push_back(ADDI(offset, sliceSz));
steps.push_back(c1);
lvlSzs.push_back(lvlSizes[tid][sliceLvl]);
}
auto denseNest =
scf::buildLoopNest(builder, loc, lbs, ubs, steps, innerArgs,
[&innerArgs, &lvlSzs, &pos, bodyBuilder](
OpBuilder &builder, Location loc, ValueRange ivs,
ValueRange iterArgs) -> scf::ValueVector {
for (auto em : llvm::enumerate(ivs)) {
// Linearize the position: pos = (pos * lvlSize) + iv;
pos = MULI(pos, lvlSzs[em.index()]);
pos = ADDI(pos, em.value());
}
innerArgs.assign(iterArgs.begin(), iterArgs.end());
// Generate the user-requested loop body.
bodyBuilder(builder, loc, pos, innerArgs);
return innerArgs;
});
if (!outerMost) {
// If the outermost loop has not been set, this is the outermost loop.
outerMost = denseNest.loops.front();
} else {
// Otherwise we need to generate yield operations to link the SSA chain.
YIELD(denseNest.results);
}
} else {
assert(outerMost);
// Generate the user-requested loop body.
bodyBuilder(builder, loc, pos, innerArgs);
YIELD(innerArgs);
}
assert(outerMost);
// Set the insertion point to after the outermost loop.
builder.setInsertionPointAfter(outerMost);
return outerMost.getResults();
}
void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc,
TensorId tid, Level lvl) {
Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2), c3 = C_IDX(3),
c4 = C_IDX(4);
if (isDenseDLT(lvlTypes[tid][lvl])) {
// Dense slice begin is trivial.
sliceStack[tid].emplace_back(/*minCoord=*/c0, /*offset=*/c0,
/*nonEmpty=*/constantI1(builder, loc, true),
lvl, /*depth=*/1);
return;
}
Value size = sliceSizes[tid][lvl][0];
Value sPtrBuf = slicePosBuffer[tid][lvl][0];
Value pHi, pLo;
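// Load the position range [pLo, pHi) for this level: the whole level when
// lvl == 0, otherwise the segment selected by the parent position.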
if (lvl == 0) {
pLo = c0;
pHi = genIndexLoad(builder, loc, positionsBuffers[tid][0], c1);
} else {
pLo = genIndexLoad(builder, loc, positionsBuffers[tid][lvl],
posits[tid][lvl - 1]);
pHi = genIndexLoad(builder, loc, positionsBuffers[tid][lvl],
ADDI(posits[tid][lvl - 1], c1));
}
// Fills out sPtrBuf (slicePosBuffer[tid][lvl][0]) with
// [/*memSize=*/4, /*idx=*/0, pLo, pHi].
builder.create<memref::StoreOp>(loc, c4, sPtrBuf, c0); // memSize = 4
builder.create<memref::StoreOp>(loc, c0, sPtrBuf, c1); // index = 0
builder.create<memref::StoreOp>(loc, pLo, sPtrBuf, c2); // pLo
builder.create<memref::StoreOp>(loc, pHi, sPtrBuf, c3); // pHi
// The tensor is non-empty if 0 < pHi.
Value isNonEmpty = CMPI(ult, c0, pHi);
// The minimal coordinate must appear at the first position on an ordered
// level.
// FIXME: Technically we should only load the coordinate when the slice is
// non-empty. However, we assume that even for an empty sparse tensor a
// non-empty ptr/idx buffer is allocated for each level, so the load cannot
// go out of bounds; this lets us avoid generating an ifOp here.
Value minCrd = genIndexLoad(builder, loc, coordinatesBuffers[tid][0], c0);
// FIXME: We need the relative offset with respect to the base slice.
Value absOffset = offsetFromMinCoord(builder, loc, minCrd, size, isNonEmpty);
sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, lvl, /*depth=*/1);
}
// Fills in the slicePosBuffer before the slice-driven loop begins.
// TODO: this can only handle all-compressed tensors.
//
// // Loop generated by `genUnResolvedSliceTreeTraverse`
// for(int i = 0; i < slicePos.size(); i+=2) {
// loopLo = slicePos[i];
// loopHi = slicePos[i + 1];
// minCrd = max;
// while (loopLo < loopHi) {
// if (pos[loopLo] < sliceHi) {
// // bodyBuilder
// slicePos[tid].push_back(pos[loopLo]);
// slicePos[tid].push_back(pos[loopLo + 1]);
// minCrd = min(minCrd, crd[pos[loopLo]]);
// } else {
// break;
// }
// loopLo ++;
// }
// }
void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
TensorId tid, Level lvl) {
Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
unsigned depth = levelReducedDep[tid][lvl];
Value size = sliceSizes[tid][lvl][depth];
// Dense slice begin is trivial.
if (isDenseDLT(lvlTypes[tid][lvl])) {
sliceStack[tid].emplace_back(c0, c0, constantI1(builder, loc, false), lvl,
depth + 1);
return;
}
assert(isCompressedDLT(lvlTypes[tid][lvl]));
// Unhandled cases:
//
// 1st, lvl == prevSlicedLvl, i.e., t[d0 + d1 + d2,...] (more than one
// variable needs to be reduced on the same level).
//
// 2nd, lvl > prevSlicedLvl + 1, i.e., t[..., d2, d3 + d4] (having a
// simple dim expression in between).
assert(lvl == *sliceStack[tid].back().slicedOnLvl + 1);
// Check slice stack integrity.
assert(slicePosBuffer[tid][lvl - 1].size() == sliceStack[tid].back().depth);
SmallVector<const SliceInfo *> unResSlices;
std::optional<std::pair<TensorId, Level>> firstResLvl;
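// Walk up the levels to collect the unresolved slices constraining this
// level: stop at the first fully resolved level (recorded in firstResLvl),
// or right after the first non-dense unresolved level.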
for (Level curLvl = lvl; curLvl >= 1; curLvl--) {
Level prevLvl = curLvl - 1;
if (lvlFullyResolved(tid, prevLvl)) {
firstResLvl = std::make_pair(tid, prevLvl);
break;
}
unResSlices.push_back(&getMostRecentSliceOnLvl(tid, prevLvl));
if (!isDenseDLT(lvlTypes[tid][prevLvl])) {
break;
}
}
assert(!unResSlices.empty() &&
!lvlFullyResolved(tid, *unResSlices.front()->slicedOnLvl));
Value sPtrBuf = slicePosBuffer[tid][lvl].back();
SmallVector<Value, 3> reduc = {
constantI1(builder, loc, false), // isNonEmpty
lvlSizes[tid][lvl], // minCoord
c2, // memSize
};
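// The traversal body below caches a [pLo, pHi] pair for every parent
// position visited, while maintaining the slice's non-emptiness flag and
// its minimum coordinate as reductions.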
ValueRange result = genUnResolvedSliceTreeTraverse(
builder, loc, tid, unResSlices, firstResLvl, reduc,
[this, c1, c2, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc,
Value iv,
MutableArrayRef<Value> reduc) {
Value &nonEmpty = reduc[0];
Value &minCrd = reduc[1];
Value &curMemSz = reduc[2];
Value pHi = ADDI(iv, c1);
Value sPLo = genIndexLoad(builder, loc, positionsBuffers[tid][lvl], iv);
Value sPHi =
genIndexLoad(builder, loc, positionsBuffers[tid][lvl], pHi);
// isNonEmpty = isNonEmpty || lvlNonEmpty, i.e., as long as there is one
// non-empty lvl, the slice is non-empty.
Value lvlNonEmpty = CMPI(ult, sPLo, sPHi);
nonEmpty = builder.create<arith::OrIOp>(loc, lvlNonEmpty, nonEmpty);
// Update the minimum coordinate.
auto ifNonEmpty = builder.create<scf::IfOp>(loc, builder.getIndexType(),
lvlNonEmpty, true);
{
// Generate Code as follows.
//
// if (nonEmpty) {
// minCrd = min(minCrd, crd[pos[pLo]]);
// }
OpBuilder::InsertionGuard guard(builder);
builder.setInsertionPointToStart(ifNonEmpty.thenBlock());
Value curC =
genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], sPLo);
Value isSmaller = CMPI(ult, curC, minCrd);
Value newMin = SELECT(isSmaller, curC, minCrd);
YIELD(newMin);
builder.setInsertionPointToStart(ifNonEmpty.elseBlock());
YIELD(minCrd);
}
minCrd = ifNonEmpty.getResult(0);
builder.create<memref::StoreOp>(loc, sPLo, sPtrBuf, curMemSz);
Value nxtMemSize = ADDI(curMemSz, c1);
builder.create<memref::StoreOp>(loc, sPHi, sPtrBuf, nxtMemSize);
// curMemSize += 2
curMemSz = ADDI(curMemSz, c2);
});
Value isNonEmpty = result[0];
Value minCrd = result[1];
// Two metadata values [memSize, idx] stored at the head of the buffer.
// TODO: these two metadata values could be kept as SSA values instead.
builder.create<memref::StoreOp>(loc, result[2], sPtrBuf, c0);
builder.create<memref::StoreOp>(loc, c0, sPtrBuf, c1);
// FIXME: we need the relative offset with respect to the base slice.
Value absOffset = offsetFromMinCoord(builder, loc, minCrd, size, isNonEmpty);
sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, lvl, depth + 1);
}
bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
Level lvl) {
Value c1 = C_IDX(1), c2 = C_IDX(2);
if (depFullyReduced(tid, lvl)) {
// No need to prepare for the slice-driven loop on a dense level after it
// is fully reduced.
if (isDenseDLT(lvlTypes[tid][lvl]))
return true;
// If the constraints on the tensor are fully resolved, we no longer need
// to generate the slice begin; instead, we fall back to the TACO-based
// algorithm to (co)iterate over the slice.
Value pLoPtr =
genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), c1);
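// The current position pair lives at [idx + 2, idx + 3] in the buffer,
// i.e., past the two metadata entries {memSize, idx} at the head.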
pLoPtr = ADDI(pLoPtr, c2);
Value pHiPtr = ADDI(pLoPtr, c1);
posits[tid][lvl] =
genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), pLoPtr);
highs[tid][lvl] =
genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), pHiPtr);
return true;
}
// The next non-empty slice can be computed efficiently only when the level
// is sorted.
const DimLevelType lvlType = lvlTypes[tid][lvl];
assert(isOrderedDLT(lvlType));
if (isSingletonDLT(lvlType)) {
llvm_unreachable("TODO: dense level should be easy to support, while "
"singleton level requres more efforts");
}
assert(!dependentLvlMap[tid][lvl].empty());
assert(!sliceStack[tid].empty());
const SliceInfo &sliceInfo = sliceStack[tid].back();
auto baseEnc = getSparseTensorEncoding(tensors[tid].getType());
if (baseEnc.isSlice())
llvm_unreachable("TODO: not yet implemented");
// Generate the caches required to quickly compute the next non-empty slice
// with increasing offset for the slice-based loop.
// We do not need a cache for dense levels.
if (slicePosBuffer[tid][lvl][0] == nullptr && !isDenseDLT(lvlType)) {
OpBuilder::InsertionGuard guard(builder);
// The buffer can be reused, and the size is loop invariant: it only depends
// on the iteration graph's toposort.
builder.setInsertionPointAfter(localInsertPos);
Value bufSize = C_IDX(1);
Value c2 = C_IDX(2);
// Accumulates the size required to cache the pLo for the slice.
// E.g., to cache the pIdx for a slice<d0xd1xf64> on the second level, we
// need at most a memref<d0xindex>.
// NOTE: this is apparently an over-approximation when the previous level
// is compressed; we could compute a precise memory size inside the loops,
// but that would also require us to allocate/free memory inside loops.
// TODO: maybe use an AllocaScopeOp inside the loop to resolve the issue?
for (Level curLevel = lvl;
curLevel >= 1 && !lvlFullyResolved(tid, curLevel - 1); curLevel--) {
auto depth = remDepOnLevel(tid, curLevel - 1);
assert(sliceSizes[tid][lvl].size() >= depth);
Value sz = *(sliceSizes[tid][lvl].rbegin() + depth - 1);
bufSize = MULI(bufSize, sz);
}
// Times two, for each [pLo, pHi] pair. Note that we cannot compress away
// pHi, because slicing creates segments in the index buffer so that the
// pHi of the current level is no longer the pLo of the next level.
bufSize = MULI(bufSize, c2);
// Two additional metadata values {memSize, idx} at the head.
bufSize = ADDI(bufSize, c2);
llvm::for_each(
slicePosBuffer[tid][lvl], [bufSize, loc, &builder](Value &cache) {
cache = genAlloca(builder, loc, bufSize, builder.getIndexType());
});
}
if (sliceInfo.isInitialTensor() ||
(lvl >= 1 && lvlFullyResolved(tid, lvl - 1))) {
// First level, or the previous level has been fully resolved.
genResolvedSliceBegin(builder, loc, tid, lvl);
} else {
// The previous level has not been fully resolved.
genUnResolvedSliceBegin(builder, loc, tid, lvl);
}
return false;
}
void LoopEmitter::invalidateSliceIterIdx(OpBuilder &builder, Location loc,
TensorId tid, Level lvl) {
for (unsigned i = 0; i <= lvl; i++) {
if (!isDenseDLT(lvlTypes[tid][i]) && !dependentLvlMap[tid][i].empty()) {
builder.create<memref::StoreOp>(loc, C_IDX(0),
slicePosBuffer[tid][i].back(), C_IDX(1));
}
}
}
void LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,
const Operation *op, TensorId tid,
Level lvl,
SmallVectorImpl<Value> &operands,
unsigned &retIdx) {
if (!isCompressedDLT(lvlTypes[tid][lvl]))
llvm_unreachable("TODO");
// Otherwise, generate code to compute the next non-empty slice.
Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
auto whileOp = llvm::cast<scf::WhileOp>(op);
SliceInfo &info = sliceStack[tid].back();
assert(info.slicedOnLvl == lvl);
//
// We forward to the next non empty slice by
// if (minCrd > offset) {
// offset += 1
// } else {
// minCrd = nextMinInSlice();
// offset = minCrd - size + 1;
// }
//
// if (offset + size > parents.size)
// isNonEmpty = false;
//
Value absOffset = info.offset;
// Reset the slice pointers, as the resolved slices are invalidated once we
// move forward to the next slice.
invalidateSliceIterIdx(builder, loc, tid, lvl);
SmallVector<Value, 3> reduc = {info.minCrd, info.isNonEmpty, absOffset};
Value sPtrBuf = slicePosBuffer[tid][lvl][info.depth - 1];
Value fastPathP = CMPI(ugt, info.minCrd, absOffset);
auto ifOp = builder.create<scf::IfOp>(loc, ValueRange(reduc).getTypes(),
fastPathP, true);
{
OpBuilder::InsertionGuard guard(builder);
// Take the fast path
// if (minCrd > offset) {
// return offset += 1
// }
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
reduc[2] = ADDI(absOffset, c1);
// Yield offset + 1.
YIELD(reduc);
// else /*minCrd == offset*/ {
// for (i = 0; i < slicePos.size(); i+=2) {
// if (crd[pos[slicePos[i]]] == minCrd) {
// slicePos[i]++;
// }
// minCrd=min(minCrd, crd[pos[slicePos[i]]]);
// }
// offset = minCrd - size + 1;
// }
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
reduc[2] = absOffset; // restore value.
Value pSt = c2; // pointer starting index
Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize
reduc[0] = lvlSizes[tid][lvl]; // next min coord
reduc[1] = constantI1(builder, loc, false); // isNonEmpty
auto loopArgs = static_cast<ValueRange>(reduc).drop_back();
auto forOp = scf::buildLoopNest(
builder, loc, pSt, mSz, c2, loopArgs,
[this, tid, lvl, c1, sPtrBuf,
&info](OpBuilder &builder, Location loc, ValueRange ivs,
ValueRange iterArgs) -> scf::ValueVector {
Value curMinCrd = iterArgs[0];
Value isNonEmpty = iterArgs[1];
Type idxTp = builder.getIndexType();
Value pLo = genIndexLoad(builder, loc, sPtrBuf, ivs.front());
Value pHi =
genIndexLoad(builder, loc, sPtrBuf, ADDI(ivs.front(), c1));
//
// if (pLo < pHi) // Only load when in bounds.
// coord = load[pLo]
// if coord == minCrd
// pLo += 1
//
// if (pLo < pHi)
// curMinCrd = min(curMinCrd, load[pLo])
//
Value pred = CMPI(ult, pLo, pHi);
auto advPLo = builder.create<scf::IfOp>(loc, idxTp, pred, true);
/* if pLo < pHi */ {
builder.setInsertionPointToStart(&advPLo.getThenRegion().front());
// coord = load[pLo]
Value coord =
genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo);
Value pred = CMPI(eq, coord, info.minCrd);
auto ifEqual = builder.create<scf::IfOp>(loc, idxTp, pred, true);
/* if coord == minCrd */ {
builder.setInsertionPointToStart(
&ifEqual.getThenRegion().front());
Value newPlo = ADDI(pLo, c1);
// Updates the cache.
builder.create<memref::StoreOp>(loc, newPlo, sPtrBuf,
ivs.front());
YIELD(newPlo);
}
/* else coord != minCrd */ {
builder.setInsertionPointToStart(
&ifEqual.getElseRegion().front());
YIELD(pLo);
}
builder.setInsertionPointAfter(ifEqual);
YIELD(ifEqual.getResults());
}
/* else pLo >= pHi */ {
builder.setInsertionPointToStart(&advPLo.getElseRegion().front());
YIELD(pLo);
}
builder.setInsertionPointAfter(advPLo);
pLo = advPLo.getResult(0);
Value lvlNonEmpty = CMPI(ult, pLo, pHi);
// Update minCrds
auto newMin =
builder.create<scf::IfOp>(loc, idxTp, lvlNonEmpty, true);
builder.setInsertionPointToStart(&newMin.getThenRegion().front());
YIELD(genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo));
builder.setInsertionPointToStart(&newMin.getElseRegion().front());
YIELD(curMinCrd);
builder.setInsertionPointAfter(newMin);
// isNonEmpty = isNonEmpty || lvlNonEmpty
isNonEmpty =
builder.create<arith::OrIOp>(loc, lvlNonEmpty, isNonEmpty);
curMinCrd = builder.create<arith::SelectOp>(
loc, CMPI(ult, newMin.getResult(0), curMinCrd),
newMin.getResult(0), curMinCrd);
return {curMinCrd, isNonEmpty};
});
builder.setInsertionPointAfter(forOp.loops.front());
// minOffset = minCrd + 1 >= size ? minCrd + 1 - size : c0
Value tmp = ADDI(forOp.results.front(), c1);
Value minOffset = SUBI(tmp, sliceSizes[tid][lvl][info.depth - 1]);
Value p = CMPI(uge, tmp, sliceSizes[tid][lvl][info.depth - 1]);
minOffset = SELECT(p, minOffset, c0);
SmallVector<Value, 3> yields;
yields.assign(forOp.results.begin(), forOp.results.end());
yields.push_back(minOffset);
YIELD(yields);
}
Value nextMinCrd = ifOp.getResults()[0];
Value nextNonEmpty = ifOp.getResults()[1];
// The next offset should be at least offset + 1.
Value minOffset = ifOp.getResults()[2];
Value nxOffset = ADDI(info.offset, c1);
Value maxPred = CMPI(ugt, minOffset, nxOffset);
Value nextAbsOffset = SELECT(maxPred, minOffset, nxOffset);
Value sliceUB = ADDI(nextAbsOffset, sliceSizes[tid][lvl][info.depth - 1]);
// FIXME: this only works if there is only one parent.
assert(info.depth - 1 == 0);
// nextNonEmpty = nextNonEmpty && (slice upper bound <= parent upper bound).
nextNonEmpty = ANDI(nextNonEmpty, CMPI(ule, sliceUB, lvlSizes[tid][lvl]));
// FIXME: compute relative offset.
assert(info.depth - 1 == 0);
Value nextRelOffset = nextAbsOffset;
nextRelOffset = SELECT(nextNonEmpty, nextRelOffset, c0);
operands.push_back(nextNonEmpty);
operands.push_back(nextMinCrd);
operands.push_back(nextAbsOffset); // we push the absolute offset.
// Update the slice stack.
info.isNonEmpty = whileOp.getResult(retIdx++);
info.minCrd = whileOp.getResult(retIdx++);
info.offset = whileOp.getResult(retIdx++);
}
Operation *LoopEmitter::emitSliceDrivenLoopOverTensorAtLvl(
OpBuilder &builder, Location loc, TensorId tid, Level lvl,
MutableArrayRef<Value> reduc) {
assert(!depFullyReduced(tid, lvl));
SliceInfo &sliceInfo = sliceStack[tid].back();
assert(sliceInfo.slicedOnLvl == lvl);
// The order matters!
SmallVector<Value, 3> operands{sliceInfo.isNonEmpty, sliceInfo.minCrd,
sliceInfo.offset};
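// Layout of the loop-carried values: the three meta reductions
// (isNonEmpty, minCrd, offset) come first, followed by the user reductions.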
// Number of reductions maintained by us.
size_t numMetaReduc = operands.size();
// Append user-required reduction values.
operands.append(reduc.begin(), reduc.end());
assert(operands.size() == numMetaReduc + reduc.size());
// while (slice.nonEmpty()) {
// bodyBuilder();
// SliceNext();
// }
auto whileOp = builder.create<scf::WhileOp>(
loc, ValueRange(operands).getTypes(), operands,
/*beforeBuilder=*/
[](OpBuilder &builder, Location loc, ValueRange args) {
builder.create<scf::ConditionOp>(loc, /*isNonEmpty*/ args[0], args);
},
/*afterBuilder=*/
[this, tid, lvl, reduc, numMetaReduc,
&sliceInfo](OpBuilder &builder, Location loc, ValueRange args) {
assert(args.size() == reduc.size() + numMetaReduc);
sliceInfo.isNonEmpty = args[0];
sliceInfo.minCrd = args[1];
sliceInfo.offset = args[2];
// The slice offset is used to coiterate with other tensors'
// coordinates.
Value c = sliceInfo.offset;
if (sliceInfo.depth > 1) {
// Coord is the relative offset with respect to its parents.
// Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1].
llvm_unreachable("TODO: not yet implemented");
}
coords[tid][lvl] = c;
for (unsigned i = 0, e = reduc.size(); i < e; i++)
reduc[i] = args[i + numMetaReduc];
});
// Set the insertion point to the while-loop body.
builder.setInsertionPointToEnd(&whileOp.getAfter().front());
return whileOp;
}
#undef CMPI
#undef C_IDX
#undef YIELD
#undef ADDI
#undef ANDI
#undef SUBI
#undef MULI
#undef SELECT