
This is a major step along the way towards the new STEA design. While a great deal of this patch is simple renaming, there are several significant changes as well. I've done my best to ensure that this patch retains the previous behavior and error-conditions, even though those are at odds with the eventual intended semantics of the `dimToLvl` mapping. Since the majority of the compiler does not yet support non-permutations, I've also added explicit assertions in places that previously had implicitly assumed it was dealing with permutations.

Reviewed By: aartbik

Differential Revision: https://reviews.llvm.org/D151505
//===- LoopEmitter.cpp ----------------------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "LoopEmitter.h"
|
|
#include "CodegenUtils.h"
|
|
|
|
#include "mlir/Dialect/Arith/IR/Arith.h"
|
|
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
|
|
#include "mlir/Dialect/Linalg/IR/Linalg.h"
|
|
#include "mlir/Dialect/Linalg/Utils/Utils.h"
|
|
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
|
#include "mlir/Dialect/SCF/IR/SCF.h"
|
|
#include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
|
|
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
|
|
|
using namespace mlir;
|
|
using namespace mlir::sparse_tensor;
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// File local shorthand macros
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#define CMPI(p, l, r) \
|
|
(builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::p, (l), (r)) \
|
|
.getResult())
|
|
|
|
#define C_IDX(v) (constantIndex(builder, loc, (v)))
|
|
#define YIELD(vs) (builder.create<scf::YieldOp>(loc, (vs)))
|
|
#define ADDI(lhs, rhs) (builder.create<arith::AddIOp>(loc, (lhs), (rhs)))
|
|
#define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs)))
|
|
#define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs)))
|
|
#define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs)))
|
|
#define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r)))
|
|
|
|
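// For illustration, a call like `CMPI(uge, minCrd, size)` simply expands to
//   builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::uge, minCrd,
//                                 size).getResult()
// so every macro above assumes that `builder` and `loc` are in scope at the
// call site.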
//===----------------------------------------------------------------------===//
|
|
// File local helper functions.
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
static Value genSliceOffset(OpBuilder &builder, Location loc, Value tensor,
|
|
Level lvl) {
|
|
auto enc = getSparseTensorEncoding(tensor.getType());
|
|
// FIXME: `toOrigDim` is deprecated
|
|
return createOrFoldSliceOffsetOp(builder, loc, tensor, toOrigDim(enc, lvl));
|
|
}
|
|
|
|
static Value genSliceStride(OpBuilder &builder, Location loc, Value tensor,
|
|
Level lvl) {
|
|
auto enc = getSparseTensorEncoding(tensor.getType());
|
|
// FIXME: `toOrigDim` is deprecated
|
|
return createOrFoldSliceStrideOp(builder, loc, tensor, toOrigDim(enc, lvl));
|
|
}
|
|
|
|
/// Converts a coordinate relative to the slice to the coordinate relative
|
|
/// to the underlying tensor.
|
|
// FIXME: that description says "sliceCrd -> tensorCrd"; but the function
|
|
// name suggests it should be "tensorCrd -> sliceCrd".
|
|
static Value toSliceCrd(OpBuilder &builder, Location loc, Value crd,
|
|
Value offset, Value stride, Value tensor, Level lvl) {
|
|
// tensorCrd = sliceCrd * stride + offset
|
|
return ADDI(MULI(crd, stride), offset);
|
|
}
|
|
|
|
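// For illustration (hypothetical values): for a slice taken with offset 2 and
// stride 3, slice coordinate 4 maps to tensor coordinate 4 * 3 + 2 = 14,
// which is exactly the `MULI`/`ADDI` sequence emitted above.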
/// Generates code to compute the *absolute* offset of the slice based on the
/// provided minimum coordinates in the slice.
/// E.g., when reducing d0 + d1 + d2, we need two slices to fully reduce the
/// expression, i.e., s1 = slice(T, d0), s2 = slice(s1, d1). The *absolute*
/// offset is the offset computed relative to the initial tensor T.
///
/// When isNonEmpty == false, the computed offset is meaningless and should not
/// be used during runtime; the method currently generates code that returns 0
/// in that case.
///
/// offset = isNonEmpty && minCrd >= size ? minCrd - size + 1 : 0;
|
|
static Value offsetFromMinCoord(OpBuilder &builder, Location loc, Value minCrd,
|
|
Value size, Value isNonEmpty) {
|
|
Value geSize = CMPI(uge, minCrd, size);
|
|
Value pred = ANDI(isNonEmpty, geSize);
|
|
// Computes minCrd - size + 1
|
|
Value mms = SUBI(ADDI(minCrd, C_IDX(1)), size);
|
|
// This is the absolute offset relative to the underlying tensor.
|
|
return SELECT(pred, mms, C_IDX(0));
|
|
}
|
|
|
|
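// For illustration (hypothetical values): with isNonEmpty == true, size == 5,
// and minCrd == 7, the emitted code computes offset = 7 - 5 + 1 = 3; with
// minCrd == 3 (i.e., minCrd < size), or with an empty slice, it yields 0.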
/// Converts a coordinate relative to the underlying tensor to the coordinate
/// relative to the slice, returning an extra remainder value.
|
|
// FIXME: that description says "tensorCrd -> sliceCrd"; but the function
|
|
// name suggests it should be "sliceCrd -> tensorCrd".
|
|
static std::pair<Value, Value> fromSliceCrd(OpBuilder &builder, Location loc,
|
|
Value crd, Value offset,
|
|
Value stride, Value tensor,
|
|
Level lvl) {
|
|
// sliceCrd = (tensorCrd - offset) / stride
|
|
crd = SUBI(crd, offset);
|
|
Value rem = builder.create<arith::RemUIOp>(loc, crd, stride);
|
|
crd = builder.create<arith::DivUIOp>(loc, crd, stride);
|
|
return std::make_pair(crd, rem);
|
|
}
|
|
|
|
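// For illustration (hypothetical values): with offset 2 and stride 3, tensor
// coordinate 14 maps back to slice coordinate (14 - 2) / 3 = 4 with remainder
// 0, whereas tensor coordinate 15 yields (15 - 2) / 3 = 4 with remainder 1,
// i.e., a coordinate that is not actually covered by the slice.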
std::pair<Value, Value>
|
|
LoopEmitter::genSliceLegitPredicate(OpBuilder &builder, Location loc, Value crd,
|
|
TensorId tid, Level lvl) {
|
|
assert(isSparseSlices[tid]);
|
|
Value slice = tensors[tid];
|
|
Value offset = sliceOffsets[tid][lvl];
|
|
Value stride = sliceStrides[tid][lvl];
|
|
auto enc = getSparseTensorEncoding(slice.getType());
|
|
|
|
const auto [newCrd, crdRem] =
|
|
fromSliceCrd(builder, loc, crd, offset, stride, slice, lvl);
|
|
|
|
SmallVector<Value, 3> conds; // at most 3 conditions
|
|
|
|
// First, coord >= offset (skip the check if offset is known to be 0).
|
|
if (auto staticOffset = enc.getStaticLvlSliceOffset(lvl);
|
|
!(staticOffset.has_value() && *staticOffset == 0)) {
|
|
auto geOffset = CMPI(uge, crd, offset);
|
|
conds.push_back(geOffset);
|
|
}
|
|
|
|
// Second, coord_in_slice < length
|
|
auto ltLength = CMPI(ult, newCrd, lvlSizes[tid][lvl]);
|
|
conds.push_back(ltLength);
|
|
|
|
// Third, rem == 0 (skip the check if stride is known to be 1).
|
|
if (auto staticStride = enc.getStaticLvlSliceStride(lvl);
|
|
!(staticStride.has_value() && *staticStride == 1)) {
|
|
auto fitStride = CMPI(eq, crdRem, C_IDX(0));
|
|
conds.push_back(fitStride);
|
|
}
|
|
|
|
// Must meet all conditions to be a valid coordinate in the slice.
|
|
auto pred = conds.front();
|
|
for (auto cond : ValueRange(conds).drop_front())
|
|
pred = ANDI(pred, cond);
|
|
|
|
return {newCrd, pred};
|
|
}
|
|
|
|
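// For illustration (hypothetical values): for a slice with static offset 2,
// stride 3, and length 4, a tensor coordinate crd == 8 passes all three
// checks above: 8 >= 2, the translated coordinate (8 - 2) / 3 = 2 is < 4,
// and the remainder (8 - 2) % 3 is 0, so the emitted predicate is true.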
//===----------------------------------------------------------------------===//
|
|
// Sparse tensor loop emitter class implementations
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
Value LoopEmitter::genAddress(OpBuilder &builder, Location loc, TensorId tid,
|
|
Level lvl, Value crd) {
|
|
Value pos = lvl == 0 ? C_IDX(0) : posits[tid][lvl - 1];
|
|
Value mul = MULI(highs[tid][lvl], pos);
|
|
if (isSparseSlices[tid])
|
|
crd = toSliceCrd(builder, loc, crd, sliceOffsets[tid][lvl],
|
|
sliceStrides[tid][lvl], tensors[tid], lvl);
|
|
Value add = ADDI(mul, crd);
|
|
return add;
|
|
}
|
|
|
|
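// For illustration (hypothetical values): for a 3x4 all-dense tensor, the
// linearized address at lvl == 1 with parent position 2 and coordinate 3 is
// highs[tid][1] * 2 + 3 == 4 * 2 + 3 == 11 (modulo the slice translation
// applied above for sparse slices).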
Value LoopEmitter::genSegmentHigh(OpBuilder &builder, Location loc,
|
|
TensorId tid, Level lvl, Value pLo,
|
|
Value pHi) {
|
|
const auto coordinates = coordinatesBuffers[tid][lvl];
|
|
const auto sameCrd = genIndexLoad(builder, loc, coordinates, pLo);
|
|
auto whileOp = builder.create<scf::WhileOp>(
|
|
loc, builder.getIndexType(), pLo,
|
|
/*beforeBuilder=*/
|
|
[pHi, coordinates, sameCrd](OpBuilder &builder, Location loc,
|
|
ValueRange ivs) {
|
|
const auto pos = ivs[0];
|
|
Value inBound = builder.create<arith::CmpIOp>(
|
|
loc, arith::CmpIPredicate::ult, pos, pHi);
|
|
auto ifInBound =
|
|
builder.create<scf::IfOp>(loc, builder.getI1Type(), inBound, true);
|
|
{
|
|
OpBuilder::InsertionGuard guard(builder);
|
|
// Load the next coordinate only when in bounds (to avoid OOB
// accesses).
|
|
builder.setInsertionPointToStart(ifInBound.thenBlock());
|
|
Value crd = genIndexLoad(builder, loc, coordinates, pos);
|
|
Value isSameCrd = builder.create<arith::CmpIOp>(
|
|
loc, arith::CmpIPredicate::eq, crd, sameCrd);
|
|
YIELD(isSameCrd);
|
|
// Else, the position is out of bounds; yield false to terminate the
// loop.
|
|
builder.setInsertionPointToStart(ifInBound.elseBlock());
|
|
YIELD(constantI1(builder, loc, false));
|
|
}
|
|
builder.create<scf::ConditionOp>(loc, ifInBound.getResults()[0], ivs);
|
|
},
|
|
/*afterBuilder=*/
|
|
[](OpBuilder &builder, Location loc, ValueRange ivs) {
|
|
// pos ++
|
|
Value nextPos = ADDI(ivs[0], C_IDX(1));
|
|
YIELD(nextPos);
|
|
});
|
|
// Return the segment high.
|
|
return whileOp.getResult(0);
|
|
}
|
|
|
|
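// As a rough sketch (not verbatim output), the while-loop built above has the
// shape:
//   %hi = scf.while (%p = %pLo) : (index) -> index {
//     %ok = ... %p < %pHi && coords[%p] == coords[%pLo] ...
//     scf.condition(%ok) %p : index
//   } do {
//   ^bb0(%p: index):
//     %n = arith.addi %p, %c1 : index
//     scf.yield %n : index
//   }
// i.e., it scans forward from pLo until the coordinate changes (or pHi is
// reached) and returns the first position past the segment of duplicates.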
Value LoopEmitter::genSparseCrd(OpBuilder &builder, Location loc, TensorId tid,
|
|
Level dstLvl) {
|
|
Value crd = C_IDX(0);
|
|
const auto reassoc = getCollapseReassociation(tid, dstLvl);
|
|
const unsigned reassocSize = reassoc.size();
|
|
for (unsigned i = 0; i < reassocSize; i++) {
|
|
const Level srcLvl = reassoc[i];
|
|
// A load on the coordinates array yields the coordinate.
|
|
const Value mem = coordinatesBuffers[tid][srcLvl];
|
|
/// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
|
|
const Value pos = posits[tid][dstLvl];
|
|
const Value off = genIndexLoad(builder, loc, mem, pos);
|
|
// Linearize the coordinates within the same collapse reassociation.
|
|
crd = ADDI(crd, off);
|
|
if (i != reassocSize - 1) {
|
|
crd = MULI(crd, this->lvlSizes[tid][reassoc[i + 1]]);
|
|
}
|
|
}
|
|
return crd;
|
|
}
|
|
|
|
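// For illustration (hypothetical values): for a collapsed COO tensor with
// reassociation {0, 1}, lvlSizes[tid][1] == 10, and loaded coordinates 3 and
// 7, the loop above linearizes them as (0 + 3) * 10 + 7 == 37.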
LoopEmitter::LoopEmitter(ValueRange tensors, StringAttr loopTag, bool hasOutput,
|
|
bool isSparseOut, ArrayRef<LoopId> topSort,
|
|
DependentLvlGetter dimGetter) {
|
|
initialize(tensors, loopTag, hasOutput, isSparseOut, topSort, dimGetter);
|
|
}
|
|
|
|
void LoopEmitter::initialize(ValueRange ts, StringAttr loopTag, bool hasOutput,
|
|
bool isSparseOut, ArrayRef<LoopId> topSort,
|
|
DependentLvlGetter dimGetter) {
|
|
// First initialize the top-level type of the fields.
|
|
this->loopTag = loopTag;
|
|
this->hasOutput = hasOutput;
|
|
this->isSparseOut = isSparseOut;
|
|
|
|
const unsigned numTensors = ts.size();
|
|
this->tensors.assign(ts.begin(), ts.end());
|
|
this->lvlTypes.assign(numTensors, std::vector<DimLevelType>());
|
|
this->lvlSizes.assign(numTensors, std::vector<Value>());
|
|
this->highs.assign(numTensors, std::vector<Value>());
|
|
this->segHi.assign(numTensors, std::vector<Value>());
|
|
this->posits.assign(numTensors, std::vector<Value>());
|
|
this->coords.assign(numTensors, std::vector<Value>());
|
|
this->positionsBuffers.assign(numTensors, std::vector<Value>());
|
|
this->coordinatesBuffers.assign(numTensors, std::vector<Value>());
|
|
this->valBuffer.assign(numTensors, nullptr);
|
|
this->collapseReassoc.assign(numTensors, nullptr);
|
|
this->isSparseSlices.assign(numTensors, false);
|
|
this->sliceOffsets.assign(numTensors, std::vector<Value>());
|
|
this->sliceStrides.assign(numTensors, std::vector<Value>());
|
|
|
|
const LoopOrd numLoops = topSort.size();
|
|
// These zeros will be overwritten below, but we need to initialize
|
|
// them to something since we'll need random-access assignment.
|
|
this->loopIdToOrd.assign(numLoops, 0);
|
|
this->loopStack.reserve(numLoops);
|
|
this->loopSeqStack.reserve(numLoops);
|
|
|
|
// Index-reduction related fields.
|
|
this->dependentLvlMap.assign(
|
|
numTensors, std::vector<std::vector<std::pair<TensorId, Level>>>());
|
|
this->slicePosBuffer.assign(numTensors, std::vector<std::vector<Value>>());
|
|
this->sliceSizes.assign(numTensors, std::vector<std::vector<Value>>());
|
|
this->sliceStack.assign(numTensors, std::vector<SliceInfo>());
|
|
this->levelReducedDep.assign(numTensors, std::vector<unsigned>());
|
|
|
|
// Initialize nested types of `TensorId`-indexed fields.
|
|
for (TensorId tid = 0; tid < numTensors; tid++) {
|
|
const Value t = tensors[tid];
|
|
// A scalar or zero-dimensional tensor.
|
|
if (isZeroRankedTensorOrScalar(t.getType()))
|
|
continue;
|
|
|
|
auto rtp = getRankedTensorType(t);
|
|
if (auto reshape = t.getDefiningOp<tensor::CollapseShapeOp>();
|
|
isUniqueCOOType(rtp) && reshape) {
|
|
// TODO: Support more kinds of sparse tensors.
// FIXME: We should instead lower reshape operations on sparse tensors to
// a view change.
|
|
collapseReassoc[tid] = reshape.getReassociation();
|
|
rtp = reshape.getSrcType();
|
|
// Overwrites the tensor with the source tensor of the reshape operation.
|
|
tensors[tid] = reshape.getSrc();
|
|
}
|
|
const SparseTensorType stt(rtp);
|
|
const Level lvlRank = stt.getLvlRank();
|
|
// We always treat the sparse output tensor as dense so that we always
// iterate it based on the level size.
|
|
if (stt.hasEncoding() && !(isOutputTensor(tid) && isSparseOut)) {
|
|
const auto enc = stt.getEncoding();
|
|
isSparseSlices[tid] = enc.isSlice();
|
|
for (auto lvlTp : enc.getLvlTypes())
|
|
lvlTypes[tid].push_back(lvlTp);
|
|
} else {
|
|
lvlTypes[tid].assign(lvlRank, DimLevelType::Dense);
|
|
}
|
|
|
|
// Initialize using empty value.
|
|
lvlSizes[tid].assign(lvlRank, Value());
|
|
highs[tid].assign(lvlRank, Value());
|
|
segHi[tid].assign(lvlRank, Value());
|
|
posits[tid].assign(lvlRank, Value());
|
|
coords[tid].assign(lvlRank, Value());
|
|
positionsBuffers[tid].assign(lvlRank, Value());
|
|
coordinatesBuffers[tid].assign(lvlRank, Value());
|
|
sliceOffsets[tid].assign(lvlRank, Value());
|
|
sliceStrides[tid].assign(lvlRank, Value());
|
|
|
|
// Slice-driven loops related initialization.
|
|
levelReducedDep[tid].assign(lvlRank, 0);
|
|
dependentLvlMap[tid].assign(lvlRank,
|
|
std::vector<std::pair<TensorId, Level>>());
|
|
slicePosBuffer[tid].assign(lvlRank, std::vector<Value>());
|
|
sliceSizes[tid].assign(lvlRank, std::vector<Value>());
|
|
sliceStack[tid].emplace_back(/*minCrd=*/Value(),
|
|
/*offset=*/Value(), /*isNonEmpty*/ Value(),
|
|
std::nullopt, 0);
|
|
if (dimGetter) {
|
|
auto reassoc = collapseReassoc[tid];
|
|
Level dstRank = reassoc ? reassoc.size() : lvlRank;
|
|
for (Level l = 0; l < dstRank; l++) {
|
|
dependentLvlMap[tid][l] = dimGetter(tid, l);
|
|
unsigned depends = dependentLvlMap[tid][l].size();
|
|
if (depends == 0)
|
|
continue;
|
|
// TODO: View-based collapse and dependent index reduction are not
// compatible right now.
|
|
assert(!reassoc);
|
|
// We need `depends - 1` slices to fully reduce the affine expression.
|
|
sliceSizes[tid][l].assign(depends - 1, nullptr);
|
|
slicePosBuffer[tid][l].assign(depends - 1, nullptr);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Construct the inverse of the `topSort` from the sparsifier.
|
|
// This is needed to map `AffineDimExpr`s back to the `LoopOrd`
|
|
// used in loop emitter.
|
|
// FIXME: This map should be maintained outside loop emitter.
|
|
for (LoopOrd n = 0; n < numLoops; n++)
|
|
loopIdToOrd[topSort[n]] = n;
|
|
}
|
|
|
|
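// For illustration (hypothetical values): with topSort == [2, 0, 1], the loop
// above produces loopIdToOrd == [1, 2, 0], i.e., loop-id 2 is emitted first,
// loop-id 0 second, and loop-id 1 last.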
void LoopEmitter::initializeLoopEmit(OpBuilder &builder, Location loc,
|
|
LoopEmitter::OutputUpdater updater) {
|
|
// For every tensor:
|
|
// * get the values buffer.
|
|
// * For every level:
|
|
// * get the positions and coordinates buffers
|
|
// * get/compute the level-size, which is also used as the upper-bound
|
|
// on positions.
|
|
for (TensorId t = 0, numTensors = getNumTensors(); t < numTensors; t++) {
|
|
const Value tensor = tensors[t];
|
|
const auto rtp = dyn_cast<RankedTensorType>(tensor.getType());
|
|
if (!rtp)
|
|
// Skips only scalars; zero-ranked tensors still need to be bufferized and
// (probably) filled with zeros by users.
|
|
continue;
|
|
// FIXME: the definition of `lvlRank` looks more like a dim-rank;
|
|
// but the variable is used as a level everywhere below, which
|
|
// suggests there may be some dim/lvl confusion going on here.
|
|
const Level lvlRank = rtp.getRank();
|
|
const auto shape = rtp.getShape();
|
|
const auto enc = getSparseTensorEncoding(rtp);
|
|
const Level cooStart = enc ? getCOOStart(enc) : lvlRank;
|
|
// Scan all levels of current tensor.
|
|
for (Level l = 0; l < lvlRank; l++) {
|
|
// This should be called only once at beginning.
|
|
assert(!positionsBuffers[t][l] && !coordinatesBuffers[t][l] &&
|
|
!highs[t][l]);
|
|
const auto lvlTp = lvlTypes[t][l];
|
|
// Handle sparse storage schemes.
|
|
if (isCompressedDLT(lvlTp) || isCompressedWithHiDLT(lvlTp)) {
|
|
// Generate sparse primitives to obtain positions and coordinates.
|
|
positionsBuffers[t][l] = genToPositions(builder, loc, tensor, l);
|
|
coordinatesBuffers[t][l] =
|
|
genToCoordinates(builder, loc, tensor, l, cooStart);
|
|
} else if (isSingletonDLT(lvlTp)) {
|
|
// Singleton level, fetch coordinates.
|
|
coordinatesBuffers[t][l] =
|
|
genToCoordinates(builder, loc, tensor, l, cooStart);
|
|
} else {
|
|
// Dense level, nothing to fetch.
|
|
assert(isDenseDLT(lvlTp));
|
|
}
|
|
|
|
// FIXME: `toOrigDim` is deprecated. For now this relies on the
|
|
// 1:1 mapping between levels and dimensions, since nowhere else
|
|
// in the code supports non-permutations yet either.
|
|
Value lvlSz = mlir::linalg::createOrFoldDimOp(builder, loc, tensor,
|
|
toOrigDim(enc, l));
|
|
// Find upper bound in current dimension.
|
|
highs[t][l] = lvlSizes[t][l] = lvlSz;
|
|
if (isSparseSlices[t]) {
|
|
sliceOffsets[t][l] = genSliceOffset(builder, loc, tensors[t], l);
|
|
sliceStrides[t][l] = genSliceStride(builder, loc, tensors[t], l);
|
|
}
|
|
}
|
|
|
|
// Perform the required bufferization. Dense inputs materialize
|
|
// from the input tensors. Sparse inputs use sparse primitives to obtain the
|
|
// values.
|
|
// Delegates extra output initialization to clients.
|
|
bool isOutput = isOutputTensor(t);
|
|
Type elementType = rtp.getElementType();
|
|
if (!enc) {
|
|
// Non-annotated dense tensors.
|
|
BaseMemRefType denseTp = MemRefType::get(shape, elementType);
|
|
|
|
// TODO: if we unconditionally use a fully dynamic layout here, it breaks
// some vectorization passes which require static stride == 1.
// Is it possible to call the vectorization pass after bufferization?
|
|
if (llvm::isa_and_nonnull<tensor::ExtractSliceOp>(tensor.getDefiningOp()))
|
|
denseTp = bufferization::getMemRefTypeWithFullyDynamicLayout(rtp);
|
|
|
|
Value denseVal =
|
|
builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
|
|
// Dense outputs need special handling.
|
|
if (isOutput && updater)
|
|
denseVal = updater(builder, loc, denseVal, tensor);
|
|
|
|
valBuffer[t] = denseVal;
|
|
} else {
|
|
// Annotated sparse tensors.
|
|
// We also need the value buffer for all-dense annotated "sparse" tensors.
|
|
valBuffer[t] = genToValues(builder, loc, tensor);
|
|
}
|
|
// NOTE: we can also prepare for lvl 0 here in advance; this would hoist
// some loop preparation out of tensor iteration, but would also (undesirably)
// hoist the code outside of if-conditions.
|
|
}
|
|
|
|
Type indexType = builder.getIndexType();
|
|
Value c0 = constantZero(builder, loc, indexType);
|
|
for (TensorId t = 0, e = tensors.size(); t < e; t++) {
|
|
auto rtp = dyn_cast<RankedTensorType>(tensors[t].getType());
|
|
if (!rtp)
|
|
continue;
|
|
|
|
Level lvlRank = SparseTensorType(rtp).getLvlRank();
|
|
for (Level lvl = 0; lvl < lvlRank; lvl++) {
|
|
if (!dependentLvlMap[t][lvl].empty()) {
|
|
ArrayRef<std::pair<TensorId, Level>> depLvls = dependentLvlMap[t][lvl];
|
|
// Needs at least two operands to form a non-trivial affine expression.
|
|
assert(depLvls.size() > 1);
|
|
|
|
Value size = c0;
|
|
for (unsigned e = depLvls.size() - 1; e >= 1; e--) {
|
|
auto [dt, dd] = depLvls[e];
|
|
size = ADDI(size, lvlSizes[dt][dd]);
|
|
sliceSizes[t][lvl][e - 1] = size;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
localInsertPos = builder.getInsertionPoint()->getPrevNode();
|
|
}
|
|
|
|
void LoopEmitter::enterNewLoopSeq(OpBuilder &builder, Location loc,
|
|
ArrayRef<TensorLevel> tidLvls) {
|
|
// TODO: sort
|
|
assert(loopSeqStack.size() == loopStack.size());
|
|
// Prepares for all the tensors used in the current loop sequence.
|
|
std::vector<std::tuple<TensorId, Level, bool>> slicedTids;
|
|
for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
|
|
if (!dependentLvlMap[tid][lvl].empty()) {
|
|
bool fullyRed = genSliceBegin(builder, loc, tid, lvl);
|
|
slicedTids.emplace_back(tid, lvl, fullyRed);
|
|
} else {
|
|
prepareLoopOverTensorAtLvl(builder, loc, tid, lvl);
|
|
}
|
|
}
|
|
|
|
// Universal Index starts from 0.
|
|
loopSeqStack.emplace_back(C_IDX(0), std::move(slicedTids));
|
|
}
|
|
|
|
void LoopEmitter::exitCurrentLoopSeq(OpBuilder &builder, Location loc) {
|
|
assert(loopSeqStack.size() == loopStack.size() + 1);
|
|
|
|
const auto &slicedTids = loopSeqStack.back().second;
|
|
|
|
// Depending on whether the slice is resolved or not at the current loop
// sequence, end it in different ways.
|
|
for (auto [tid, lvl, res] : slicedTids) {
|
|
if (!res) {
|
|
// If this is an unresolved-slice-driven loop, pop the slice.
|
|
assert(sliceStack[tid].back().slicedOnLvl == lvl);
|
|
sliceStack[tid].pop_back();
|
|
} else {
|
|
if (!isDenseDLT(lvlTypes[tid][lvl])) {
|
|
// Else this is a resolved-slice, and advance posit similar to TACO.
|
|
Value c1 = C_IDX(1), c2 = C_IDX(2);
|
|
// pIdx += 2, we finished the current lvl, advance the pointer index of
|
|
// the previous level by two to skip the [pLo, pHi] for current level.
|
|
Value sPtrBuf = slicePosBuffer[tid][lvl].back();
|
|
Value curP = genIndexLoad(builder, loc, sPtrBuf, c1);
|
|
// TODO: we could probably use an SSA value for it.
|
|
Value nexP = ADDI(curP, c2);
|
|
builder.create<memref::StoreOp>(loc, nexP, sPtrBuf, c1);
|
|
}
|
|
}
|
|
}
|
|
loopSeqStack.pop_back();
|
|
}
|
|
|
|
Value LoopEmitter::genAffine(OpBuilder &builder, Location loc, AffineExpr a) {
|
|
switch (a.getKind()) {
|
|
case AffineExprKind::DimId: {
|
|
// FIXME: since the one callsite in Sparsification passes in a
|
|
// level-expression, the `getPosition` must in fact be a `Dimension`.
|
|
// However, elsewhere we have been led to expect that `loopIdToOrd`
|
|
// should be indexed by `LoopId`...
|
|
const auto loopId = a.cast<AffineDimExpr>().getPosition();
|
|
assert(loopId < loopIdToOrd.size());
|
|
return loopStack[loopIdToOrd[loopId]].iv;
|
|
}
|
|
case AffineExprKind::Add: {
|
|
auto binOp = a.cast<AffineBinaryOpExpr>();
|
|
return ADDI(genAffine(builder, loc, binOp.getLHS()),
|
|
genAffine(builder, loc, binOp.getRHS()));
|
|
}
|
|
case AffineExprKind::Mul: {
|
|
auto binOp = a.cast<AffineBinaryOpExpr>();
|
|
return MULI(genAffine(builder, loc, binOp.getLHS()),
|
|
genAffine(builder, loc, binOp.getRHS()));
|
|
}
|
|
case AffineExprKind::Constant: {
|
|
int64_t c = a.cast<AffineConstantExpr>().getValue();
|
|
return C_IDX(c);
|
|
}
|
|
default:
|
|
llvm_unreachable("unexpected affine subscript");
|
|
}
|
|
}
|
|
|
|
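// For illustration (hypothetical values): for the affine expression
// `d0 * 2 + d1`, genAffine recursively emits
//   %m = arith.muli %iv_d0, %c2 : index
//   %r = arith.addi %m, %iv_d1 : index
// where %iv_d0 and %iv_d1 are the induction variables of the loops that
// `loopIdToOrd` maps d0 and d1 to.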
Operation *LoopEmitter::emitForLoopOverTensorAtLvl(
|
|
OpBuilder &builder, Location loc, TensorId tid, Level dstLvl, Value lo,
|
|
Value hi, MutableArrayRef<Value> reduc, bool isParallel) {
|
|
bool isSparseCond = isCompressedDLT(lvlTypes[tid][dstLvl]) ||
|
|
isCompressedWithHiDLT(lvlTypes[tid][dstLvl]) ||
|
|
isSingletonDLT(lvlTypes[tid][dstLvl]);
|
|
|
|
const auto reassoc = getCollapseReassociation(tid, dstLvl);
|
|
// TODO: support dynamic slices.
|
|
// Uses the first dimension here to build the loop bound (which is also the
|
|
// biggest range).
|
|
const Level srcLvl = reassoc.front();
|
|
Value step = C_IDX(1);
|
|
|
|
Operation *loop = nullptr;
|
|
Value iv;
|
|
if (isParallel) {
|
|
assert(collapseReassoc[tid] == nullptr);
|
|
scf::ParallelOp parOp =
|
|
builder.create<scf::ParallelOp>(loc, lo, hi, step, reduc);
|
|
builder.setInsertionPointToStart(parOp.getBody());
|
|
assert(parOp.getNumReductions() == reduc.size());
|
|
iv = parOp.getInductionVars()[0];
|
|
|
|
// In-place update on the reduction variable vector.
|
|
// Note that the init vals are not the actual reduction variables but are
// instead used as a "special handle" to (temporarily) represent them. The
// expression on the init vals will be moved into scf.reduce and replaced with
// the block arguments when exiting the loop (see exitForLoop). This is
// needed as we cannot build the actual reduction block and get the actual
// reduction variable before users fill the parallel loop body.
|
|
for (int i = 0, e = reduc.size(); i < e; i++)
|
|
reduc[i] = parOp.getInitVals()[i];
|
|
loop = parOp;
|
|
} else {
|
|
scf::ForOp forOp = builder.create<scf::ForOp>(loc, lo, hi, step, reduc);
|
|
builder.setInsertionPointToStart(forOp.getBody());
|
|
iv = forOp.getInductionVar();
|
|
|
|
// In-place update on the reduction variable vector.
|
|
assert(forOp.getNumRegionIterArgs() == reduc.size());
|
|
for (int i = 0, e = reduc.size(); i < e; i++)
|
|
reduc[i] = forOp.getRegionIterArg(i);
|
|
loop = forOp;
|
|
}
|
|
assert(loop && iv);
|
|
|
|
Value crd;
|
|
if (isSparseCond) {
|
|
assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
|
|
// For COO, the position is the same across consecutive levels.
|
|
/// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
|
|
llvm::for_each(reassoc,
|
|
[this, tid, iv](Level srcLvl) { posits[tid][srcLvl] = iv; });
|
|
crd = genSparseCrd(builder, loc, tid, dstLvl);
|
|
} else {
|
|
// Dense tensor, the coordinate is the induction variable.
|
|
crd = iv;
|
|
}
|
|
|
|
if (isSparseSlices[tid] && isSparseCond) {
|
|
// For sparse level slices, we need to filter out invalid coordinates that
|
|
// are not included in the slice.
|
|
SmallVector<Type> types;
|
|
for (Value red : reduc)
|
|
types.push_back(red.getType());
|
|
|
|
auto [trans, pred] = genSliceLegitPredicate(builder, loc, crd, tid, srcLvl);
|
|
bool hasReduc = !types.empty();
|
|
scf::IfOp ifOp = builder.create<scf::IfOp>(loc, types, pred,
|
|
/*else*/ hasReduc);
|
|
if (hasReduc) {
|
|
// scf.for (a) -> v
|
|
// %s = scf.if (a) -> v
|
|
// user-generated code.
|
|
// else
|
|
// yield a
|
|
// yield %s
|
|
YIELD(ifOp.getResults());
|
|
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
|
|
// On mismatch.
|
|
YIELD(reduc);
|
|
}
|
|
// Set the insertion point to matched branch.
|
|
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
|
|
crd = trans;
|
|
}
|
|
|
|
assert(crd);
|
|
coords[tid][dstLvl] = crd;
|
|
return loop;
|
|
}
|
|
|
|
Operation *LoopEmitter::emitWhileLoopOverSliceAtSparseLvl(
|
|
OpBuilder &builder, Location loc, Value pLo, Value pHi, Value offset,
|
|
Value sliceSize, TensorId tid, Level lvl, MutableArrayRef<Value> reduc) {
|
|
// TODO: we should generalize the method to support iteration over
// normal slices as well, to allow early break.
|
|
Operation *insertPoint = nullptr;
|
|
Operation *loop =
|
|
genSliceLvlTraverseLoop(
|
|
builder, loc, pLo, pHi, offset, sliceSize, tid, lvl, reduc,
|
|
/*genYield=*/false, // unaware of the yield values from user yet
|
|
[this, tid, lvl, reduc, offset,
|
|
&insertPoint](OpBuilder &builder, Location loc, Value iv,
|
|
MutableArrayRef<Value> innerReduc) {
|
|
assert(innerReduc.size() == reduc.size());
|
|
// Updates users' reduction variables in place.
|
|
for (unsigned i = 0, e = reduc.size(); i < e; i++)
|
|
reduc[i] = innerReduc[i];
|
|
// Loads the coordinates.
|
|
Value absC =
|
|
genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], iv);
|
|
|
|
// We need to subtract the offset to get relative coordinates.
|
|
// TODO: how to assert relC >=0 during runtime?
|
|
insertPoint = builder.create<arith::SubIOp>(loc, absC, offset);
|
|
posits[tid][lvl] = iv;
|
|
coords[tid][lvl] = insertPoint->getResult(0);
|
|
})
|
|
.first;
|
|
// Sets the insertion point inside the loop body.
|
|
builder.setInsertionPointAfter(insertPoint);
|
|
return loop;
|
|
}
|
|
|
|
Operation *LoopEmitter::enterLoopOverTensorAtLvl(OpBuilder &builder,
|
|
Location loc,
|
|
ArrayRef<TensorLevel> tidLvls,
|
|
MutableArrayRef<Value> reduc,
|
|
bool isParallel) {
|
|
// TODO: support multiple return on parallel for?
|
|
assert(!isParallel || reduc.size() <= 1);
|
|
bool isSparseCond = false, isSparseSliceCond = false;
|
|
auto [tid, lvl] = unpackTensorLevel(tidLvls.front());
|
|
|
|
// Finds out the tensor level that we should use to generate loops. Among all
// the tensor levels, there is at most one sparse tensor level.
|
|
for (auto [t, l] : unpackTensorLevelRange(tidLvls)) {
|
|
assert(lvlTypes[t].size() > l); // Must be a valid tid, dim pair
|
|
assert(!coords[t][l] || // We cannot re-enter the same level
|
|
!dependentLvlMap[t][l].empty()); // unless it is a slice-driver loop
|
|
auto lvlType = lvlTypes[t][l];
|
|
// Must be a recognizable DLT.
|
|
assert(isDenseDLT(lvlType) || isCompressedDLT(lvlType) ||
|
|
isCompressedWithHiDLT(lvlType) || isSingletonDLT(lvlType));
|
|
|
|
// This is a slice-driven loop on sparse level.
|
|
if (!dependentLvlMap[t][l].empty() && !isDenseDLT(lvlType)) {
|
|
assert(!isSparseSliceCond && !isSparseCond);
|
|
isSparseSliceCond = true;
|
|
tid = t;
|
|
lvl = l;
|
|
continue;
|
|
}
|
|
|
|
bool isSparse = isCompressedDLT(lvlType) || isSingletonDLT(lvlType) ||
|
|
isCompressedWithHiDLT(lvlType);
|
|
// We can at most have one sparse input, otherwise, a while loop is
|
|
// required to co-iterate multiple sparse tensors.
|
|
assert(!isSparseCond || !isSparse);
|
|
assert(!isSparseSliceCond || !isSparseCond);
|
|
if (isSparse) {
|
|
tid = t;
|
|
lvl = l;
|
|
}
|
|
isSparseCond = isSparseCond || isSparse;
|
|
}
|
|
|
|
DimLevelType lvlType = lvlTypes[tid][lvl];
|
|
// TODO: Dense slice driven loop can be generated using for loop as well.
|
|
assert(!isSparseSliceCond || !isDenseDLT(lvlType));
|
|
bool isDenseSliceCond =
|
|
isDenseDLT(lvlType) && !dependentLvlMap[tid][lvl].empty();
|
|
// if the slice is fully reduced, we can now use TACO-based algorithm to
|
|
// iterate it.
|
|
|
|
Operation *l = nullptr;
|
|
|
|
// At most one tensor used as condition in for loop;
|
|
SmallVector<TensorLevel, 1> condTidLvl;
|
|
// There might be multiple dense slice-driven tensors.
|
|
SmallVector<SliceLoopInfo> sliceDrivenInfo;
|
|
|
|
// Generates loops differently depending on whether we need a slice-driven
|
|
// loop or a simple level traversal loop.
|
|
if (isSparseSliceCond) {
|
|
bool fullyReduced = depFullyReduced(tid, lvl);
|
|
if (!fullyReduced) {
|
|
l = emitSliceDrivenLoopOverTensorAtLvl(builder, loc, tid, lvl, reduc);
|
|
} else {
|
|
// If the slice is fully reduced, we can now use TACO-based algorithm to
|
|
// iterate it.
|
|
l = emitWhileLoopOverSliceAtSparseLvl(
|
|
builder, loc, posits[tid][lvl], highs[tid][lvl],
|
|
getFinalSliceOnLvl(tid, lvl).offset, sliceSizes[tid][lvl].back(), tid,
|
|
lvl, reduc);
|
|
}
|
|
levelReducedDep[tid][lvl]++;
|
|
sliceDrivenInfo.emplace_back(tid, lvl, fullyReduced);
|
|
} else {
|
|
Value lo = isSparseCond ? posits[tid][lvl] // current offset
|
|
: loopSeqStack.back().first; // universal index
|
|
Value hi = highs[tid][lvl];
|
|
if (isDenseSliceCond) {
|
|
bool fullyReduced = depFullyReduced(tid, lvl);
|
|
Value sliceSz = sliceSizes[tid][lvl][sliceStack[tid].back().depth - 1];
|
|
// Adjust for loop hi for dense slice-driven loop.
|
|
if (fullyReduced) {
|
|
hi = sliceSz;
|
|
condTidLvl.push_back(makeTensorLevel(tid, lvl));
|
|
} else {
|
|
hi = SUBI(lvlSizes[tid][lvl], sliceSz);
|
|
hi = ADDI(hi, C_IDX(1));
|
|
}
|
|
} else {
|
|
condTidLvl.push_back(makeTensorLevel(tid, lvl));
|
|
}
|
|
l = emitForLoopOverTensorAtLvl(builder, loc, tid, lvl, lo, hi, reduc,
|
|
isParallel);
|
|
}
|
|
Value iv = coords[tid][lvl];
|
|
for (auto [t, l] : unpackTensorLevelRange(tidLvls)) {
|
|
// We only need to handle slice-driven loops on dense level here.
|
|
// If it is a slice-driven loop on sparse level, it needs a while loop to
|
|
// insert break statements, and it must have been handled correctly in L692.
|
|
if (!dependentLvlMap[t][l].empty() && isDenseDLT(lvlTypes[t][l])) {
|
|
// Pushes sliced levels to build correct LoopInfo.
|
|
bool fullyReduc = depFullyReduced(t, l);
|
|
SliceInfo &info = sliceStack[t].back();
|
|
if (fullyReduc) {
|
|
posits[t][l] = genAddress(builder, loc, t, l, ADDI(info.offset, iv));
|
|
} else {
|
|
// Puts sliced dense loop into LoopInfo so that LoopEmitter knows how to
|
|
// exit it.
|
|
sliceDrivenInfo.emplace_back(t, l, fullyReduc);
|
|
// Update the slice information as we enter the new loop.
|
|
assert(*info.slicedOnLvl == l);
|
|
info.minCrd = info.offset = iv;
|
|
info.isNonEmpty = constantI1(builder, loc, true);
|
|
levelReducedDep[t][l]++;
|
|
}
|
|
}
|
|
}
|
|
// NOTE: we can also prepare for next dim here in advance
|
|
// Pushes the loop into stack.
|
|
loopStack.emplace_back(condTidLvl, sliceDrivenInfo, l,
|
|
builder.getInsertionBlock(), iv, loopTag);
|
|
// Emit extra locals.
|
|
emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tidLvls);
|
|
return l;
|
|
}
|
|
|
|
Operation *LoopEmitter::enterFilterLoopOverTensorAtLvl(
|
|
OpBuilder &builder, Location loc, TensorId tid, Level lvl,
|
|
AffineExpr affine, MutableArrayRef<Value> reduc) {
|
|
assert(isValidLevel(tid, lvl));
|
|
assert(!affine.isa<AffineDimExpr>() && !isDenseDLT(lvlTypes[tid][lvl]));
|
|
// We can not re-enter the same level.
|
|
assert(!coords[tid][lvl]);
|
|
|
|
// TODO: We should instead use a whileOp for filter loop to allow early
|
|
// break when exceeding (for ordered levels).
|
|
// TODO: There are many other potential opportunities that we might apply in
// the future. E.g., we could use binary search to locate positions.
|
|
const Value step = C_IDX(1);
|
|
const Value pLo = posits[tid][lvl];
|
|
const Value pHi = highs[tid][lvl];
|
|
scf::ForOp forOp = builder.create<scf::ForOp>(loc, pLo, pHi, step, reduc);
|
|
|
|
// In-place update on the reduction variable vector.
|
|
assert(forOp.getNumRegionIterArgs() == reduc.size());
|
|
for (int i = 0, e = reduc.size(); i < e; i++)
|
|
reduc[i] = forOp.getRegionIterArg(i);
|
|
|
|
builder.setInsertionPointToStart(forOp.getBody());
|
|
// The induction variable gives the position.
|
|
const Value pos = forOp.getInductionVar();
|
|
posits[tid][lvl] = pos;
|
|
// Generating a load on the coordinates array yields the crd.
|
|
const Value mem = coordinatesBuffers[tid][lvl];
|
|
const Value crd = genIndexLoad(builder, loc, mem, pos);
|
|
coords[tid][lvl] = crd;
|
|
|
|
// Generate an if-condition to filter out coordinates that are not
|
|
// equal to the result of the affine expression.
|
|
Value expected = genAffine(builder, loc, affine);
|
|
auto pred = CMPI(eq, crd, expected);
|
|
SmallVector<Type> types;
|
|
for (Value red : reduc) {
|
|
types.push_back(red.getType());
|
|
}
|
|
|
|
bool hasReduc = !types.empty();
|
|
scf::IfOp ifOp =
|
|
builder.create<scf::IfOp>(loc, types, pred, /*else*/ hasReduc);
|
|
if (hasReduc) {
|
|
// scf.for (a) -> v
|
|
// %s = scf.if (a) -> v
|
|
// user-generated code.
|
|
// else
|
|
// yield a
|
|
// yield %s
|
|
YIELD(ifOp.getResults());
|
|
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
|
|
// On mismatch.
|
|
YIELD(reduc);
|
|
}
|
|
// Set the insert point to matched branch.
|
|
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
|
|
|
|
// NOTE: we can also prepare for next lvl here in advance
|
|
// Push the loop into stack
|
|
loopStack.emplace_back(ArrayRef<TensorLevel>(makeTensorLevel(tid, lvl)),
|
|
ArrayRef<SliceLoopInfo>(), forOp,
|
|
builder.getInsertionBlock(), coords[tid][lvl],
|
|
nullptr);
|
|
return forOp;
|
|
}
|
|
|
|
void LoopEmitter::genDenseAffineAddress(OpBuilder &builder, Location loc,
|
|
TensorLevel tidLvl,
|
|
AffineExpr lvlExpr) {
|
|
auto [tid, lvl] = unpackTensorLevel(tidLvl);
|
|
assert(isDenseDLT(lvlTypes[tid][lvl]));
|
|
// For dense levels, the level-coordinate also serves as the position.
|
|
Value lvlCrd = genAffine(builder, loc, lvlExpr);
|
|
posits[tid][lvl] = genAddress(builder, loc, tid, lvl, lvlCrd);
|
|
}
|
|
|
|
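// For illustration (hypothetical values): for lvlExpr == `d0 + 2` at a dense
// level, the level coordinate evaluates to iv_d0 + 2, and the position is
// then obtained by linearizing it against the parent position via genAddress,
// exactly as for a plain dense coordinate.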
Operation *LoopEmitter::enterCoIterationOverTensorsAtLvls(
|
|
OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls,
|
|
bool needsUniv, MutableArrayRef<Value> reduc) {
|
|
// NOTE: the slice driven tensor-related reduction variable must
|
|
// appear before normal tensors.
|
|
SmallVector<Type> types;
|
|
SmallVector<Value> operands;
|
|
// Construct the while-loop with a parameter for each coordinate.
|
|
const Type indexType = builder.getIndexType();
|
|
for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
|
|
// TODO: support coiteration with slice driven tensors.
|
|
const auto lvlTp = lvlTypes[tid][lvl];
|
|
assert(dependentLvlMap[tid][lvl].empty() && "TODO: not yet implemented");
|
|
if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
|
|
isCompressedWithHiDLT(lvlTp)) {
|
|
const auto reassoc = getCollapseReassociation(tid, lvl);
|
|
for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
|
|
if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) {
|
|
// This is the segment high for each non-unique level.
|
|
types.push_back(indexType);
|
|
operands.push_back(C_IDX(0));
|
|
}
|
|
}
|
|
const auto pos = posits[tid][reassoc.front()];
|
|
assert(pos);
|
|
types.push_back(indexType);
|
|
operands.push_back(pos);
|
|
}
|
|
}
|
|
// The position where user-supplied reduction variable starts.
|
|
for (Value rec : reduc) {
|
|
types.push_back(rec.getType());
|
|
operands.push_back(rec);
|
|
}
|
|
if (needsUniv) {
|
|
types.push_back(indexType);
|
|
// Update universal index.
|
|
operands.push_back(loopSeqStack.back().first);
|
|
}
|
|
assert(types.size() == operands.size());
|
|
scf::WhileOp whileOp = builder.create<scf::WhileOp>(loc, types, operands);
|
|
|
|
SmallVector<Location> locs(types.size(), loc);
|
|
Block *before = builder.createBlock(&whileOp.getBefore(), {}, types, locs);
|
|
Block *after = builder.createBlock(&whileOp.getAfter(), {}, types, locs);
|
|
|
|
// Build the "before" region, which effectively consists
|
|
// of a conjunction of "i < upper" tests on all induction.
|
|
builder.setInsertionPointToStart(&whileOp.getBefore().front());
|
|
Value cond;
|
|
unsigned o = 0;
|
|
for (auto [t, lvl] : unpackTensorLevelRange(tidLvls)) {
|
|
const TensorId tid = t; // Why `t` can not be captured by lambda?
|
|
const auto lvlTp = lvlTypes[tid][lvl];
|
|
if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
|
|
isCompressedWithHiDLT(lvlTp)) {
|
|
const auto reassoc = getCollapseReassociation(tid, lvl);
|
|
assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
|
|
for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
|
|
if (!isUniqueDLT(lvlTypes[tid][reassoc[i]])) {
|
|
// Links the SSA chain for segHi.
|
|
segHi[tid][reassoc[i]] = after->getArgument(o++);
|
|
}
|
|
}
|
|
Value op1 = before->getArgument(o);
|
|
// We use the first level bound as the bound for the collapsed set of levels.
|
|
Value op2 = highs[tid][reassoc.front()];
|
|
Value opc = CMPI(ult, op1, op2);
|
|
cond = cond ? ANDI(cond, opc) : opc;
|
|
// Update positions
|
|
Value pos = after->getArgument(o++);
|
|
// For COO, the position is the same across consecutive levels.
|
|
/// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
|
|
llvm::for_each(reassoc, [this, tid, pos](Level srcLvl) {
|
|
posits[tid][srcLvl] = pos;
|
|
});
|
|
}
|
|
}
|
|
builder.create<scf::ConditionOp>(loc, cond, before->getArguments());
|
|
|
|
// Generates while body.
|
|
builder.setInsertionPointToStart(&whileOp.getAfter().front());
|
|
|
|
SmallVector<std::pair<Value, unsigned>> slicesPreds;
|
|
unsigned i = 0;
|
|
for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
|
|
// Prepares for next level.
|
|
const auto lvlTp = lvlTypes[tid][lvl];
|
|
if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
|
|
isCompressedWithHiDLT(lvlTp)) {
|
|
coords[tid][lvl] = genSparseCrd(builder, loc, tid, lvl);
|
|
if (isSparseSlices[tid]) {
|
|
auto [trans, pred] =
|
|
genSliceLegitPredicate(builder, loc, coords[tid][lvl], tid, lvl);
|
|
slicesPreds.emplace_back(pred, i);
|
|
// Update to the coordinate relative to the slice.
|
|
coords[tid][lvl] = trans;
|
|
}
|
|
i++;
|
|
}
|
|
}
|
|
|
|
if (!slicesPreds.empty()) {
|
|
// Skips invalid loop iteration when slice coordinate is inapplicable.
|
|
SmallVector<Value> yields(after->getArguments());
|
|
// Generates a list of if statements
//   pos = in_slice ? pos : pos + 1
// TODO: instead of always picking pos + 1, we should set pos = high to
// break the loop if the coordinates are larger than the slice size.
|
|
//
|
|
// This "idx" is the index into `llvm::zip(tids, lvls)`
|
|
for (auto [pred, idx] : slicesPreds) {
|
|
Value nextPos = ADDI(yields[idx], C_IDX(1));
|
|
yields[idx] = SELECT(pred, yields[idx], nextPos);
|
|
}
|
|
|
|
Value pred = slicesPreds.front().first;
|
|
for (int i = 1, e = slicesPreds.size(); i < e; i++) {
|
|
pred = ANDI(pred, slicesPreds[i].first);
|
|
}
|
|
auto ifOp = builder.create<scf::IfOp>(loc, types, pred, /*else*/ true);
|
|
ifOp->setAttr(getLoopEmitterLoopAttrName(),
|
|
StringAttr::get(builder.getContext(), "slice"));
|
|
YIELD(ifOp->getResults());
|
|
assert(types.size() == yields.size());
|
|
// If not all slices are legit
|
|
builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
|
|
YIELD(yields);
|
|
|
|
// If all slices are legit, start the user generated code.
|
|
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
|
|
}
|
|
|
|
Value min;
|
|
// Finds the minimum coordinate
|
|
if (!needsUniv) {
|
|
for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
|
|
const auto lvlTp = lvlTypes[tid][lvl];
|
|
if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
|
|
isCompressedWithHiDLT(lvlTp)) {
|
|
const auto crd = coords[tid][lvl];
|
|
if (min) {
|
|
Value cmp = CMPI(ult, coords[tid][lvl], min);
|
|
min = SELECT(cmp, coords[tid][lvl], min);
|
|
} else {
|
|
min = crd;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
assert(!min);
|
|
// Otherwise, universal index is the minimal pos.
|
|
min = after->getArguments().back();
|
|
}
|
|
|
|
// Sets up the loop stack.
|
|
loopStack.emplace_back(tidLvls, ArrayRef<SliceLoopInfo>(), whileOp,
|
|
builder.getInsertionBlock(), min, loopTag);
|
|
assert(loopStack.size() == loopSeqStack.size());
|
|
|
|
for (auto [tid, dstLvl] : unpackTensorLevelRange(tidLvls)) {
|
|
const auto reassoc = getCollapseReassociation(tid, dstLvl);
|
|
assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
|
|
// TODO: Refactors this into smaller functions.
|
|
// NOTE: For all the collapsed levels (except for the last one, which is why
// the loop ends with `reassoc.size() - 1`), each iteration is advanced
// by the segment size of the last level, which does not always invalidate
// the segment sizes for the previous levels; thus we need to propagate the
// segment sizes across loop iterations and only forward them when needed.
|
|
//
|
|
// E.g., for a COO tensor with the following coordinates array.
|
|
// (0, 0, 1),
|
|
// (0, 0, 2),
|
|
// (1, 1, 1),
|
|
// segHi[lvl=0] = segHi[lvl=1] = 2
|
|
// segHi[lvl=2] = 1,
|
|
// the first iteration does not invalidate segHi[0] and segHi[1]
|
|
for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
|
|
const Level srcLvl = reassoc[i];
|
|
if (!isUniqueDLT(lvlTypes[tid][srcLvl])) {
|
|
const Value pos = posits[tid][srcLvl];
|
|
const auto oldSegHi = segHi[tid][srcLvl];
|
|
assert(oldSegHi);
|
|
Value newSegHi = builder.create<arith::CmpIOp>(
|
|
loc, arith::CmpIPredicate::uge, pos, oldSegHi);
|
|
auto ifNewSegHi = builder.create<scf::IfOp>(loc, builder.getIndexType(),
|
|
newSegHi, true);
|
|
{
|
|
OpBuilder::InsertionGuard guard(builder);
|
|
builder.setInsertionPointToStart(ifNewSegHi.thenBlock());
|
|
YIELD(genSegmentHigh(builder, loc, tid, srcLvl, pos,
|
|
highs[tid][srcLvl]));
|
|
// Else, reuses the same segment high.
|
|
builder.setInsertionPointToStart(ifNewSegHi.elseBlock());
|
|
YIELD(oldSegHi);
|
|
}
|
|
highs[tid][srcLvl + 1] = segHi[tid][srcLvl] = ifNewSegHi.getResult(0);
|
|
}
|
|
};
|
|
const auto srcLvl = reassoc.back();
|
|
if (!isUniqueDLT(lvlTypes[tid][srcLvl])) {
|
|
segHi[tid][srcLvl] = genSegmentHigh(
|
|
builder, loc, tid, srcLvl, posits[tid][srcLvl], highs[tid][srcLvl]);
|
|
}
|
|
}
|
|
|
|
// Emits extra locals
|
|
emitExtraLocalsForTensorsAtDenseLvls(builder, loc, tidLvls);
|
|
|
|
// Updates reduction variables
|
|
assert(after->getNumArguments() == o + reduc.size() + (needsUniv ? 1 : 0));
|
|
// In-place update on reduction variable.
|
|
for (unsigned i = 0, e = reduc.size(); i < e; i++)
|
|
reduc[i] = after->getArgument(o + i);
|
|
|
|
return whileOp;
|
|
}
|
|
|
|
void LoopEmitter::prepareLoopOverTensorAtLvl(OpBuilder &builder, Location loc,
|
|
TensorId tid, Level dstLvl) {
|
|
assert(isValidLevel(tid, dstLvl));
|
|
const auto lvlTp = lvlTypes[tid][dstLvl];
|
|
|
|
if (isDenseDLT(lvlTp))
|
|
return;
|
|
|
|
const Value c0 = C_IDX(0);
|
|
const Value c1 = C_IDX(1);
|
|
for (const Level srcLvl : getCollapseReassociation(tid, dstLvl)) {
|
|
// Either the first level, or the previous level has been set.
|
|
/// FIXME: See the [CLARIFY_POSITS_LVL] note in the header.
|
|
assert(srcLvl == 0 || posits[tid][srcLvl - 1]);
|
|
if (isDenseDLT(lvlTp))
|
|
continue;
|
|
if (isCompressedDLT(lvlTp) || isCompressedWithHiDLT(lvlTp)) {
|
|
const Value mem = positionsBuffers[tid][srcLvl];
|
|
|
|
Value pLo = srcLvl == 0 ? c0 : posits[tid][srcLvl - 1];
|
|
if (isCompressedWithHiDLT(lvlTp))
|
|
pLo = builder.create<arith::MulIOp>(loc, pLo, C_IDX(2));
|
|
posits[tid][srcLvl] = genIndexLoad(builder, loc, mem, pLo);
|
|
|
|
const Value pHi = ADDI(pLo, c1);
|
|
highs[tid][srcLvl] = genIndexLoad(builder, loc, mem, pHi);
|
|
return;
|
|
}
|
|
if (isSingletonDLT(lvlTp)) {
|
|
const Value pLo = srcLvl == 0 ? c0 : posits[tid][srcLvl - 1];
|
|
posits[tid][srcLvl] = pLo;
|
|
|
|
// If we are coiterating non-unique levels, then use pHi=segHi;
|
|
// otherwise use pHi=pLo+1.
|
|
// NOTE: Just because the level is non-unique, that does not
|
|
// guarantee that segHi is defined: because we only generate segHi
|
|
// whenever coiterating, in order to improve code quality for the
|
|
// non-coiterating cases.
|
|
const auto parentSegHi = segHi[tid][srcLvl - 1];
|
|
highs[tid][srcLvl] =
|
|
(!isUniqueDLT(lvlTypes[tid][srcLvl - 1]) && parentSegHi)
|
|
? parentSegHi
|
|
: ADDI(pLo, c1);
|
|
return;
|
|
}
|
|
}
|
|
|
|
llvm_unreachable("Unrecognized level-type!");
|
|
}
|
|
|
|
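// For illustration (hypothetical values): for a compressed level whose
// positions buffer is [0, 2, 5, 7] and whose parent position is 1, the code
// above loads pLo = positions[1] == 2 and pHi = positions[2] == 5, so the
// level will be iterated over positions [2, 5).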
void LoopEmitter::emitExtraLocalsForTensorsAtDenseLvls(
|
|
OpBuilder &builder, Location loc, ArrayRef<TensorLevel> tidLvls) {
|
|
// Initialize dense positions. Note that we generate dense coordinates of the
|
|
// output tensor unconditionally, since they may not appear in the lattice,
|
|
// but may be needed for linearized codegen.
|
|
for (auto [tid, lvl] : unpackTensorLevelRange(tidLvls)) {
|
|
if (isDenseDLT(lvlTypes[tid][lvl])) {
|
|
// Slice-driven dense levels should have been handled already.
|
|
if (!dependentLvlMap[tid][lvl].empty())
|
|
continue;
|
|
|
|
auto enc = getSparseTensorEncoding(tensors[tid].getType());
|
|
if (enc && !isSparseOutput(tid)) {
|
|
bool validPos = lvl == 0 || posits[tid][lvl - 1];
|
|
if (!validPos) {
|
|
// We might not find the pos for the sparse output tensor as it is
|
|
// unconditionally required by the sparsification.
|
|
assert(isOutputTensor(tid));
|
|
continue;
|
|
}
|
|
posits[tid][lvl] =
|
|
genAddress(builder, loc, tid, lvl, loopStack.back().iv);
|
|
// NOTE: we can also prepare for next lvl here in advance
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
|
|
MutableArrayRef<Value> reduc) {
|
|
const LoopInfo &loopInfo = loopStack.back();
|
|
rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
|
|
for (auto [tid, lvl, reduced] : loopInfo.sliceDrivenInfo) {
|
|
SliceInfo &info = sliceStack[tid].back();
|
|
assert(isDenseDLT(lvlTypes[tid][lvl]));
|
|
assert(*info.slicedOnLvl == lvl && !reduced);
|
|
(void)reduced;
|
|
// Resets slice pointers as the resolved slices are invalidated after we
// move forward to the next slice.
|
|
invalidateSliceIterIdx(rewriter, loc, tid, lvl);
|
|
info.minCrd = info.offset = info.isNonEmpty = Value();
|
|
levelReducedDep[tid][lvl]--;
|
|
}
|
|
if (auto forOp = llvm::dyn_cast<scf::ForOp>(loopInfo.loop)) {
|
|
if (!reduc.empty()) {
|
|
assert(reduc.size() == forOp.getNumResults());
|
|
rewriter.create<scf::YieldOp>(loc, reduc);
|
|
}
|
|
// Exit the loop.
|
|
rewriter.setInsertionPointAfter(forOp);
|
|
// In-place update reduction variables.
|
|
for (unsigned i = 0, e = forOp.getResults().size(); i < e; i++)
|
|
reduc[i] = forOp.getResult(i);
|
|
} else {
|
|
auto parOp = llvm::cast<scf::ParallelOp>(loopInfo.loop);
|
|
if (!reduc.empty()) {
|
|
assert(reduc.size() == parOp.getInitVals().size() && reduc.size() == 1);
|
|
Operation *redExp = reduc.front().getDefiningOp();
|
|
// Reduction expression should have no use.
|
|
assert(redExp->getUses().empty());
|
|
// This must be a binary operation.
|
|
// NOTE: It is the users' responsibility to ensure the operation is
// commutative.
|
|
assert(redExp->getNumOperands() == 2 && redExp->getNumResults() == 1);
|
|
|
|
Value redVal = parOp.getInitVals().front();
|
|
Value curVal;
|
|
if (redExp->getOperand(0) == redVal)
|
|
curVal = redExp->getOperand(1);
|
|
else if (redExp->getOperand(1) == redVal)
|
|
curVal = redExp->getOperand(0);
|
|
// One of the operands must be the init value (which is also the
|
|
// previous reduction value).
|
|
assert(curVal);
|
|
#ifndef NDEBUG
|
|
// The reduction expression should be the only user of the reduction val
|
|
// inside the parallel for.
|
|
unsigned numUsers = 0;
|
|
for (Operation *op : redVal.getUsers()) {
|
|
if (op->getParentOp() == parOp)
|
|
numUsers++;
|
|
}
|
|
assert(numUsers == 1);
|
|
#endif // NDEBUG
|
|
|
|
rewriter.setInsertionPointAfter(redExp);
|
|
auto redOp = rewriter.create<scf::ReduceOp>(loc, curVal);
|
|
// Attach to the reduction op.
|
|
Block *redBlock = &redOp.getRegion().getBlocks().front();
|
|
rewriter.setInsertionPointToEnd(redBlock);
|
|
Operation *newRed = rewriter.clone(*redExp);
|
|
// Replaces arguments of the reduction expression by using the block
|
|
// arguments from scf.reduce.
|
|
rewriter.updateRootInPlace(
|
|
newRed, [&]() { newRed->setOperands(redBlock->getArguments()); });
|
|
// Erases the outdated reduction expression.
|
|
rewriter.eraseOp(redExp);
|
|
rewriter.setInsertionPointToEnd(redBlock);
|
|
rewriter.create<scf::ReduceReturnOp>(loc, newRed->getResult(0));
|
|
}
|
|
rewriter.setInsertionPointAfter(parOp);
|
|
// In-place update reduction variables.
|
|
for (unsigned i = 0, e = parOp.getResults().size(); i < e; i++)
|
|
reduc[i] = parOp.getResult(i);
|
|
}
|
|
|
|
// Finished iterating a tensor; clean up.
// We only do the cleanup on for-loops, as while-loops do not necessarily
// finish the iteration over a sparse tensor.
|
|
for (auto [tid, lvl] : unpackTensorLevelRange(loopInfo.tidLvls)) {
|
|
// Reset to null.
|
|
coords[tid][lvl] = Value();
|
|
posits[tid][lvl] = Value();
|
|
// Dense level, high is fixed.
|
|
if (!isDenseDLT(lvlTypes[tid][lvl]))
|
|
highs[tid][lvl] = Value();
|
|
}
|
|
}
|
|
|
|
void LoopEmitter::exitWhileLoop(OpBuilder &builder, Location loc,
|
|
MutableArrayRef<Value> reduc) {
|
|
const LoopInfo &loopInfo = loopStack.back();
|
|
auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop);
|
|
builder.setInsertionPointToEnd(loopInfo.userCodeBlock);
|
|
Value iv = loopInfo.iv;
|
|
|
|
// Finalize the induction. Note that the induction could be performed
|
|
// in the individual if-branches to avoid re-evaluating the conditions.
|
|
// However, that would result in a rather elaborate forest of yield
|
|
// instructions during code generation. Moreover, performing the induction
|
|
// after the if-statements more closely resembles code generated by TACO.
|
|
unsigned o = 0;
|
|
SmallVector<Value> operands;
|
|
unsigned delta = 0;
|
|
for (auto [tid, lvl, resolved] : loopInfo.sliceDrivenInfo) {
|
|
// TODO: handle dense.
|
|
assert(isCompressedDLT(lvlTypes[tid][lvl]));
|
|
levelReducedDep[tid][lvl]--;
|
|
if (!resolved) {
|
|
genSliceNextInduction(builder, loc, whileOp, tid, lvl, operands, o);
|
|
continue;
|
|
}
|
|
// TODO: We need to distinguish a coiterate loop from a slice-driven loop and
// a fully reduced while op for iterating one slice.
// FIXME: since we didn't implement coiteration, this must be iteration
// just on a fully resolved slice.
|
|
assert(loopInfo.sliceDrivenInfo.size() == 1 && loopInfo.tidLvls.empty());
|
|
// The if guard to filter out out-range coordinates.
|
|
assert(llvm::isa<scf::IfOp>(builder.getInsertionBlock()->getParentOp()));
|
|
posits[tid][lvl] = whileOp->getResult(o++);
|
|
// FIXME: we are not using continue here since we do not support
|
|
// coiteration on slices. But it need to be treated similarly as the
|
|
// universal index.
|
|
o++; // skip continue flag.
|
|
// Since we did not push two results from the whileOp, the size of the
// operands vector is smaller than the actual number of return values from
// the whileOp. This is because we are actually generating the yield in the
// IfOp inside the whileOp to only iterate over in-bound coordinates within
// the slices.
|
|
delta += 2;
|
|
};
|
|
|
|
Value one = C_IDX(1);
|
|
for (auto [tid, dstLvl] : unpackTensorLevelRange(loopInfo.tidLvls)) {
|
|
const auto lvlTp = lvlTypes[tid][dstLvl];
|
|
if (isCompressedDLT(lvlTp) || isSingletonDLT(lvlTp) ||
|
|
isCompressedWithHiDLT(lvlTp)) {
|
|
const auto reassoc = getCollapseReassociation(tid, dstLvl);
|
|
assert(reassoc.size() == 1 || isUniqueCOOType(tensors[tid].getType()));
|
|
for (unsigned i = 0, e = reassoc.size() - 1; i < e; i++) {
|
|
const Level srcLvl = reassoc[i];
|
|
if (!isUniqueDLT(lvlTypes[tid][srcLvl])) {
|
|
operands.push_back(segHi[tid][srcLvl]);
|
|
o++;
|
|
}
|
|
}
|
|
const Value crd = coords[tid][dstLvl];
|
|
const Value pos = posits[tid][dstLvl];
|
|
Value cmp = CMPI(eq, crd, iv);
|
|
// If the loop contains a coiteration with non-unique level, we fast
|
|
// forward all the duplicated coords by setting the position to the
|
|
// segment high.
|
|
Value add = !isUniqueDLT(lvlTypes[tid][reassoc.back()])
|
|
? segHi[tid][reassoc.back()]
|
|
: ADDI(pos, one);
|
|
|
|
operands.push_back(SELECT(cmp, add, pos));
|
|
// Following loops continue iteration from the break point of the
|
|
// current while loop.
|
|
const Value newPos = whileOp->getResult(o++);
|
|
// We need to define a new local variable for `tid` to avoid
|
|
// warnings about "captured structured bindings are a C++20 extension".
|
|
// FIXME(wrengr): define a helper function to capture this idiom!
|
|
const TensorId newTid = tid;
|
|
llvm::for_each(reassoc, [this, newTid, newPos](Level srcLvl) {
|
|
posits[newTid][srcLvl] = newPos;
|
|
});
|
|
// The coordinate is invalid now.
|
|
coords[tid][dstLvl] = nullptr;
|
|
// The segment high is invalid now.
|
|
segHi[tid][dstLvl] = nullptr;
|
|
// highs remains unchanged.
|
|
}
|
|
}
|
|
|
|
// Reduction value from users.
|
|
for (auto &i : reduc) {
|
|
operands.push_back(i);
|
|
// In place update reduction variable.
|
|
i = whileOp->getResult(o++);
|
|
}
|
|
|
|
// An (optional) universal index.
|
|
if (operands.size() + delta < whileOp.getNumResults()) {
|
|
assert(operands.size() + delta + 1 == whileOp.getNumResults());
|
|
// The last one is the universal index.
|
|
operands.push_back(ADDI(iv, one));
|
|
// update the loop starting point of current loop sequence
|
|
loopSeqStack.back().first = whileOp->getResult(o++);
|
|
}
|
|
|
|
assert(o == operands.size() + delta);
|
|
YIELD(operands);
|
|
builder.setInsertionPointAfter(whileOp);
|
|
}
|
|
|
|
void LoopEmitter::exitCurrentLoop(RewriterBase &rewriter, Location loc,
|
|
MutableArrayRef<Value> reduc) {
|
|
// Clean up the values; this helps us discover potential bugs at an
// earlier stage (instead of silently using a wrong value).
|
|
const LoopInfo &loopInfo = loopStack.back();
|
|
SmallVector<Value> red;
|
|
if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
|
|
exitWhileLoop(rewriter, loc, reduc);
|
|
} else {
|
|
exitForLoop(rewriter, loc, reduc);
|
|
}
|
|
|
|
assert(loopStack.size() == loopSeqStack.size());
|
|
loopStack.pop_back();
|
|
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Slice-driven loop related methods.
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
unsigned LoopEmitter::remDepOnLevel(TensorId tid, Level lvl) const {
  unsigned totalDependencies = dependentLvlMap[tid][lvl].size();
  if (totalDependencies != 0) {
    assert(totalDependencies >= 2);
    return totalDependencies - levelReducedDep[tid][lvl];
  }
  return totalDependencies;
}

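// The stack may hold multiple slices for the same level (pushed at different
// reduction depths), so the reverse search below returns the most recently
// pushed slice for `lvl`.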
const LoopEmitter::SliceInfo &LoopEmitter::getMostRecentSliceOnLvl(TensorId tid,
                                                                   Level lvl) {
  // Finds the most-recent slice using a reverse iteration.
  for (auto it = sliceStack[tid].rbegin(), ie = sliceStack[tid].rend(); it < ie;
       it++) {
    if (it->slicedOnLvl == lvl) { // the level matched
      return *it;
    }
  }
  llvm_unreachable("Failed to find sliceInfo");
}

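// The generated while loop below carries [pos, continue-flag, userReduc...];
// the first two values are maintained internally and stripped from the
// returned results (hence the drop_front(2) at the end).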
// Generates a while loop to iterate over a sliced sparse level as follows.
//
// while(loopLo < loopHi) {
//   if (coords[loopLo] < offset + size) {
//     body_builder
//   } else {
//     break;
//   }
//   loopLo ++;
// }
std::pair<Operation *, ValueRange> LoopEmitter::genSliceLvlTraverseLoop(
    OpBuilder &builder, Location loc, Value loopLo, Value loopHi, Value offset,
    Value size, TensorId tid, Level lvl, ValueRange userReduc, bool genYield,
    LoopBodyBuilder bodyBuilder) {
  Value c1 = C_IDX(1);
  Value sliceHi = ADDI(offset, sliceSizes[tid][lvl].back());

  SmallVector<Value> reduc = {
      loopLo,                         // loop lower bounds
      constantI1(builder, loc, true), // continue
  };
  // Append user-required reduction values.
  reduc.append(userReduc.begin(), userReduc.end());
  scf::WhileOp whileOp = builder.create<scf::WhileOp>(
      loc, ValueRange(reduc).getTypes(), reduc,
      /*beforeBuilder=*/
      [loopHi](OpBuilder &builder, Location loc, ValueRange args) {
        Value lo = args[0];
        Value cont = args[1];
        Value inBound = CMPI(ult, lo, loopHi);
        Value cond = ANDI(cont, inBound);
        // Continue if neither breaking nor out of bound.
        builder.create<scf::ConditionOp>(loc, cond, args);
      },
      /*afterBuilder=*/
      [this, c1, tid, lvl, sliceHi, genYield,
       bodyBuilder](OpBuilder &builder, Location loc, ValueRange args) {
        Value iv = args[0];
        Value coord =
            genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], iv);
        Value cont = CMPI(ult, coord, sliceHi);
        TypeRange types = args.drop_front(2).getTypes();

        auto ifOp = builder.create<scf::IfOp>(loc, types, cont, true);
        {
          // 2 reduction variables maintained by us.
          SmallVector<Value> ifRet = args.drop_front(2);
          assert(ifRet.size() == args.size() - 2);

          OpBuilder::InsertionGuard guard(builder);
          // If coord >= sliceHi.
          builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
          YIELD(ifRet);

          // If coord < sliceHi.
          builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
          // Delegates to users' callback.
          bodyBuilder(builder, loc, iv, ifRet);
          if (genYield) {
            builder.setInsertionPointToEnd(&ifOp.getThenRegion().front());
            YIELD(ifRet);
          }
        }
        // Marks this special ifOp to avoid sparsification finalizing it.
        ifOp->setAttr(getLoopEmitterLoopAttrName(),
                      StringAttr::get(builder.getContext(), "slice"));
        // Insertion point restored to after ifOp.
        SmallVector<Value> yields;
        // Increase induction variable.
        yields.push_back(ADDI(iv, c1));
        yields.push_back(cont);
        yields.append(ifOp.getResults().begin(), ifOp.getResults().end());
        YIELD(yields);
      });

  builder.setInsertionPointAfter(whileOp);
  return std::make_pair(whileOp, whileOp.getResults().drop_front(2));
}

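// When `firstResLvl` is provided the traversal starts directly from that
// fully resolved position; otherwise the outermost loop walks the cached
// [pLo, pHi] pairs of the front slice. Dense levels in between are linearized
// as pos = pos * lvlSize + iv inside the generated loop nest.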
// Generates a loop nest that traverses all the unresolved levels in between.
// TODO: it can only handle all compressed tensors.
//
// for(int i = 0; i < slicePos.size(); i+=2) {
//   loopLo = slicePos[i];
//   loopHi = slicePos[i + 1];
//
//   // Then the same loop generated by genSliceLvlTraverse above.
//   while (loopLo < loopHi) {
//     if (pos[loopLo] < sliceHi) {
//       bodyBuilder();
//     } else {
//       break;
//     }
//     loopLo ++;
//   }
// }
ValueRange LoopEmitter::genUnResolvedSliceTreeTraverse(
    OpBuilder &builder, Location loc, TensorId tid,
    ArrayRef<const SliceInfo *> unResLvls,
    std::optional<std::pair<TensorId, Level>> firstResLvl, ValueRange userReduc,
    LoopBodyBuilder bodyBuilder) {

  Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
  Value pos = c0;
  OpBuilder::InsertPoint ip;
  SmallVector<Value> innerArgs(userReduc.begin(), userReduc.end());
  scf::ForOp outerMost = nullptr; // the outermost loop.
  if (firstResLvl.has_value()) {
    // Overwrite position when the first level is fully resolved.
    pos = posits[firstResLvl->first][firstResLvl->second];
    ip = builder.saveInsertionPoint();
  } else {
    const SliceInfo &frontSlice = *unResLvls.back();
    Level firstLvl = *frontSlice.slicedOnLvl;
    if (!lvlFullyResolved(tid, firstLvl)) {
      if (isCompressedDLT(lvlTypes[tid][firstLvl])) {
        unsigned depth = frontSlice.depth - 1;
        Value offset = frontSlice.offset;
        Value sPtrBuf = slicePosBuffer[tid][firstLvl][depth];
        Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize
        outerMost = builder.create<scf::ForOp>(
            loc, c2, mSz, c2, innerArgs,
            [this, c1, tid, firstLvl, offset, sPtrBuf, &ip, &pos,
             &innerArgs](OpBuilder &builder, Location loc, Value iv,
                         ValueRange iterArgs) {
              // Generate traversal for each level.
              Value loopLo = genIndexLoad(builder, loc, sPtrBuf, iv);
              Value loopHi = genIndexLoad(builder, loc, sPtrBuf, ADDI(iv, c1));
              ValueRange itArgs =
                  genSliceLvlTraverseLoop(
                      builder, loc, loopLo, loopHi, offset,
                      sliceSizes[tid][firstLvl].back(), tid, firstLvl, iterArgs,
                      false,
                      [&](OpBuilder &builder, Location, Value iv,
                          MutableArrayRef<Value> reduc) {
                        ip = builder.saveInsertionPoint();
                        pos = iv;
                        innerArgs.assign(reduc.begin(), reduc.end());
                      })
                      .second;
              YIELD(itArgs);
            });
      } else if (isDenseDLT(lvlTypes[tid][firstLvl])) {
        assert(firstLvl == 0); // This must be the first level.
        Value lb = frontSlice.offset;
        Value sliceSz =
            sliceSizes[tid][*frontSlice.slicedOnLvl][frontSlice.depth - 1];
        Value ub = ADDI(lb, sliceSz);
        outerMost = builder.create<scf::ForOp>(
            loc, lb, ub, c1, innerArgs,
            [&](OpBuilder &builder, Location loc, Value iv,
                ValueRange iterArgs) {
              ip = builder.saveInsertionPoint();
              pos = iv;
              innerArgs.assign(iterArgs.begin(), iterArgs.end());
            });
      }
      // We generated the loop for the first slice above, now remove it.
      unResLvls = unResLvls.drop_back();
    }
  }
  // Reset the insertion point into the loop body.
  builder.restoreInsertionPoint(ip);
  if (!unResLvls.empty()) {
    // Fills in dense slice levels in between.
    SmallVector<Value> lbs, ubs, steps, lvlSzs;
    for (const SliceInfo *slice : llvm::reverse(unResLvls)) {
      Level sliceLvl = *slice->slicedOnLvl;
      assert(isDenseDLT(lvlTypes[tid][sliceLvl]));
      Value offset = slice->offset;
      Value sliceSz = sliceSizes[tid][sliceLvl][slice->depth - 1];
      lbs.push_back(offset);
      ubs.push_back(ADDI(offset, sliceSz));
      steps.push_back(c1);
      lvlSzs.push_back(lvlSizes[tid][sliceLvl]);
    }
    auto denseNest =
        scf::buildLoopNest(builder, loc, lbs, ubs, steps, innerArgs,
                           [&innerArgs, &lvlSzs, &pos, bodyBuilder](
                               OpBuilder &builder, Location loc, ValueRange ivs,
                               ValueRange iterArgs) -> scf::ValueVector {
                             for (auto em : llvm::enumerate(ivs)) {
                               // Linearizes position: pos = (pos * lvlsize) +
                               // iv;
                               pos = MULI(pos, lvlSzs[em.index()]);
                               pos = ADDI(pos, em.value());
                             }
                             innerArgs.assign(iterArgs.begin(), iterArgs.end());
                             // Generates the user-requested loop body.
                             bodyBuilder(builder, loc, pos, innerArgs);
                             return innerArgs;
                           });

    if (!outerMost) {
      // If the outermost loop has not been set, this is the outermost loop.
      outerMost = denseNest.loops.front();
    } else {
      // Otherwise we need to generate yield operations to link the SSA chain.
      YIELD(denseNest.results);
    }
  } else {
    assert(outerMost);
    // Generates the user-requested loop body.
    bodyBuilder(builder, loc, pos, innerArgs);
    YIELD(innerArgs);
  }
  assert(outerMost);
  // Insert after current while operation.
  builder.setInsertionPointAfter(outerMost);
  return outerMost.getResults();
}

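// Layout sketch (inferred from the stores below and in
// genUnResolvedSliceBegin): slicePosBuffer[tid][lvl][depth] holds
//   [memSize, idx, pLo0, pHi0, pLo1, pHi1, ...]
// i.e., two metadata slots followed by one [pLo, pHi] pair per cached segment.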
void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc,
                                        TensorId tid, Level lvl) {
  Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2), c3 = C_IDX(3),
        c4 = C_IDX(4);
  if (isDenseDLT(lvlTypes[tid][lvl])) {
    // Dense slice begin is trivial.
    sliceStack[tid].emplace_back(/*minCoord=*/c0, /*offset=*/c0,
                                 /*nonEmpty=*/constantI1(builder, loc, true),
                                 lvl, /*depth=*/1);
    return;
  }
  Value size = sliceSizes[tid][lvl][0];
  Value sPtrBuf = slicePosBuffer[tid][lvl][0];
  Value pHi, pLo;
  if (lvl == 0) {
    pLo = c0;
    pHi = genIndexLoad(builder, loc, positionsBuffers[tid][0], c1);
  } else {
    pLo = genIndexLoad(builder, loc, positionsBuffers[tid][lvl],
                       posits[tid][lvl - 1]);
    pHi = genIndexLoad(builder, loc, positionsBuffers[tid][lvl],
                       ADDI(posits[tid][lvl - 1], c1));
  }
  // Fills out pIdxBuffer[tid][lvl][0] with [/*memSize =*/4, /*idx =*/0, pLo, pHi].
  builder.create<memref::StoreOp>(loc, c4, sPtrBuf, c0);  // memSize = 4
  builder.create<memref::StoreOp>(loc, c0, sPtrBuf, c1);  // index = 0
  builder.create<memref::StoreOp>(loc, pLo, sPtrBuf, c2); // pLo
  builder.create<memref::StoreOp>(loc, pHi, sPtrBuf, c3); // pHi

  // This is a non-empty tensor if 0 < pHi.
  Value isNonEmpty = CMPI(ult, c0, pHi);
  // On an ordered level, the minimal coord must be the first one.
  // FIXME: Technically we should load the coord only when the slice is
  // non-empty. Though we assume that even for empty sparse tensors, a
  // non-empty ptr/idx buffer is allocated for each level, so it would not
  // cause OOB; this lets us avoid generating an ifOp here.
  Value minCrd = genIndexLoad(builder, loc, coordinatesBuffers[tid][0], c0);

  // FIXME: We need the relative offset related to the base slice.
  Value absOffset = offsetFromMinCoord(builder, loc, minCrd, size, isNonEmpty);
  sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, lvl, /*depth=*/1);
}

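// Offset computation sketch (assuming the semantics of the earlier
// `offsetFromMinCoord` helper): the absolute offset is chosen so that the
// slice still covers minCrd, roughly
//   absOffset = (isNonEmpty && minCrd >= size) ? minCrd - size + 1 : 0.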
// Fills in the slicePosBuffer before slice-driven loop begin.
// TODO: it can only handle all compressed tensors.
//
// // Loop generated by `genUnResolvedSliceTreeTraverse`
// for(int i = 0; i < slicePos.size(); i+=2) {
//   loopLo = slicePos[i];
//   loopHi = slicePos[i + 1];
//   minCrd = max;
//   while (loopLo < loopHi) {
//     if (pos[loopLo] < sliceHi) {
//       // bodyBuilder
//       slicePos[tid].push_back(pos[loopLo]);
//       slicePos[tid].push_back(pos[loopLo + 1]);
//       minCrd = min(minCrd, crd[pos[loopLo]]);
//     } else {
//       break;
//     }
//     loopLo ++;
//   }
// }
void LoopEmitter::genUnResolvedSliceBegin(OpBuilder &builder, Location loc,
                                          TensorId tid, Level lvl) {
  Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
  unsigned depth = levelReducedDep[tid][lvl];
  Value size = sliceSizes[tid][lvl][depth];
  // Dense slice begin is trivial.
  if (isDenseDLT(lvlTypes[tid][lvl])) {
    sliceStack[tid].emplace_back(c0, c0, constantI1(builder, loc, false), lvl,
                                 depth + 1);
    return;
  }

  assert(isCompressedDLT(lvlTypes[tid][lvl]));
  // Unhandled cases:
  //
  // 1st, lvl = prevSlicedLvl, i.e., t[d0 + d1 + d2,...] (more than one
  // variable needs to be reduced on the same level).
  //
  // 2nd, lvl > prevSliceLvl + 1, i.e., t[..., d2, d3 + d4] (having a
  // simple dim expression in between).
  assert(lvl == *sliceStack[tid].back().slicedOnLvl + 1);

  // Check slice stack integrity.
  assert(slicePosBuffer[tid][lvl - 1].size() == sliceStack[tid].back().depth);

  SmallVector<const SliceInfo *> unResSlices;
  std::optional<std::pair<TensorId, Level>> firstResLvl;
  for (Level curLvl = lvl; curLvl >= 1; curLvl--) {
    Level prevLvl = curLvl - 1;
    if (lvlFullyResolved(tid, prevLvl)) {
      firstResLvl = std::make_pair(tid, prevLvl);
      break;
    }
    unResSlices.push_back(&getMostRecentSliceOnLvl(tid, prevLvl));
    if (!isDenseDLT(lvlTypes[tid][prevLvl])) {
      break;
    }
  }

  assert(!unResSlices.empty() &&
         !lvlFullyResolved(tid, *unResSlices.front()->slicedOnLvl));

  Value sPtrBuf = slicePosBuffer[tid][lvl].back();
  SmallVector<Value, 3> reduc = {
      constantI1(builder, loc, false), // isNonEmpty
      lvlSizes[tid][lvl],              // minCoord
      c2,                              // memSize
  };

  ValueRange result = genUnResolvedSliceTreeTraverse(
      builder, loc, tid, unResSlices, firstResLvl, reduc,
      [this, c1, c2, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc,
                                        Value iv,
                                        MutableArrayRef<Value> reduc) {
        Value &nonEmpty = reduc[0];
        Value &minCrd = reduc[1];
        Value &curMemSz = reduc[2];

        Value pHi = ADDI(iv, c1);
        Value sPLo = genIndexLoad(builder, loc, positionsBuffers[tid][lvl], iv);
        Value sPHi =
            genIndexLoad(builder, loc, positionsBuffers[tid][lvl], pHi);

        // isNonEmpty = isNonEmpty || lvlNonEmpty, i.e., as long as there is
        // one non-empty lvl, the slice is non-empty.
        Value lvlNonEmpty = CMPI(ult, sPLo, sPHi);
        nonEmpty = builder.create<arith::OrIOp>(loc, lvlNonEmpty, nonEmpty);

        // Update the minimum coordinate.
        auto ifNonEmpty = builder.create<scf::IfOp>(loc, builder.getIndexType(),
                                                    lvlNonEmpty, true);
        {
          // Generate code as follows.
          //
          // if (nonEmpty) {
          //   minCrd = min(minCrd, crd[pos[pLo]]);
          // }
          OpBuilder::InsertionGuard guard(builder);
          builder.setInsertionPointToStart(ifNonEmpty.thenBlock());
          Value curC =
              genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], sPLo);
          Value isSmaller = CMPI(ult, curC, minCrd);
          Value newMin = SELECT(isSmaller, curC, minCrd);
          YIELD(newMin);
          builder.setInsertionPointToStart(ifNonEmpty.elseBlock());
          YIELD(minCrd);
        }
        minCrd = ifNonEmpty.getResult(0);
        builder.create<memref::StoreOp>(loc, sPLo, sPtrBuf, curMemSz);
        Value nxtMemSize = ADDI(curMemSz, c1);
        builder.create<memref::StoreOp>(loc, sPHi, sPtrBuf, nxtMemSize);
        // curMemSize += 2
        curMemSz = ADDI(curMemSz, c2);
      });

  Value isNonEmpty = result[0];
  Value minCrd = result[1];
  // Two metadata [memSize, idx].
  // TODO: Can use an SSA value for these two metadata.
  builder.create<memref::StoreOp>(loc, result[2], sPtrBuf, c0);
  builder.create<memref::StoreOp>(loc, c0, sPtrBuf, c1);
  // FIXME: we need the relative offset related to the base slice.
  Value absOffset = offsetFromMinCoord(builder, loc, minCrd, size, isNonEmpty);
  sliceStack[tid].emplace_back(minCrd, absOffset, isNonEmpty, lvl, depth + 1);
}

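// Summary (inferred from the control flow below): returns true when the
// dependencies on this level are already fully reduced, i.e., the level is
// dense or can reuse the cached [pLo, pHi] pair, so no slice-driven loop is
// needed; otherwise it emits the first slice for the level and returns false.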
bool LoopEmitter::genSliceBegin(OpBuilder &builder, Location loc, TensorId tid,
                                Level lvl) {
  Value c1 = C_IDX(1), c2 = C_IDX(2);

  if (depFullyReduced(tid, lvl)) {
    // No need to prepare for a slice-driven loop on a dense level after it is
    // fully reduced.
    if (isDenseDLT(lvlTypes[tid][lvl]))
      return true;
    // If the constraints on the tensor are fully resolved, we do not need to
    // generate the slice begin any more; instead we fall back to the
    // TACO-based algorithm to (co)iterate over the slice.
    Value pLoPtr =
        genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), c1);
    pLoPtr = ADDI(pLoPtr, c2);
    Value pHiPtr = ADDI(pLoPtr, c1);
    posits[tid][lvl] =
        genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), pLoPtr);
    highs[tid][lvl] =
        genIndexLoad(builder, loc, slicePosBuffer[tid][lvl].back(), pHiPtr);
    return true;
  }

  // Only when the level is sorted can the next non-empty slice be computed
  // efficiently.
  const DimLevelType lvlType = lvlTypes[tid][lvl];
  assert(isOrderedDLT(lvlType));
  if (isSingletonDLT(lvlType)) {
    llvm_unreachable("TODO: dense level should be easy to support, while "
                     "singleton level requires more effort");
  }

  assert(!dependentLvlMap[tid][lvl].empty());
  assert(!sliceStack[tid].empty());

  const SliceInfo &sliceInfo = sliceStack[tid].back();
  auto baseEnc = getSparseTensorEncoding(tensors[tid].getType());
  if (baseEnc.isSlice())
    llvm_unreachable("TODO: not yet implemented");

  // Generate the caches required to quickly compute the next non-empty slice
  // with increasing offset for the slice-based loop.
  // We do not need a cache for dense levels.
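  // Sizing example (hypothetical numbers): for lvl == 1 with an unresolved
  // compressed level 0 whose slice size is d0, the loop below accumulates
  // bufSize = d0, which then becomes d0 * 2 + 2 after accounting for the
  // [pLo, pHi] pairs and the two metadata slots.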
  if (slicePosBuffer[tid][lvl][0] == nullptr && !isDenseDLT(lvlType)) {
    OpBuilder::InsertionGuard guard(builder);
    // The buffer can be reused, and the size is loop invariant: it only
    // depends on the iteration graph's toposort.
    builder.setInsertionPointAfter(localInsertPos);
    Value bufSize = C_IDX(1);
    Value c2 = C_IDX(2);
    // Accumulates the size required to cache the pLo for the slice.
    // E.g., if we want to cache the pIdx for slice<d0xd1xf64> on the second
    // level, we at most need a memref<d0xindex>.
    // NOTE: this is apparently an over-approximation when the previous
    // level is compressed, and we can compute a precise memory size
    // inside the loops. But that would also require us to allocate/free
    // memory in loops.
    // TODO: Maybe use allocaScopeOp inside the loop to resolve the issue?
    for (Level curLevel = lvl;
         curLevel >= 1 && !lvlFullyResolved(tid, curLevel - 1); curLevel--) {
      auto depth = remDepOnLevel(tid, curLevel - 1);
      assert(sliceSizes[tid][lvl].size() >= depth);
      Value sz = *(sliceSizes[tid][lvl].rbegin() + depth - 1);
      bufSize = MULI(bufSize, sz);
    }
    // For a pair of [pLo, pHi]. Note that we cannot compress pHi because the
    // slice creates segments in the index buffer so that the pHi for the
    // current level is no longer the pLo for the next level.
    bufSize = MULI(bufSize, c2);
    // Additional two metadata {memSize, idx} at head.
    bufSize = ADDI(bufSize, c2);
    llvm::for_each(
        slicePosBuffer[tid][lvl], [bufSize, loc, &builder](Value &cache) {
          cache = genAlloca(builder, loc, bufSize, builder.getIndexType());
        });
  }

  if (sliceInfo.isInitialTensor() ||
      (lvl >= 1 && lvlFullyResolved(tid, lvl - 1))) {
    // First level, or the previous level has been fully resolved.
    genResolvedSliceBegin(builder, loc, tid, lvl);
  } else {
    // The previous level has not been fully resolved.
    genUnResolvedSliceBegin(builder, loc, tid, lvl);
  }
  return false;
}

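// Resets the `idx` metadata slot (position 1 of the slice position buffer) to
// zero for every sliced, non-dense level up to and including `lvl`.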
void LoopEmitter::invalidateSliceIterIdx(OpBuilder &builder, Location loc,
                                         TensorId tid, Level lvl) {
  for (unsigned i = 0; i <= lvl; i++) {
    if (!isDenseDLT(lvlTypes[tid][i]) && !dependentLvlMap[tid][i].empty()) {
      builder.create<memref::StoreOp>(loc, C_IDX(0),
                                      slicePosBuffer[tid][i].back(), C_IDX(1));
    }
  }
}

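// Worked example of the forwarding below (illustrative numbers only): with
// slice size 3, minCrd == 5 and offset == 2, the fast path fires
// (minCrd > offset) and the next offset is simply 3. Once minCrd == offset,
// the slow path advances every cached position whose coordinate equals
// minCrd, recomputes minCrd, and derives the next offset from it.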
void LoopEmitter::genSliceNextInduction(OpBuilder &builder, Location loc,
                                        const Operation *op, TensorId tid,
                                        Level lvl,
                                        SmallVectorImpl<Value> &operands,
                                        unsigned &retIdx) {
  if (!isCompressedDLT(lvlTypes[tid][lvl]))
    llvm_unreachable("TODO");

  // Else generate code to compute the next non-empty slice.
  Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);

  auto whileOp = llvm::cast<scf::WhileOp>(op);
  SliceInfo &info = sliceStack[tid].back();
  assert(info.slicedOnLvl == lvl);
  //
  // We forward to the next non-empty slice by
  // if (minCrd > offset) {
  //   offset += 1
  // } else {
  //   minCrd = nextMinInSlice();
  //   offset = minCrd - size + 1;
  // }
  //
  // if (offset + size > parents.size)
  //   isNonEmpty = false;
  //
  Value absOffset = info.offset;
  // Resets slice pointers as the resolved slices are invalidated after we
  // move forward to the next slice.
  invalidateSliceIterIdx(builder, loc, tid, lvl);

  SmallVector<Value, 3> reduc = {info.minCrd, info.isNonEmpty, absOffset};
  Value sPtrBuf = slicePosBuffer[tid][lvl][info.depth - 1];
  Value fastPathP = CMPI(ugt, info.minCrd, absOffset);
  auto ifOp = builder.create<scf::IfOp>(loc, ValueRange(reduc).getTypes(),
                                        fastPathP, true);
  {
    OpBuilder::InsertionGuard guard(builder);
    // Take the fast path
    // if (minCrd > offset) {
    //   return offset += 1
    // }
    builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
    reduc[2] = ADDI(absOffset, c1);
    // Yield offset + 1.
    YIELD(reduc);

    // else /*minCrd == offset*/ {
    //   for (i = 0; i < slicePos.size(); i+=2) {
    //     if (crd[pos[slicePos[i]]] == minCrd) {
    //       slicePos[i]++;
    //     }
    //     minCrd=min(minCrd, crd[pos[slicePos[i]]]);
    //   }
    //   offset = minCrd - size + 1;
    // }
    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
    reduc[2] = absOffset; // restore value.
    Value pSt = c2;       // pointer starting index
    Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize
    reduc[0] = lvlSizes[tid][lvl];                       // next min coord
    reduc[1] = constantI1(builder, loc, false);          // isNonEmpty
    auto loopArgs = static_cast<ValueRange>(reduc).drop_back();
    auto forOp = scf::buildLoopNest(
        builder, loc, pSt, mSz, c2, loopArgs,
        [this, tid, lvl, c1, sPtrBuf,
         &info](OpBuilder &builder, Location loc, ValueRange ivs,
                ValueRange iterArgs) -> scf::ValueVector {
          Value curMinCrd = iterArgs[0];
          Value isNonEmpty = iterArgs[1];

          Type idxTp = builder.getIndexType();
          Value pLo = genIndexLoad(builder, loc, sPtrBuf, ivs.front());
          Value pHi =
              genIndexLoad(builder, loc, sPtrBuf, ADDI(ivs.front(), c1));
          //
          // if (pLo < pHi) // Only loads when inbound.
          //   coord = load[pLo]
          //   if coord == minCrd
          //     pLo += 1
          //
          // if (pLo < pHi)
          //   curMinCrd = min(curMinCrd, load[pLo])
          //
          Value pred = CMPI(ult, pLo, pHi);
          auto advPLo = builder.create<scf::IfOp>(loc, idxTp, pred, true);
          /* if pLo < pHi */ {
            builder.setInsertionPointToStart(&advPLo.getThenRegion().front());
            // coord = load[pLo]
            Value coord =
                genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo);
            Value pred = CMPI(eq, coord, info.minCrd);
            auto ifEqual = builder.create<scf::IfOp>(loc, idxTp, pred, true);
            /* if coord == minCrd */ {
              builder.setInsertionPointToStart(
                  &ifEqual.getThenRegion().front());
              Value newPlo = ADDI(pLo, c1);
              // Updates the cache.
              builder.create<memref::StoreOp>(loc, newPlo, sPtrBuf,
                                              ivs.front());
              YIELD(newPlo);
            }
            /* else coord != minCrd */ {
              builder.setInsertionPointToStart(
                  &ifEqual.getElseRegion().front());
              YIELD(pLo);
            }
            builder.setInsertionPointAfter(ifEqual);
            YIELD(ifEqual.getResults());
          }
          /* else pLo >= pHi */ {
            builder.setInsertionPointToStart(&advPLo.getElseRegion().front());
            YIELD(pLo);
          }

          builder.setInsertionPointAfter(advPLo);
          pLo = advPLo.getResult(0);
          Value lvlNonEmpty = CMPI(ult, pLo, pHi);
          // Update minCrd.
          auto newMin =
              builder.create<scf::IfOp>(loc, idxTp, lvlNonEmpty, true);
          builder.setInsertionPointToStart(&newMin.getThenRegion().front());
          YIELD(genIndexLoad(builder, loc, coordinatesBuffers[tid][lvl], pLo));

          builder.setInsertionPointToStart(&newMin.getElseRegion().front());
          YIELD(curMinCrd);
          builder.setInsertionPointAfter(newMin);

          // isNonEmpty = isNonEmpty || lvlNonEmpty
          isNonEmpty =
              builder.create<arith::OrIOp>(loc, lvlNonEmpty, isNonEmpty);
          curMinCrd = builder.create<arith::SelectOp>(
              loc, CMPI(ult, newMin.getResult(0), curMinCrd),
              newMin.getResult(0), curMinCrd);
          return {curMinCrd, isNonEmpty};
        });

    builder.setInsertionPointAfter(forOp.loops.front());
    // minOffset = minCrd + 1 >= size ? minCrd + 1 - size : c0
    Value tmp = ADDI(forOp.results.front(), c1);
    Value minOffset = SUBI(tmp, sliceSizes[tid][lvl][info.depth - 1]);
    Value p = CMPI(uge, tmp, sliceSizes[tid][lvl][info.depth - 1]);
    minOffset = SELECT(p, minOffset, c0);
    SmallVector<Value, 3> yields;
    yields.assign(forOp.results.begin(), forOp.results.end());
    yields.push_back(minOffset);
    YIELD(yields);
  }

  Value nextMinCrd = ifOp.getResults()[0];
  Value nextNonEmpty = ifOp.getResults()[1];

  // The next offset should be at least offset + 1.
  Value minOffset = ifOp.getResults()[2];
  Value nxOffset = ADDI(info.offset, c1);
  Value maxPred = CMPI(ugt, minOffset, nxOffset);
  Value nextAbsOffset = SELECT(maxPred, minOffset, nxOffset);

  Value sliceUB = ADDI(nextAbsOffset, sliceSizes[tid][lvl][info.depth - 1]);

  // FIXME: this only works if there is only one parent.
  assert(info.depth - 1 == 0);
  // nextNonEmpty = nextNonEmpty && slice upper bound <= parent upper bound.
  nextNonEmpty = ANDI(nextNonEmpty, CMPI(ule, sliceUB, lvlSizes[tid][lvl]));

  // FIXME: compute relative offset.
  assert(info.depth - 1 == 0);
  Value nextRelOffset = nextAbsOffset;
  nextRelOffset = SELECT(nextNonEmpty, nextRelOffset, c0);

  operands.push_back(nextNonEmpty);
  operands.push_back(nextMinCrd);
  operands.push_back(nextAbsOffset); // we push the absolute offset.

  // Update the slice stack.
  info.isNonEmpty = whileOp.getResult(retIdx++);
  info.minCrd = whileOp.getResult(retIdx++);
  info.offset = whileOp.getResult(retIdx++);
}

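// The while loop generated below carries [isNonEmpty, minCrd, offset] followed
// by the user reduction values and keeps iterating as long as the slice is
// non-empty; inside the body, coords[tid][lvl] is set to the slice offset so
// that other tensors can coiterate against it.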
Operation *LoopEmitter::emitSliceDrivenLoopOverTensorAtLvl(
    OpBuilder &builder, Location loc, TensorId tid, Level lvl,
    MutableArrayRef<Value> reduc) {
  assert(!depFullyReduced(tid, lvl));
  SliceInfo &sliceInfo = sliceStack[tid].back();
  assert(sliceInfo.slicedOnLvl == lvl);

  // The order matters!
  SmallVector<Value, 3> operands{sliceInfo.isNonEmpty, sliceInfo.minCrd,
                                 sliceInfo.offset};
  // Number of reductions maintained by us.
  size_t numMetaReduc = operands.size();

  // Append user-required reduction values.
  operands.append(reduc.begin(), reduc.end());
  assert(operands.size() == numMetaReduc + reduc.size());

  // while (slice.nonEmpty()) {
  //   bodyBuilder();
  //   SliceNext();
  // }
  auto whileOp = builder.create<scf::WhileOp>(
      loc, ValueRange(operands).getTypes(), operands,
      /*beforeBuilder=*/
      [](OpBuilder &builder, Location loc, ValueRange args) {
        builder.create<scf::ConditionOp>(loc, /*isNonEmpty*/ args[0], args);
      },
      /*afterBuilder=*/
      [this, tid, lvl, reduc, numMetaReduc,
       &sliceInfo](OpBuilder &builder, Location loc, ValueRange args) {
        assert(args.size() == reduc.size() + numMetaReduc);
        sliceInfo.isNonEmpty = args[0];
        sliceInfo.minCrd = args[1];
        sliceInfo.offset = args[2];
        // The slice offset is used to coiterate with other tensors'
        // coordinates.
        Value c = sliceInfo.offset;
        if (sliceInfo.depth > 1) {
          // Coord is the relative offset related to its parents.
          // Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
          llvm_unreachable("TODO: not yet implemented");
        }
        coords[tid][lvl] = c;

        for (unsigned i = 0, e = reduc.size(); i < e; i++)
          reduc[i] = args[i + numMetaReduc];
      });

  // Set the insertion point to the while loop body.
  builder.setInsertionPointToEnd(&whileOp.getAfter().front());
  return whileOp;
}

#undef CMPI
#undef C_IDX
#undef YIELD
#undef ADDI
#undef ANDI
#undef SUBI
#undef MULI
#undef SELECT