
It was disabled because there could be artificial padding. After
[refining the pack op semantics](773e158c64), we can assume that there is no
artificial padding. Thus, the check can be removed, and we can unconditionally
enable consumer fusion when it is a perfect tiling case.
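
For example (an illustrative sketch, not taken from this patch), fusing a pack
op as a consumer is a perfect tiling case when the producer's tile size is a
multiple of the pack op's inner tile size:

  // Producer tiled with tile size 16 along d0; the pack below uses
  // inner_tiles = [8], so 16 % 8 == 0 and consumer fusion applies.
  %packed = linalg.pack %producer_result
      inner_dims_pos = [0] inner_tiles = [8]
      into %dest : tensor<64xf32> -> tensor<8x8xf32>
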
Signed-off-by: hanhanW <hanhan0912@gmail.com>
//===- TilingInterfaceImpl.cpp - Implementation of TilingInterface -------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h"
|
|
|
|
#include "mlir/Analysis/SliceAnalysis.h"
|
|
#include "mlir/Dialect/Affine/IR/AffineOps.h"
|
|
#include "mlir/Dialect/Affine/Utils.h"
|
|
#include "mlir/Dialect/Arith/IR/Arith.h"
|
|
#include "mlir/Dialect/Arith/Utils/Utils.h"
|
|
#include "mlir/Dialect/Linalg/IR/Linalg.h"
|
|
#include "mlir/Dialect/Linalg/Utils/Utils.h"
|
|
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
|
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
|
#include "mlir/Dialect/Utils/IndexingUtils.h"
|
|
#include "mlir/Dialect/Utils/StaticValueUtils.h"
|
|
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
|
|
#include "mlir/IR/BuiltinTypeInterfaces.h"
|
|
#include "mlir/Interfaces/TilingInterface.h"
|
|
#include "mlir/Interfaces/ValueBoundsOpInterface.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include <optional>
|
|
|
|
#define DEBUG_TYPE "linalg-tiling-interface-impl"
|
|
|
|
using namespace mlir;
|
|
using namespace mlir::linalg;
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// Utility methods for implementation of Tiling Interface for Linalg ops
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// Return the SSA values that represent the data point accessed using a given
|
|
/// `indexingMap` for a given point in the iteration space represented by `ivs`.
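/// For example (illustrative), for an indexing map (d0, d1) -> (d1, d0) and
/// ivs (%i, %j), the returned indices are (%j, %i).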
|
|
static SmallVector<Value> getIndicesForAccess(OpBuilder &b, Location loc,
|
|
AffineMap indexingMap,
|
|
ValueRange ivs) {
|
|
SmallVector<Value> indices;
|
|
indices.reserve(indexingMap.getNumResults());
|
|
for (auto result : indexingMap.getResults()) {
|
|
AffineMap m = AffineMap::get(indexingMap.getNumDims(),
|
|
indexingMap.getNumSymbols(), result);
|
|
Value v = affine::AffineApplyOp::create(b, loc, m, ivs);
|
|
indices.push_back(v);
|
|
}
|
|
return indices;
|
|
}
|
|
|
|
/// Method to inline the payload of a `linalgOp` given the iteration space
|
|
/// point and values for the arguments of the payload.
|
|
static LogicalResult inlinePayload(OpBuilder &b, LinalgOp linalgOp,
|
|
ValueRange ivs, ValueRange argValues) {
|
|
Block *body = linalgOp.getBlock();
|
|
IRMapping map;
|
|
map.map(body->getArguments(), argValues);
|
|
for (auto &op : body->without_terminator()) {
|
|
if (auto indexOp = dyn_cast<IndexOp>(&op)) {
|
|
map.map(indexOp.getResult(), ivs[indexOp.getDim()]);
|
|
continue;
|
|
}
|
|
b.clone(op, map);
|
|
}
|
|
|
|
Operation *terminator = body->getTerminator();
|
|
Location loc = terminator->getLoc();
|
|
for (const auto &operand : llvm::enumerate(terminator->getOperands())) {
|
|
Value toStore = map.lookupOrDefault(operand.value());
|
|
OpOperand *storeInto = linalgOp.getDpsInitOperand(operand.index());
|
|
auto indices = getIndicesForAccess(
|
|
b, loc, linalgOp.getMatchingIndexingMap(storeInto), ivs);
|
|
memref::StoreOp::create(b, loc, toStore,
|
|
linalgOp.getDpsInitOperand(operand.index())->get(),
|
|
indices);
|
|
}
|
|
return success();
|
|
}
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// External Model for implementing `TilingInterface` for `LinalgOp`s.
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
namespace {
|
|
/// External model implementation of TilingInterface for LinalgOps. An external
/// model implementation is used for now till the use of `TilingInterface` is
/// on par with the current Linalg tiling + fusion patterns. Once it is, it may
/// be possible to move this into the op definition (though there are
/// advantages to leaving it as an external model).
|
|
template <typename LinalgOpTy>
|
|
struct LinalgOpTilingInterface
|
|
: public TilingInterface::ExternalModel<LinalgOpTilingInterface<LinalgOpTy>,
|
|
LinalgOpTy> {
|
|
/// Return the loop iterator type.
|
|
SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
|
|
LinalgOpTy concreteOp = cast<LinalgOpTy>(op);
|
|
return concreteOp.getIteratorTypesArray();
|
|
}
|
|
|
|
/// Return the iteration domain range.
|
|
SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const {
|
|
OpBuilder::InsertionGuard g(b);
|
|
b.setInsertionPoint(op);
|
|
Location loc = op->getLoc();
|
|
LinalgOp linalgOp = cast<LinalgOp>(op);
|
|
SmallVector<OpFoldResult> allShapesSizes =
|
|
linalgOp.createFlatListOfOperandDims(b, loc);
|
|
AffineMap map = linalgOp.getShapesToLoopsMap();
|
|
|
|
return llvm::to_vector(
|
|
llvm::map_range(map.getResults(), [&](AffineExpr loopExpr) {
|
|
OpFoldResult ofr = affine::makeComposedFoldedAffineApply(
|
|
b, loc, loopExpr, allShapesSizes);
|
|
return Range{b.getIndexAttr(0), ofr, b.getIndexAttr(1)};
|
|
}));
|
|
}
|
|
|
|
/// Instantiate the tiled implementation of the operation.
|
|
FailureOr<TilingResult>
|
|
getTiledImplementation(Operation *op, OpBuilder &b,
|
|
ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes) const {
|
|
// Leave the `sizeBounds` value empty. That is only needed when the `sizes`
|
|
// specified could lead to out of bounds accesses.
|
|
Location loc = op->getLoc();
|
|
LinalgOp linalgOp = cast<LinalgOp>(op);
|
|
SmallVector<Value> valuesToTile = linalgOp->getOperands();
|
|
SmallVector<Value> tiledOperands = makeTiledShapes(
|
|
b, loc, linalgOp, valuesToTile, offsets, sizes, {}, true);
|
|
SmallVector<Operation *> generatedSlices = llvm::map_to_vector(
|
|
llvm::make_filter_range(
|
|
tiledOperands,
|
|
[](Value v) -> bool {
|
|
return isa_and_nonnull<tensor::ExtractSliceOp, memref::SubViewOp>(
|
|
v.getDefiningOp());
|
|
}),
|
|
[](Value v) -> Operation * { return v.getDefiningOp(); });
|
|
|
|
SmallVector<Type> resultTensorTypes =
|
|
getTensorOutputTypes(linalgOp, tiledOperands);
|
|
|
|
Operation *tiledOp = clone(b, linalgOp, resultTensorTypes, tiledOperands);
|
|
offsetIndices(b, cast<LinalgOp>(tiledOp), offsets);
|
|
|
|
return TilingResult{
|
|
{tiledOp}, SmallVector<Value>(tiledOp->getResults()), generatedSlices};
|
|
}
|
|
|
|
/// Utility to fetch the offsets and sizes when applied as per the indexing
|
|
/// map of the linalg op. This helps in fusing the linalg op as a consumer of
|
|
/// a given slice op.
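/// For example (illustrative), for an operand with indexing map
/// (d0, d1) -> (d1, d0) and operand offsets [%a, %b] and sizes [%sa, %sb],
/// the mapped iteration-space offsets/sizes are [%b, %a] and [%sb, %sa];
/// loop dimensions not covered by any operand fall back to the full
/// iteration domain.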
|
|
static LogicalResult
|
|
getMappedOffsetAndSize(LinalgOp linalgOp, OpBuilder &b,
|
|
ArrayRef<AffineMap> indexingMaps,
|
|
ArrayRef<SmallVector<OpFoldResult>> allOffsets,
|
|
ArrayRef<SmallVector<OpFoldResult>> allSizes,
|
|
SmallVectorImpl<OpFoldResult> &mappedOffsetsVec,
|
|
SmallVectorImpl<OpFoldResult> &mappedSizesVec) {
|
|
DenseMap<unsigned, OpFoldResult> mappedOffsets, mappedSizes;
|
|
|
|
for (auto [indexingMap, offsets, sizes] :
|
|
llvm::zip_equal(indexingMaps, allOffsets, allSizes)) {
|
|
for (auto [resultExpr, offset, size] :
|
|
llvm::zip_equal(indexingMap.getResults(), offsets, sizes)) {
|
|
auto dimExpr = dyn_cast<AffineDimExpr>(resultExpr);
|
|
if (!dimExpr)
|
|
continue;
|
|
unsigned position = dimExpr.getPosition();
|
|
auto it = mappedOffsets.find(position);
|
|
if (it != mappedOffsets.end()) {
|
|
OpFoldResult seenOffset = it->second;
|
|
OpFoldResult seenSize = mappedSizes.lookup(position);
|
|
if (seenOffset != offset || seenSize != size) {
|
|
LLVM_DEBUG({
|
|
llvm::dbgs() << "inconsistent iteration space mapping from "
|
|
"offsets/sizes of operands/results";
|
|
});
|
|
return failure();
|
|
}
|
|
} else {
|
|
mappedOffsets[position] = offset;
|
|
mappedSizes[position] = size;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Aggregate from the given operand offsets and sizes, or default to
|
|
// iteration space values.
|
|
SmallVector<Range> iterationDomain =
|
|
cast<TilingInterface>(linalgOp.getOperation()).getIterationDomain(b);
|
|
mappedOffsetsVec.resize(iterationDomain.size());
|
|
mappedSizesVec.resize(iterationDomain.size());
|
|
for (auto [index, domain] : llvm::enumerate(iterationDomain)) {
|
|
auto it = mappedOffsets.find(index);
|
|
if (it != mappedOffsets.end()) {
|
|
mappedOffsetsVec[index] = it->second;
|
|
mappedSizesVec[index] = mappedSizes.lookup(index);
|
|
continue;
|
|
}
|
|
mappedOffsetsVec[index] = domain.offset;
|
|
mappedSizesVec[index] = domain.size;
|
|
}
|
|
return success();
|
|
}
|
|
|
|
/// Method to return the position of the iteration domain tile computed from
/// the tiles of the given operands.
|
|
LogicalResult getIterationDomainTileFromOperandTiles(
|
|
Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers,
|
|
ArrayRef<SmallVector<OpFoldResult>> allOffsets,
|
|
ArrayRef<SmallVector<OpFoldResult>> allSizes,
|
|
SmallVectorImpl<OpFoldResult> &iterDomainOffsets,
|
|
SmallVectorImpl<OpFoldResult> &iterDomainSizes) const {
|
|
auto linalgOp = cast<LinalgOp>(op);
|
|
|
|
std::optional<SmallVector<OpFoldResult>> iterationSpaceOffsets,
|
|
iterationSpaceSizes;
|
|
SmallVector<AffineMap> indexingMaps =
|
|
llvm::map_to_vector(operandNumbers, [&](unsigned operandNumber) {
|
|
OpOperand &opOperand = linalgOp->getOpOperand(operandNumber);
|
|
return linalgOp.getMatchingIndexingMap(&opOperand);
|
|
});
|
|
if (failed(getMappedOffsetAndSize(linalgOp, b, indexingMaps, allOffsets,
|
|
allSizes, iterDomainOffsets,
|
|
iterDomainSizes))) {
|
|
return failure();
|
|
}
|
|
return success();
|
|
}
|
|
|
|
/// Return the details of the output tile generated by the tiled
|
|
/// implementation.
|
|
LogicalResult
|
|
getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber,
|
|
ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes,
|
|
SmallVector<OpFoldResult> &resultOffsets,
|
|
SmallVector<OpFoldResult> &resultSizes) const {
|
|
Location loc = op->getLoc();
|
|
LinalgOp linalgOp = cast<LinalgOp>(op);
|
|
|
|
AffineExpr d0;
|
|
bindDims(b.getContext(), d0);
|
|
SmallVector<OpFoldResult> subShapeSizes =
|
|
llvm::to_vector(llvm::map_range(sizes, [&](OpFoldResult ofr) {
|
|
return affine::makeComposedFoldedAffineApply(b, loc, d0 - 1, ofr);
|
|
}));
|
|
|
|
OpOperand *outOperand = linalgOp.getDpsInitOperand(resultNumber);
|
|
SliceParameters sliceParams = computeSliceParameters(
|
|
b, loc, outOperand->get(), sizes,
|
|
linalgOp.getMatchingIndexingMap(outOperand), offsets,
|
|
/*ubs*/ {}, subShapeSizes, true);
|
|
resultOffsets = sliceParams.offsets;
|
|
resultSizes = sliceParams.sizes;
|
|
return success();
|
|
}
|
|
|
|
LogicalResult getIterationDomainTileFromResultTile(
|
|
Operation *op, OpBuilder &b, unsigned resultNumber,
|
|
ArrayRef<OpFoldResult> offsets, ArrayRef<OpFoldResult> sizes,
|
|
SmallVectorImpl<OpFoldResult> &iterDomainOffsets,
|
|
SmallVectorImpl<OpFoldResult> &iterDomainSizes) const {
|
|
auto linalgOp = cast<LinalgOp>(op);
|
|
|
|
// Check that the indexing map used for the output is a projected
|
|
// permutation. This could be relaxed with a more general approach that can
|
|
// map the offsets and sizes from the result to iteration space tiles
|
|
// (filling in full extent for dimensions not used to access the result).
|
|
AffineMap indexingMap =
|
|
linalgOp.getIndexingMapMatchingResult(op->getResult(resultNumber));
|
|
if (!indexingMap.isProjectedPermutation()) {
|
|
return op->emitOpError(
|
|
"unhandled tiled implementation generation when result is not "
|
|
"accessed using a permuted projection");
|
|
}
|
|
|
|
SmallVector<OpFoldResult> allOffsets = llvm::to_vector(offsets);
|
|
SmallVector<OpFoldResult> allSizes = llvm::to_vector(sizes);
|
|
auto status =
|
|
getMappedOffsetAndSize(linalgOp, b, indexingMap, {allOffsets},
|
|
{allSizes}, iterDomainOffsets, iterDomainSizes);
|
|
(void)status;
|
|
assert(succeeded(status) && "unexpected error in offset calculation");
|
|
return success();
|
|
}
|
|
|
|
FailureOr<TilingResult>
|
|
generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber,
|
|
ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes) const {
|
|
SmallVector<OpFoldResult> mappedOffsets, mappedSizes;
|
|
if (failed(getIterationDomainTileFromResultTile(
|
|
op, b, resultNumber, offsets, sizes, mappedOffsets, mappedSizes))) {
|
|
return failure();
|
|
}
|
|
auto tilingInterfaceOp = cast<TilingInterface>(op);
|
|
FailureOr<TilingResult> tilingResult =
|
|
tilingInterfaceOp.getTiledImplementation(b, mappedOffsets, mappedSizes);
|
|
|
|
if (failed(tilingResult))
|
|
return failure();
|
|
|
|
if (tilingResult->tiledOps.size() != 1)
|
|
return op->emitOpError("failed to generate tiled implementation");
|
|
|
|
return TilingResult{
|
|
tilingResult->tiledOps,
|
|
SmallVector<Value>{tilingResult->tiledValues[resultNumber]},
|
|
tilingResult->generatedSlices};
|
|
}
|
|
|
|
/// Method to generate the tiled implementation of an operation from the tile
|
|
/// of the operand.
|
|
FailureOr<TilingResult> getTiledImplementationFromOperandTiles(
|
|
Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers,
|
|
ArrayRef<SmallVector<OpFoldResult>> allOffsets,
|
|
ArrayRef<SmallVector<OpFoldResult>> allSizes) const {
|
|
SmallVector<OpFoldResult> mappedOffsets, mappedSizes;
|
|
if (failed(getIterationDomainTileFromOperandTiles(
|
|
op, b, operandNumbers, allOffsets, allSizes, mappedOffsets,
|
|
mappedSizes))) {
|
|
return failure();
|
|
}
|
|
return getTiledImplementation(op, b, mappedOffsets, mappedSizes);
|
|
}
|
|
|
|
LogicalResult generateScalarImplementation(Operation *op, OpBuilder &builder,
|
|
Location loc,
|
|
ValueRange ivs) const {
|
|
auto linalgOp = cast<LinalgOp>(op);
|
|
if (!linalgOp.hasPureBufferSemantics())
|
|
return op->emitOpError("expected operation to have buffer semantics");
|
|
|
|
SmallVector<Value> indexedValues;
|
|
indexedValues.reserve(linalgOp->getNumOperands());
|
|
Location linalgOpLoc = op->getLoc();
|
|
/// Load the data corresponding to the block arguments that
|
|
/// represent input operands.
|
|
for (OpOperand &operand : linalgOp->getOpOperands()) {
|
|
if (!linalgOp.payloadUsesValueFromOperand(&operand)) {
|
|
indexedValues.push_back(nullptr);
|
|
continue;
|
|
}
|
|
if (linalgOp.isScalar(&operand)) {
|
|
indexedValues.push_back(operand.get());
|
|
continue;
|
|
}
|
|
SmallVector<Value> indices = getIndicesForAccess(
|
|
builder, linalgOpLoc, linalgOp.getMatchingIndexingMap(&operand), ivs);
|
|
Value load =
|
|
memref::LoadOp::create(builder, linalgOpLoc, operand.get(), indices);
|
|
indexedValues.push_back(load);
|
|
}
|
|
|
|
/// Inline the op payload and store the result.
|
|
return inlinePayload(builder, linalgOp, ivs, indexedValues);
|
|
}
|
|
};
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
// External Model for implementing `PartialReductionInterface` for `LinalgOp`s.
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// In a given set vector, get the position of a particular element.
|
|
std::optional<int> getPositionIn(const llvm::SetVector<unsigned> &reductionDims,
|
|
unsigned value) {
|
|
for (auto [index, reductionDim] : llvm::enumerate(reductionDims)) {
|
|
if (reductionDim == value) {
|
|
return index;
|
|
}
|
|
}
|
|
return std::nullopt;
|
|
}
|
|
|
|
/// Return the AffineMaps to use for the `outs` operands of the linalg op
/// generated for partial results. The new AffineMap is the AffineMap of the
/// untiled op with the reduction dimensions appended at the end, in the order
/// in which they were specified during tiling.
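/// For example (illustrative), if the untiled `outs` map is
///   (d0, d1, d2) -> (d0, d1)
/// and d2 is the tiled reduction dimension, the partial-result map becomes
///   (d0, d1, d2) -> (d0, d1, d2).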
|
|
static SmallVector<AffineMap>
|
|
getPartialResultAffineMaps(LinalgOp linalgOp,
|
|
const SetVector<unsigned> &reductionDims) {
|
|
auto partialReductionMaps = llvm::map_to_vector(
|
|
linalgOp.getDpsInitsMutable(), [&](OpOperand &opOperand) {
|
|
AffineMap map = linalgOp.getMatchingIndexingMap(&opOperand);
|
|
for (auto redPos : reductionDims) {
|
|
map =
|
|
map.insertResult(getAffineDimExpr(redPos, linalgOp.getContext()),
|
|
map.getNumResults());
|
|
}
|
|
return map;
|
|
});
|
|
return partialReductionMaps;
|
|
}
|
|
|
|
struct InitSliceInfo {
|
|
SmallVector<int64_t> resultShape;
|
|
SmallVector<OpFoldResult> offsets;
|
|
SmallVector<OpFoldResult> sizes;
|
|
SmallVector<OpFoldResult> strides;
|
|
};
|
|
|
|
/// Return the result shape, offsets, sizes and strides of the slice of the
|
|
/// `initValue` to use as the destination of the partial reduction op generated
|
|
/// with outer reduction strategy.
|
|
static InitSliceInfo getInitSliceInfoForOuterReduction(
|
|
MLIRContext *context, ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes, const SetVector<unsigned> &reductionDims,
|
|
ArrayRef<OpFoldResult> splitReductionIvs, AffineMap partialReductionMap) {
|
|
int64_t initRank = partialReductionMap.getNumResults();
|
|
SmallVector<OpFoldResult> initOffsets, initSizes;
|
|
Attribute zero = IntegerAttr::get(IndexType::get(context), 0);
|
|
Attribute one = IntegerAttr::get(IndexType::get(context), 1);
|
|
SmallVector<OpFoldResult> initStrides(initRank, one);
|
|
for (AffineExpr dimExpr : partialReductionMap.getResults()) {
|
|
unsigned dim = cast<AffineDimExpr>(dimExpr).getPosition();
|
|
if (reductionDims.contains(dim)) {
|
|
initOffsets.push_back(zero);
|
|
} else {
|
|
initOffsets.push_back(offsets[dim]);
|
|
}
|
|
initSizes.push_back(sizes[dim]);
|
|
}
|
|
SmallVector<int64_t> resultShape;
|
|
std::tie(resultShape, std::ignore) = decomposeMixedValues(initSizes);
|
|
return {resultShape, initOffsets, initSizes, initStrides};
|
|
}
|
|
|
|
/// Return the result shape, offsets, sizes and strides of the slice of the
|
|
/// `initValue` to use as destination of the partial reduction op generated with
|
|
/// outer parallel strategy.
|
|
static InitSliceInfo getInitSliceInfoForOuterParallel(
|
|
MLIRContext *context, ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes, const SetVector<unsigned> &reductionDims,
|
|
ArrayRef<OpFoldResult> splitReductionIvs, AffineMap partialReductionMap) {
|
|
int64_t initRank = partialReductionMap.getNumResults();
|
|
SmallVector<OpFoldResult> initOffsets, initSizes;
|
|
Attribute one = IntegerAttr::get(IndexType::get(context), 1);
|
|
SmallVector<OpFoldResult> initStrides(initRank, one);
|
|
SmallVector<OpFoldResult> resultShape;
|
|
for (AffineExpr dimExpr : partialReductionMap.getResults()) {
|
|
unsigned dim = cast<AffineDimExpr>(dimExpr).getPosition();
|
|
if (std::optional<unsigned> dimPos = getPositionIn(reductionDims, dim)) {
|
|
initOffsets.push_back(splitReductionIvs[dimPos.value()]);
|
|
initSizes.push_back(one);
|
|
} else {
|
|
initOffsets.push_back(offsets[dim]);
|
|
initSizes.push_back(sizes[dim]);
|
|
resultShape.push_back(sizes[dim]);
|
|
}
|
|
}
|
|
SmallVector<int64_t> staticShapes;
|
|
std::tie(staticShapes, std::ignore) = decomposeMixedValues(resultShape);
|
|
return {staticShapes, initOffsets, initSizes, initStrides};
|
|
}
|
|
|
|
/// Return the result shape, offsets, sizes and strides of the slice of the
|
|
/// `initValue` to use as destination of the partial reduction op.
|
|
static InitSliceInfo getInitSliceInfo(MLIRContext *context,
|
|
ReductionTilingStrategy strategy,
|
|
ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes,
|
|
const SetVector<unsigned> &reductionDims,
|
|
ArrayRef<OpFoldResult> splitReductionIvs,
|
|
AffineMap partialReductionMap) {
|
|
if (strategy == ReductionTilingStrategy::PartialReductionOuterReduction) {
|
|
return getInitSliceInfoForOuterReduction(context, offsets, sizes,
|
|
reductionDims, splitReductionIvs,
|
|
partialReductionMap);
|
|
}
|
|
assert(strategy == ReductionTilingStrategy::PartialReductionOuterParallel &&
|
|
"unexpected ReductionTilingStrategy");
|
|
return getInitSliceInfoForOuterParallel(context, offsets, sizes,
|
|
reductionDims, splitReductionIvs,
|
|
partialReductionMap);
|
|
}
|
|
|
|
/// External model implementation of PartialReductionInterface for
|
|
/// LinalgOps.
|
|
template <typename LinalgOpTy>
|
|
struct LinalgOpPartialReductionInterface
|
|
: public PartialReductionOpInterface::ExternalModel<
|
|
LinalgOpPartialReductionInterface<LinalgOpTy>, LinalgOpTy> {
|
|
FailureOr<SmallVector<Value>> generateInitialTensorForPartialReduction(
|
|
Operation *op, OpBuilder &b, Location loc, ArrayRef<OpFoldResult> sizes,
|
|
const SetVector<unsigned> &reductionDims) const {
|
|
auto linalgOp = cast<LinalgOp>(op);
|
|
|
|
OpBuilder::InsertionGuard guard(b);
|
|
if (linalgOp.hasPureBufferSemantics())
|
|
return op->emitOpError("expected operation to have tensor semantics");
|
|
|
|
SmallVector<AffineMap> partialResultMaps =
|
|
getPartialResultAffineMaps(linalgOp, reductionDims);
|
|
|
|
SmallVector<Value> inits;
|
|
for (auto [initIdx, result, partialMap] :
|
|
llvm::enumerate(linalgOp->getResults(), partialResultMaps)) {
|
|
SmallVector<Operation *, 4> combinerOps;
|
|
if (!matchReduction(linalgOp.getRegionOutputArgs(), initIdx,
|
|
combinerOps) ||
|
|
combinerOps.size() != 1)
|
|
return op->emitOpError("Failed to analyze the reduction operation.");
|
|
|
|
Operation *reductionOp = combinerOps[0];
|
|
std::optional<TypedAttr> identity = arith::getNeutralElement(reductionOp);
|
|
if (!identity.has_value())
|
|
return op->emitOpError(
|
|
"Failed to get an identity value for the reduction operation.");
|
|
|
|
// Append the new partial result dimensions.
|
|
SmallVector<OpFoldResult> partialResultShape;
|
|
for (AffineExpr dimExpr : partialMap.getResults()) {
|
|
auto dim = cast<AffineDimExpr>(dimExpr);
|
|
partialResultShape.push_back(sizes[dim.getPosition()]);
|
|
}
|
|
|
|
Type elType = getElementTypeOrSelf(result.getType());
|
|
Value emptyTensor =
|
|
tensor::EmptyOp::create(b, loc, partialResultShape, elType);
|
|
Value constantOp = arith::ConstantOp::create(b, loc, *identity);
|
|
auto identityTensor =
|
|
linalg::FillOp::create(b, loc, constantOp, emptyTensor);
|
|
inits.push_back(identityTensor.getResult(0));
|
|
}
|
|
|
|
return inits;
|
|
}
|
|
|
|
FailureOr<TilingResult>
|
|
tileToPartialReduction(Operation *op, OpBuilder &b, Location loc,
|
|
ReductionTilingStrategy tilingStrategy,
|
|
ValueRange init, ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes,
|
|
const SetVector<unsigned> &reductionDims,
|
|
ArrayRef<OpFoldResult> splitReductionIvs) const {
|
|
OpBuilder::InsertionGuard guard(b);
|
|
auto linalgOp = cast<LinalgOp>(op);
|
|
|
|
SmallVector<AffineMap> partialReductionMaps =
|
|
getPartialResultAffineMaps(linalgOp, reductionDims);
|
|
|
|
// Step 1. Extend the init maps to include the reduction dimensions, since
// we are converting them to parallel dimensions.
|
|
SmallVector<AffineMap> newInitMaps;
|
|
if (tilingStrategy ==
|
|
ReductionTilingStrategy::PartialReductionOuterReduction) {
|
|
newInitMaps = llvm::to_vector(partialReductionMaps);
|
|
} else {
|
|
newInitMaps = llvm::map_to_vector(
|
|
linalgOp.getDpsInitsMutable(), [&](OpOperand &opOperand) {
|
|
return linalgOp.getMatchingIndexingMap(&opOperand);
|
|
});
|
|
}
|
|
|
|
// Step 2a: Extract a slice of the input operands.
|
|
SmallVector<Value> tiledInputs = makeTiledShapes(
|
|
b, loc, linalgOp, linalgOp.getDpsInputs(), offsets, sizes, {}, true);
|
|
SmallVector<Operation *> generatedSlices = llvm::map_to_vector(
|
|
llvm::make_filter_range(
|
|
tiledInputs, [](Value v) -> bool { return v.getDefiningOp(); }),
|
|
[](Value v) -> Operation * { return v.getDefiningOp(); });
|
|
|
|
// Step 2b: Extract a slice of the init operands.
|
|
SmallVector<Value, 1> tiledInits;
|
|
for (auto [partialReductionMap, valueToTile] :
|
|
llvm::zip_equal(partialReductionMaps, init)) {
|
|
InitSliceInfo sliceInfo = getInitSliceInfo(
|
|
b.getContext(), tilingStrategy, offsets, sizes, reductionDims,
|
|
splitReductionIvs, partialReductionMap);
|
|
auto valueToTileType = cast<RankedTensorType>(valueToTile.getType());
|
|
RankedTensorType sliceResultType = RankedTensorType::get(
|
|
sliceInfo.resultShape, valueToTileType.getElementType(),
|
|
valueToTileType.getEncoding());
|
|
auto sliceOp = tensor::ExtractSliceOp::create(
|
|
b, loc, sliceResultType, valueToTile, sliceInfo.offsets,
|
|
sliceInfo.sizes, sliceInfo.strides);
|
|
tiledInits.push_back(sliceOp.getResult());
|
|
generatedSlices.push_back(sliceOp);
|
|
}
|
|
|
|
// Update the indexing maps.
|
|
SmallVector<AffineMap> newMaps = linalgOp.getIndexingMapsArray();
|
|
for (auto [initOperand, newInitMap] :
|
|
llvm::zip_equal(linalgOp.getDpsInitsMutable(), newInitMaps)) {
|
|
int mapIdx = linalgOp.getIndexingMapIndex(&initOperand);
|
|
newMaps[mapIdx] = newInitMap;
|
|
}
|
|
|
|
// Step 3. Change the reduction dim iterator types.
|
|
SmallVector<utils::IteratorType> newIteratorTypes =
|
|
linalgOp.getIteratorTypesArray();
|
|
if (tilingStrategy ==
|
|
ReductionTilingStrategy::PartialReductionOuterReduction) {
|
|
for (int dim : reductionDims)
|
|
newIteratorTypes[dim] = utils::IteratorType::parallel;
|
|
}
|
|
|
|
// Step 4. Create the new generic op.
|
|
Operation *partialReductionOp;
|
|
auto resultTypes = ValueRange(tiledInits).getTypes();
|
|
if (tilingStrategy ==
|
|
ReductionTilingStrategy::PartialReductionOuterReduction) {
|
|
auto genericOp = GenericOp::create(b, loc, resultTypes, tiledInputs,
|
|
tiledInits, newMaps, newIteratorTypes);
|
|
IRMapping mapping;
|
|
op->getRegion(0).cloneInto(&genericOp.getRegion(),
|
|
genericOp.getRegion().begin(), mapping);
|
|
partialReductionOp = genericOp.getOperation();
|
|
} else {
|
|
SmallVector<Value> operands = std::move(tiledInputs);
|
|
llvm::append_range(operands, tiledInits);
|
|
partialReductionOp = mlir::clone(b, op, resultTypes, operands);
|
|
}
|
|
return TilingResult{
|
|
{partialReductionOp},
|
|
llvm::map_to_vector(partialReductionOp->getResults(),
|
|
[](OpResult r) -> Value { return r; }),
|
|
generatedSlices};
|
|
}
|
|
|
|
FailureOr<MergeResult>
|
|
mergeReductions(Operation *op, OpBuilder &b, Location loc,
|
|
ValueRange partialReduce,
|
|
const SetVector<unsigned> &reductionDims) const {
|
|
auto linalgOp = cast<LinalgOp>(op);
|
|
SmallVector<AffineMap> partialReductionMaps =
|
|
getPartialResultAffineMaps(linalgOp, reductionDims);
|
|
|
|
// Permute the reduction dims as permuted by the partial result map.
|
|
SmallVector<Operation *> mergeOperations;
|
|
SmallVector<Value> replacements;
|
|
for (auto [idx, init, partialResult, partialMap] : llvm::enumerate(
|
|
linalgOp.getDpsInits(), partialReduce, partialReductionMaps)) {
|
|
unsigned initIdx = idx;
|
|
// linalg.reduce's iteration space is the tiled result's iteration space
|
|
// (and not the tiled operation's iteration space). To account for this,
|
|
// permute the reduction dimensions based on the partial result map of the
|
|
// tiled result.
|
|
SmallVector<int64_t> partialReductionDims;
|
|
for (auto [resultNum, dimExpr] :
|
|
llvm::enumerate(partialMap.getResults())) {
|
|
unsigned dim = cast<AffineDimExpr>(dimExpr).getPosition();
|
|
if (llvm::is_contained(reductionDims, dim)) {
|
|
partialReductionDims.push_back(resultNum);
|
|
}
|
|
}
|
|
|
|
auto reduction = linalg::ReduceOp::create(
|
|
b, loc, partialResult, init, partialReductionDims,
|
|
[&linalgOp, &initIdx](OpBuilder &b, Location loc, ValueRange inputs) {
|
|
// Get the combiner op.
|
|
SmallVector<Operation *, 4> combinerOps;
|
|
matchReduction(linalgOp.getRegionOutputArgs(), initIdx,
|
|
combinerOps);
|
|
Operation *clonedReductionOp = b.clone(*combinerOps[0]);
|
|
// Combine the input at idx and output at numInits + idx.
|
|
clonedReductionOp->setOperand(0, inputs[0]);
|
|
clonedReductionOp->setOperand(1, inputs[1]);
|
|
linalg::YieldOp::create(b, loc, clonedReductionOp->getResult(0));
|
|
});
|
|
|
|
mergeOperations.push_back(reduction);
|
|
replacements.push_back(reduction->getResult(0));
|
|
}
|
|
|
|
return MergeResult{mergeOperations, replacements};
|
|
}
|
|
|
|
LogicalResult getPartialResultTilePosition(
|
|
Operation *op, OpBuilder &b, unsigned resultNumber,
|
|
ReductionTilingStrategy tilingStrategy, ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes, const SetVector<unsigned> &reductionDims,
|
|
ArrayRef<OpFoldResult> splitReductionIvs,
|
|
SmallVector<OpFoldResult> &resultOffsets,
|
|
SmallVector<OpFoldResult> &resultSizes) const {
|
|
auto linalgOp = cast<LinalgOp>(op);
|
|
SmallVector<AffineMap> partialReductionMaps =
|
|
getPartialResultAffineMaps(linalgOp, reductionDims);
|
|
InitSliceInfo sliceInfo = getInitSliceInfo(
|
|
b.getContext(), tilingStrategy, offsets, sizes, reductionDims,
|
|
splitReductionIvs, partialReductionMaps[resultNumber]);
|
|
std::swap(resultOffsets, sliceInfo.offsets);
|
|
std::swap(resultSizes, sliceInfo.sizes);
|
|
|
|
return success();
|
|
}
|
|
};
|
|
|
|
template <typename OpTy>
|
|
static SmallVector<Range> getPackUnPackIterationDomain(OpTy op,
|
|
OpBuilder &builder) {
|
|
static_assert(llvm::is_one_of<OpTy, PackOp, UnPackOp>::value,
|
|
"applies to only pack or unpack operations");
|
|
OpBuilder::InsertionGuard g(builder);
|
|
int64_t rank = (std::is_same<OpTy, PackOp>::value) ? op.getSourceRank()
|
|
: op.getDestRank();
|
|
OpFoldResult zero = builder.getIndexAttr(0);
|
|
OpFoldResult one = builder.getIndexAttr(1);
|
|
ReifiedRankedShapedTypeDims resultShape;
|
|
(void)reifyResultShapes(builder, op, resultShape);
|
|
SmallVector<Range> loopBounds(rank);
|
|
for (auto dim : llvm::seq<int64_t>(0, rank)) {
|
|
loopBounds[dim].offset = zero;
|
|
loopBounds[dim].stride = one;
|
|
loopBounds[dim].size = resultShape[0][dim];
|
|
}
|
|
return loopBounds;
|
|
}
|
|
|
|
static void applyPermToRange(SmallVector<OpFoldResult> &offsets,
|
|
SmallVector<OpFoldResult> &sizes,
|
|
ArrayRef<int64_t> permutation) {
|
|
if (permutation.empty())
|
|
return;
|
|
applyPermutationToVector<OpFoldResult>(offsets, permutation);
|
|
applyPermutationToVector<OpFoldResult>(sizes, permutation);
|
|
}
|
|
|
|
struct PackOpTiling
|
|
: public TilingInterface::ExternalModel<PackOpTiling, linalg::PackOp> {
|
|
|
|
SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
|
|
// Note that here we only consider untiled dimensions and outer tiled data
// dimensions; the inner tiled data dimensions are materialized when
// building the body of the operation.
|
|
auto packOp = cast<PackOp>(op);
|
|
SmallVector<utils::IteratorType> iteratorTypes(
|
|
packOp.getSourceRank(), utils::IteratorType::parallel);
|
|
return iteratorTypes;
|
|
}
|
|
|
|
SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const {
|
|
return getPackUnPackIterationDomain<PackOp>(cast<PackOp>(op), b);
|
|
}
|
|
|
|
FailureOr<TilingResult>
|
|
getTiledImplementation(Operation *op, OpBuilder &b,
|
|
ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes) const {
|
|
auto packOp = cast<PackOp>(op);
|
|
Location loc = packOp.getLoc();
|
|
|
|
// The tiling is applied on interchanged dimensions. We have to undo the
|
|
// interchange to map sizes and offsets to the original input.
|
|
int64_t inputRank = packOp.getSourceRank();
|
|
SmallVector<OpFoldResult> origOffsets(offsets);
|
|
SmallVector<OpFoldResult> origSizes(sizes);
|
|
applyPermToRange(origOffsets, origSizes,
|
|
invertPermutationVector(packOp.getOuterDimsPerm()));
|
|
|
|
DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
|
|
packOp.getDimAndTileMapping();
|
|
SmallVector<OpFoldResult> srcDimValues =
|
|
tensor::getMixedSizes(b, loc, packOp.getSource());
|
|
SmallVector<OpFoldResult> inputIndices, inputSizes;
|
|
for (auto dim : llvm::seq<int64_t>(0, inputRank)) {
|
|
using AV = affine::AffineValueExpr;
|
|
affine::AffineBuilder ab(b, loc);
|
|
AffineExpr dim0, dim1, sym;
|
|
bindDims(b.getContext(), dim0, dim1);
|
|
bindSymbols(b.getContext(), sym);
|
|
if (dimAndTileMapping.count(dim)) {
|
|
// If the data dimension is tiled, the i-th index is the product of
|
|
// offset_i and tile_i, and the i-th size is the product of sizes_i and
|
|
// tile_i.
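// E.g. (illustrative), with offset_i = 2, sizes_i = 3, and tile_i = 8, the
// source slice starts at index 16 and spans 24 elements.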
|
|
auto avOffset = AV(dim0).bind(origOffsets[dim]);
|
|
auto avSize = AV(dim0).bind(origSizes[dim]);
|
|
auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]);
|
|
inputIndices.push_back(ab.mul(avOffset, avTileSize));
|
|
inputSizes.push_back(ab.mul(avSize, avTileSize));
|
|
} else {
|
|
inputIndices.push_back(origOffsets[dim]);
|
|
inputSizes.push_back(origSizes[dim]);
|
|
}
|
|
|
|
// Limit the size of the input operand for incomplete tiles.
|
|
if (packOp.getPaddingValue()) {
|
|
OpFoldResult dimSize = srcDimValues[dim];
|
|
auto avDimSize = AV(dim0).bind(dimSize);
|
|
auto avInputIdx = AV(dim1).bind(inputIndices.back());
|
|
inputSizes.back() =
|
|
ab.min({inputSizes.back(), ab.sub(avDimSize, avInputIdx)});
|
|
}
|
|
}
|
|
|
|
auto oneAttr = b.getI64IntegerAttr(1);
|
|
SmallVector<OpFoldResult> strides(inputRank, oneAttr);
|
|
|
|
SmallVector<Value> tiledOperands;
|
|
auto sourceSlice = tensor::ExtractSliceOp::create(
|
|
b, loc, packOp.getSource(), inputIndices, inputSizes, strides);
|
|
tiledOperands.push_back(sourceSlice);
|
|
|
|
SmallVector<OpFoldResult> outputOffsets, outputSizes;
|
|
if (failed(getResultTilePosition(op, b, 0, offsets, sizes, outputOffsets,
|
|
outputSizes)))
|
|
return {};
|
|
|
|
strides.append(packOp.getDestRank() - inputRank, oneAttr);
|
|
auto outSlice = tensor::ExtractSliceOp::create(
|
|
b, loc, packOp.getDest(), outputOffsets, outputSizes, strides);
|
|
tiledOperands.push_back(outSlice);
|
|
|
|
if (auto val = packOp.getPaddingValue())
|
|
tiledOperands.push_back(val);
|
|
for (auto tile : packOp.getInnerTiles())
|
|
tiledOperands.push_back(tile);
|
|
|
|
Operation *tiledPackOp = PackOp::create(
|
|
b, loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs());
|
|
|
|
return TilingResult{
|
|
{tiledPackOp},
|
|
SmallVector<Value>(tiledPackOp->getResults()),
|
|
llvm::to_vector(ArrayRef<Operation *>{sourceSlice, outSlice})};
|
|
}
|
|
|
|
LogicalResult
|
|
getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber,
|
|
ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes,
|
|
SmallVector<OpFoldResult> &resultOffsets,
|
|
SmallVector<OpFoldResult> &resultSizes) const {
|
|
// The iteration domain is over outer dimensions of packed layout. In this
|
|
// context, the outer dimensions of `resultOffsets` are `offsets`. The
|
|
// inner dimensions of `resultOffsets` are zeros because tiling is not
|
|
// applied to them.
|
|
auto packOp = cast<PackOp>(op);
|
|
int64_t inputRank = packOp.getSourceRank();
|
|
int64_t outputRank = packOp.getDestRank();
|
|
auto zeroAttr = b.getI64IntegerAttr(0);
|
|
resultOffsets.assign(offsets.begin(), offsets.end());
|
|
resultOffsets.append(outputRank - inputRank, zeroAttr);
|
|
|
|
ReifiedRankedShapedTypeDims outputShape;
|
|
(void)reifyResultShapes(b, packOp, outputShape);
|
|
resultSizes.assign(sizes.begin(), sizes.end());
|
|
for (auto dataTileDim : llvm::seq<unsigned>(inputRank, outputRank))
|
|
resultSizes.push_back(outputShape[0][dataTileDim]);
|
|
|
|
return success();
|
|
}
|
|
|
|
FailureOr<TilingResult>
|
|
generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber,
|
|
ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes) const {
|
|
auto packOp = cast<PackOp>(op);
|
|
int64_t numTiles = packOp.getInnerDimsPos().size();
|
|
|
|
// tensor.pack op is fusible (as a producer) only if full inner tiles are
|
|
// iterated or inner dims are not tiled. Otherwise, it will generate a
|
|
// sequence of non-trivial ops (for partial tiles).
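// E.g. (illustrative), with inner_tiles = [8], an inner offset of 0 and an
// inner size of 8 keep the producer fusion valid; any non-zero inner offset
// or partial inner size bails out below.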
|
|
for (auto offset : offsets.take_back(numTiles))
|
|
if (!isZeroInteger(offset))
|
|
return failure();
|
|
|
|
for (auto iter :
|
|
llvm::zip_equal(packOp.getMixedTiles(), sizes.take_back(numTiles)))
|
|
if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter)))
|
|
return failure();
|
|
|
|
FailureOr<TilingResult> tilingResult = getTiledImplementation(
|
|
op, b, offsets.drop_back(numTiles), sizes.drop_back(numTiles));
|
|
if (failed(tilingResult))
|
|
return failure();
|
|
return tilingResult.value();
|
|
}
|
|
|
|
/// Method to return the position of iteration domain tile computed by the
|
|
/// tiled operation. In current `tensor.pack` context, the `resultOffsets` and
|
|
/// `resultSizes` only cover outer dimensions.
|
|
LogicalResult getIterationDomainTileFromOperandTiles(
|
|
Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers,
|
|
ArrayRef<SmallVector<OpFoldResult>> allOffsets,
|
|
ArrayRef<SmallVector<OpFoldResult>> allSizes,
|
|
SmallVectorImpl<OpFoldResult> &resultOffsets,
|
|
SmallVectorImpl<OpFoldResult> &resultSizes) const {
|
|
if (operandNumbers.size() != 1 || operandNumbers[0] != 0) {
|
|
LLVM_DEBUG(
|
|
{ llvm::dbgs() << "unsupported operands for consumer fusion"; });
|
|
return failure();
|
|
}
|
|
|
|
ArrayRef<OpFoldResult> offsets(allOffsets[0]);
|
|
ArrayRef<OpFoldResult> sizes(allSizes[0]);
|
|
auto packOp = cast<PackOp>(op);
|
|
Location loc = packOp.getLoc();
|
|
SmallVector<OpFoldResult> outerDimOffsets, outerDimSizes;
|
|
DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
|
|
packOp.getDimAndTileMapping();
|
|
SmallVector<int64_t> outerShapeWithoutTranspose(
|
|
packOp.getDestType().getShape().take_front(packOp.getSourceRank()));
|
|
if (!packOp.getOuterDimsPerm().empty()) {
|
|
applyPermutationToVector(
|
|
outerShapeWithoutTranspose,
|
|
invertPermutationVector(packOp.getOuterDimsPerm()));
|
|
}
|
|
for (auto dim : llvm::seq<int64_t>(packOp.getSourceRank())) {
|
|
if (dimAndTileMapping.count(dim)) {
|
|
FailureOr<int64_t> cstTileSize =
|
|
ValueBoundsConstraintSet::computeConstantBound(
|
|
presburger::BoundType::UB, sizes[dim],
|
|
/*stopCondition=*/nullptr, /*closedUB=*/true);
|
|
std::optional<int64_t> cstInnerSize =
|
|
getConstantIntValue(dimAndTileMapping[dim]);
|
|
|
|
// If a dimension is not tiled, it is always valid to fuse the pack op,
// even if the op has padding semantics, because it always generates a
// full slice along the dimension. The tile sizes are for the unpacked
// domain, i.e., `srcDimSize`, so `tileSize < srcDimSize` means that the
// dimension is tiled.
// TODO: The dimension could be untiled if `srcDimSize` is dynamic; it is
// hard to determine whether a dimension is tiled or not in that case.
|
|
int64_t srcDimSize = packOp.getSourceType().getDimSize(dim);
|
|
int64_t destDimSize = outerShapeWithoutTranspose[dim];
|
|
bool isTiled = failed(cstTileSize) ||
|
|
ShapedType::isDynamic(srcDimSize) ||
|
|
cstTileSize.value() < srcDimSize;
|
|
if (!isTiled) {
|
|
outerDimOffsets.push_back(offsets[dim]);
|
|
if (ShapedType::isStatic(destDimSize)) {
|
|
outerDimSizes.push_back(b.getIndexAttr(destDimSize));
|
|
} else {
|
|
outerDimSizes.push_back(
|
|
b.createOrFold<tensor::DimOp>(loc, packOp.getDest(), dim));
|
|
}
|
|
continue;
|
|
}
|
|
|
|
// Currently, fusing `packOp` as a consumer only supports the perfect tiling
// scenario, because even without padding semantics the `packOp` may also
// yield incomplete tiles. E.g., tensor<30xf32> -> tensor<5x6xf32>, where the
// `tileSize` from the operand of `packOp` is 5, which is not evenly divided
// by the `innerTile` (=6) of `packOp`. As a result:
// 1. the first slice is extracted from (0) to (4) and inserted into
//    (0,0)~(0,4) in the first row.
// 2. the second slice is extracted from (5) to (9) and SHOULD BE inserted
//    into two rows with different lengths: the first row at (0,5) and the
//    second row at (1,0)~(1,3). It is hard to coordinate them, so the
//    constraint below bypasses such cases for now. In other words, tiling
//    with a consumer is only supported when the tile size for the producer
//    is a multiple of the inner tile size for the packed dimensions.
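// E.g. (illustrative), with a producer tile size of 15 and an inner tile of
// 5, the outer offset below is offsets[dim] floordiv 5 and the outer size
// is ceil(15 / 5) = 3.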
|
|
if ((failed(cstTileSize) || !cstInnerSize ||
|
|
*cstTileSize % *cstInnerSize != 0))
|
|
return failure();
|
|
|
|
using AV = affine::AffineValueExpr;
|
|
affine::AffineBuilder ab(b, loc);
|
|
AffineExpr dim0, sym;
|
|
bindDims(b.getContext(), dim0);
|
|
bindSymbols(b.getContext(), sym);
|
|
auto avOffset = AV(dim0).bind(offsets[dim]);
|
|
auto avSize = AV(dim0).bind(sizes[dim]);
|
|
auto avTileSize = AV(sym).bind(dimAndTileMapping[dim]);
|
|
outerDimOffsets.push_back(ab.floor(avOffset, avTileSize));
|
|
outerDimSizes.push_back(ab.ceil(avSize, avTileSize));
|
|
} else {
|
|
outerDimOffsets.push_back(offsets[dim]);
|
|
outerDimSizes.push_back(sizes[dim]);
|
|
}
|
|
}
|
|
applyPermToRange(outerDimOffsets, outerDimSizes, packOp.getOuterDimsPerm());
|
|
resultOffsets = outerDimOffsets;
|
|
resultSizes = outerDimSizes;
|
|
return success();
|
|
}
|
|
|
|
/// Method to return the tiled implementation of tensor.pack as a consumer.
|
|
FailureOr<TilingResult> getTiledImplementationFromOperandTiles(
|
|
Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers,
|
|
ArrayRef<SmallVector<OpFoldResult>> allOffsets,
|
|
ArrayRef<SmallVector<OpFoldResult>> allSizes) const {
|
|
if (operandNumbers.size() != 1 || operandNumbers[0] != 0) {
|
|
LLVM_DEBUG(
|
|
{ llvm::dbgs() << "unhandled operands for consumer fusion"; });
|
|
return failure();
|
|
}
|
|
|
|
ArrayRef<OpFoldResult> offsets(allOffsets[0]);
|
|
ArrayRef<OpFoldResult> sizes(allSizes[0]);
|
|
|
|
auto packOp = cast<PackOp>(op);
|
|
Location loc = packOp.getLoc();
|
|
|
|
int64_t inputRank = packOp.getSourceRank();
|
|
auto oneAttr = b.getI64IntegerAttr(1);
|
|
SmallVector<OpFoldResult> strides(inputRank, oneAttr);
|
|
|
|
SmallVector<Value> tiledOperands;
|
|
auto sourceSlice = tensor::ExtractSliceOp::create(
|
|
b, loc, packOp.getSource(), offsets, sizes, strides);
|
|
tiledOperands.push_back(sourceSlice);
|
|
|
|
SmallVector<OpFoldResult> outerDimOffsets, outerDimSizes;
|
|
if (failed(getIterationDomainTileFromOperandTiles(
|
|
op, b, operandNumbers, allOffsets, allSizes, outerDimOffsets,
|
|
outerDimSizes)))
|
|
return failure();
|
|
|
|
SmallVector<OpFoldResult> outputOffsets, outputSizes;
|
|
if (failed(getResultTilePosition(op, b, 0, outerDimOffsets, outerDimSizes,
|
|
outputOffsets, outputSizes)))
|
|
return failure();
|
|
|
|
strides.append(packOp.getDestRank() - inputRank, oneAttr);
|
|
auto outSlice = tensor::ExtractSliceOp::create(
|
|
b, loc, packOp.getDest(), outputOffsets, outputSizes, strides);
|
|
tiledOperands.push_back(outSlice);
|
|
|
|
if (auto val = packOp.getPaddingValue())
|
|
tiledOperands.push_back(val);
|
|
for (auto tile : packOp.getInnerTiles())
|
|
tiledOperands.push_back(tile);
|
|
|
|
Operation *tiledPackOp = PackOp::create(
|
|
b, loc, TypeRange{outSlice.getType()}, tiledOperands, op->getAttrs());
|
|
|
|
return TilingResult{
|
|
{tiledPackOp},
|
|
SmallVector<Value>(tiledPackOp->getResults()),
|
|
llvm::to_vector(ArrayRef<Operation *>{sourceSlice, outSlice})};
|
|
}
|
|
};
|
|
|
|
struct UnpackTileDimInfo {
|
|
bool isAlignedToInnerTileSize;
|
|
OpFoldResult sourceOffset;
|
|
OpFoldResult sourceSize;
|
|
OpFoldResult resultOffset;
|
|
OpFoldResult destExpandedSize;
|
|
};
|
|
|
|
/// Returns the needed information for tiling unpack op on `tileDim` with given
|
|
/// `tileOffset` and `tileSize`. For more details, see the comment of the
|
|
/// `getTiledImplementation`.
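/// For example (illustrative), with an inner tile size of 8, tileOffset = 15,
/// and tileSize = 15 (the Nn_to_N case described in `getTiledImplementation`),
/// the source offset is 15 floordiv 8 = 1, the source size is 3 (rows 1..3),
/// and the result offset within the expanded destination is 15 mod 8 = 7.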
|
|
static UnpackTileDimInfo getUnpackTileDimInfo(OpBuilder &b, UnPackOp unpackOp,
|
|
int64_t tileDim,
|
|
OpFoldResult tileOffset,
|
|
OpFoldResult tileSize) {
|
|
UnpackTileDimInfo info;
|
|
Attribute zeroAttr = b.getIndexAttr(0);
|
|
Attribute oneAttr = b.getIndexAttr(1);
|
|
DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
|
|
unpackOp.getDimAndTileMapping();
|
|
// The dimension is not one of the packed data dimensions.
|
|
if (!dimAndTileMapping.count(tileDim)) {
|
|
info.isAlignedToInnerTileSize = true;
|
|
info.sourceOffset = tileOffset;
|
|
info.sourceSize = tileSize;
|
|
info.resultOffset = zeroAttr;
|
|
info.destExpandedSize = tileSize;
|
|
return info;
|
|
}
|
|
|
|
Location loc = unpackOp.getLoc();
|
|
using AV = affine::AffineValueExpr;
|
|
affine::AffineBuilder ab(b, loc);
|
|
AffineExpr dim0, dim1, sym0;
|
|
bindDims(b.getContext(), dim0, dim1);
|
|
bindSymbols(b.getContext(), sym0);
|
|
|
|
OpFoldResult innerTileSize = dimAndTileMapping[tileDim];
|
|
|
|
info.isAlignedToInnerTileSize = false;
|
|
FailureOr<int64_t> cstSize = ValueBoundsConstraintSet::computeConstantBound(
|
|
presburger::BoundType::UB, tileSize,
|
|
/*stopCondition=*/nullptr, /*closedUB=*/true);
|
|
std::optional<int64_t> cstInnerSize = getConstantIntValue(innerTileSize);
|
|
if (!failed(cstSize) && cstInnerSize) {
|
|
if (*cstSize % *cstInnerSize == 0)
|
|
info.isAlignedToInnerTileSize = true;
|
|
|
|
// If the tiling size equals the inner tile size, the outer dims are
// always 1.
|
|
if (*cstInnerSize == *cstSize) {
|
|
auto lhs = AV(dim0).bind(tileOffset);
|
|
auto rhs = AV(dim1).bind(innerTileSize);
|
|
info.sourceOffset = ab.floor(lhs, rhs);
|
|
info.sourceSize = oneAttr;
|
|
info.resultOffset = zeroAttr;
|
|
info.destExpandedSize = tileSize;
|
|
return info;
|
|
}
|
|
}
|
|
|
|
if (info.isAlignedToInnerTileSize) {
|
|
info.sourceOffset =
|
|
ab.floor(AV(dim0).bind(tileOffset), AV(dim1).bind(innerTileSize));
|
|
info.resultOffset = zeroAttr;
|
|
info.destExpandedSize = tileSize;
|
|
|
|
// The ceilDiv is needed here because there could be an incomplete tile
// even in perfect tiling cases. E.g.,
//   %0 = unpack tensor<33x2xf32> into tensor<64xf32>
// If the tiling size is 32, there will be 3 tiles. Two of them have
// size=32; one of them has size=2. The size is represented using an
// affine_min op; we need ceilDiv.
|
|
info.sourceSize =
|
|
ab.ceil(AV(dim0).bind(tileSize), AV(dim1).bind(innerTileSize));
|
|
return info;
|
|
}
|
|
|
|
affine::DivModValue firstCoord = affine::getDivMod(
|
|
b, loc, getValueOrCreateConstantIndexOp(b, loc, tileOffset),
|
|
getValueOrCreateConstantIndexOp(b, loc, innerTileSize));
|
|
OpFoldResult tileExclusiveBound =
|
|
ab.add(AV(dim0).bind(tileOffset), AV(dim1).bind(tileSize));
|
|
affine::DivModValue lastCoord = affine::getDivMod(
|
|
b, loc,
|
|
getValueOrCreateConstantIndexOp(
|
|
b, loc,
|
|
ab.sub(AV(dim0).bind(tileExclusiveBound), AV(dim1).bind(oneAttr))),
|
|
getValueOrCreateConstantIndexOp(b, loc, innerTileSize));
|
|
|
|
OpFoldResult lengthMinusOne = ab.sub(AV(dim0).bind(lastCoord.quotient),
|
|
AV(dim1).bind(firstCoord.quotient));
|
|
info.sourceSize =
|
|
ab.add(AV(dim0).bind(lengthMinusOne), AV(dim1).bind(oneAttr));
|
|
info.sourceOffset = firstCoord.quotient;
|
|
info.resultOffset = firstCoord.remainder;
|
|
// Do not create affine ops for the expanded size because the affine op
// would be too complicated, which would trigger an issue in affine op
// simplification.
|
|
info.destExpandedSize = b.createOrFold<arith::MulIOp>(
|
|
loc, getValueOrCreateConstantIndexOp(b, loc, info.sourceSize),
|
|
getValueOrCreateConstantIndexOp(b, loc, innerTileSize));
|
|
return info;
|
|
}
|
|
|
|
struct UnPackOpTiling
|
|
: public TilingInterface::ExternalModel<UnPackOpTiling, linalg::UnPackOp> {
|
|
|
|
SmallVector<utils::IteratorType> getLoopIteratorTypes(Operation *op) const {
|
|
auto unpackOp = cast<UnPackOp>(op);
|
|
SmallVector<utils::IteratorType> iteratorTypes(
|
|
unpackOp.getDestRank(), utils::IteratorType::parallel);
|
|
return iteratorTypes;
|
|
}
|
|
|
|
SmallVector<Range> getIterationDomain(Operation *op, OpBuilder &b) const {
|
|
return getPackUnPackIterationDomain<UnPackOp>(cast<UnPackOp>(op), b);
|
|
}
|
|
|
|
/// There are two cases in tiling unpack ops. If the tiling size is aligned to
/// the inner tile size, the corresponding tiles of the source are all
/// complete. Otherwise, there are incomplete tiles, and we need to expand the
/// slice of the source to get complete tiles. The tiled unpack op then
/// unpacks more data from the source, so an extract_slice op is needed to
/// shift and truncate the output.
/// Take Nn_to_N as an example. Say that N=32, n=8, and tiling_size=15. The
/// coordinates of the second tile (i.e., result[15..31]) are
/// [(1, 7), (2, 0), (2, 1) ... (3, 6), (3, 7)]. The first row and the last
/// row are incomplete tiles. To represent the unpack op, we have to complete
/// the rows. I.e., the input coordinates would start with (1, 0) and end with
/// (3, 7). In this context, the tiled unpack produces (3 * n) elements
/// because there are 3 rows in total. Followed by a tensor.extract_slice op,
/// we can get the actual result.
|
|
FailureOr<TilingResult>
|
|
getTiledImplementation(Operation *op, OpBuilder &b,
|
|
ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes) const {
|
|
auto unpackOp = cast<UnPackOp>(op);
|
|
int64_t srcRank = unpackOp.getSourceRank();
|
|
int64_t destRank = unpackOp.getDestRank();
|
|
int64_t numInnerTiles = srcRank - destRank;
|
|
Location loc = unpackOp.getLoc();
|
|
|
|
// The perfect tiling case indicates that the tiling sizes are multiples of
// the inner_tile_size. In this context, no extra data is needed when
// representing the tiled unpack op.
|
|
bool isPerfectTilingCase = true;
|
|
Attribute oneAttr = b.getIndexAttr(1);
|
|
SmallVector<OpFoldResult> sliceSrcStrides(destRank, oneAttr);
|
|
SmallVector<OpFoldResult> sliceSrcIndices, sliceSrcSizes;
|
|
SmallVector<OpFoldResult> destExpandedSizes, resultOffsetsFromDest;
|
|
for (auto dim : llvm::seq<int64_t>(0, destRank)) {
|
|
UnpackTileDimInfo info =
|
|
getUnpackTileDimInfo(b, unpackOp, dim, offsets[dim], sizes[dim]);
|
|
if (!info.isAlignedToInnerTileSize)
|
|
isPerfectTilingCase = false;
|
|
sliceSrcIndices.push_back(info.sourceOffset);
|
|
sliceSrcSizes.push_back(info.sourceSize);
|
|
destExpandedSizes.push_back(info.destExpandedSize);
|
|
resultOffsetsFromDest.push_back(info.resultOffset);
|
|
}
|
|
|
|
// The tiling is applied on destination dimensions. We have to apply the
|
|
// interchange on source dimensions if outer_dims_perm is set.
|
|
applyPermToRange(sliceSrcIndices, sliceSrcSizes,
|
|
unpackOp.getOuterDimsPerm());
|
|
Attribute zeroAttr = b.getIndexAttr(0);
|
|
sliceSrcIndices.append(numInnerTiles, zeroAttr);
|
|
sliceSrcSizes.append(unpackOp.getMixedTiles());
|
|
sliceSrcStrides.append(numInnerTiles, oneAttr);
|
|
SmallVector<Operation *> generatedSlices;
|
|
tensor::ExtractSliceOp sliceSource = tensor::ExtractSliceOp::create(
|
|
b, loc, unpackOp.getSource(), sliceSrcIndices, sliceSrcSizes,
|
|
sliceSrcStrides);
|
|
generatedSlices.push_back(sliceSource);
|
|
|
|
SmallVector<OpFoldResult> destStrides(destRank, oneAttr);
|
|
Value sliceDest;
|
|
if (isPerfectTilingCase) {
|
|
auto destSliceOp = tensor::ExtractSliceOp::create(
|
|
b, loc, unpackOp.getDest(), offsets, sizes, destStrides);
|
|
sliceDest = destSliceOp;
|
|
generatedSlices.push_back(destSliceOp);
|
|
} else {
|
|
sliceDest = tensor::EmptyOp::create(
|
|
b, loc, destExpandedSizes, unpackOp.getDestType().getElementType());
|
|
}
|
|
|
|
SmallVector<Value> tiledOperands = {sliceSource.getResult(), sliceDest};
|
|
for (auto tile : unpackOp.getInnerTiles())
|
|
tiledOperands.push_back(tile);
|
|
|
|
Operation *tiledUnpackOp = UnPackOp::create(
|
|
b, loc, TypeRange{sliceDest.getType()}, tiledOperands, op->getAttrs());
|
|
|
|
if (isPerfectTilingCase)
|
|
return TilingResult{{tiledUnpackOp},
|
|
SmallVector<Value>(tiledUnpackOp->getResults()),
|
|
generatedSlices};
|
|
|
|
auto extractSlice = tensor::ExtractSliceOp::create(
|
|
b, loc, tiledUnpackOp->getResult(0), resultOffsetsFromDest, sizes,
|
|
destStrides);
|
|
return TilingResult{
|
|
{tiledUnpackOp}, {extractSlice.getResult()}, generatedSlices};
|
|
}
|
|
|
|
LogicalResult
|
|
getResultTilePosition(Operation *op, OpBuilder &b, unsigned resultNumber,
|
|
ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes,
|
|
SmallVector<OpFoldResult> &resultOffsets,
|
|
SmallVector<OpFoldResult> &resultSizes) const {
|
|
resultOffsets = llvm::to_vector(offsets);
|
|
resultSizes = llvm::to_vector(sizes);
|
|
return success();
|
|
}
|
|
|
|
FailureOr<TilingResult>
|
|
generateResultTileValue(Operation *op, OpBuilder &b, unsigned resultNumber,
|
|
ArrayRef<OpFoldResult> offsets,
|
|
ArrayRef<OpFoldResult> sizes) const {
|
|
FailureOr<TilingResult> tilingResult =
|
|
getTiledImplementation(op, b, offsets, sizes);
|
|
if (failed(tilingResult))
|
|
return failure();
|
|
return tilingResult.value();
|
|
}
|
|
|
|
/// Method to return the position of iteration domain tile computed by the
|
|
/// tiled operation.
|
|
LogicalResult getIterationDomainTileFromOperandTiles(
|
|
Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers,
|
|
ArrayRef<SmallVector<OpFoldResult>> allOffsets,
|
|
ArrayRef<SmallVector<OpFoldResult>> allSizes,
|
|
SmallVectorImpl<OpFoldResult> &resultOffsets,
|
|
SmallVectorImpl<OpFoldResult> &resultSizes) const {
|
|
if (operandNumbers.size() != 1) {
|
|
LLVM_DEBUG({ llvm::dbgs() << "unable to handle multiple operands"; });
|
|
return failure();
|
|
}
|
|
auto unPackOp = cast<UnPackOp>(op);
|
|
unsigned operandNumber = operandNumbers[0];
|
|
ArrayRef<OpFoldResult> offsets(allOffsets[0]);
|
|
ArrayRef<OpFoldResult> sizes(allSizes[0]);
|
|
|
|
// If the operand tile is the dest, then no adjustment is needed.
|
|
if (operandNumber == unPackOp.getDestMutable().getOperandNumber()) {
|
|
resultOffsets = llvm::to_vector(offsets);
|
|
resultSizes = llvm::to_vector(sizes);
|
|
return success();
|
|
}
|
|
Location loc = unPackOp.getLoc();
|
|
|
|
int64_t numTiles = unPackOp.getInnerDimsPos().size();
|
|
auto destOffsets = offsets.drop_back(numTiles);
|
|
auto destSizes = sizes.drop_back(numTiles);
|
|
// The tiling is applied on interchanged dimensions. We have to undo the
|
|
// interchange to map sizes and offsets to the original input.
|
|
int64_t outputRank = unPackOp.getDestRank();
|
|
ReifiedRankedShapedTypeDims reifiedReturnShapes;
|
|
if (failed(reifyResultShapes(b, unPackOp, reifiedReturnShapes)))
|
|
return failure();
|
|
SmallVector<OpFoldResult> outputMixedSizes = reifiedReturnShapes.front();
|
|
SmallVector<OpFoldResult> origOffsets(destOffsets);
|
|
SmallVector<OpFoldResult> origSizes(destSizes);
|
|
applyPermToRange(origOffsets, origSizes,
|
|
invertPermutationVector(unPackOp.getOuterDimsPerm()));
|
|
|
|
DenseMap<int64_t, OpFoldResult> dimAndTileMapping =
|
|
unPackOp.getDimAndTileMapping();
|
|
|
|
for (auto dim : llvm::seq<int64_t>(0, outputRank)) {
|
|
using AV = affine::AffineValueExpr;
|
|
affine::AffineBuilder ab(b, loc);
|
|
AffineExpr dim0, dim1, sym0;
|
|
bindDims(b.getContext(), dim0, dim1);
|
|
bindSymbols(b.getContext(), sym0);
|
|
if (dimAndTileMapping.count(dim)) {
|
|
// If the data dimension is tiled, the i-th index is the product of
|
|
// offset_i and tile_i, and the i-th size is the product of sizes_i and
|
|
// tile_i. The sizes must be clamped to the sizes of the unpack result.
|
|
auto avOffset = AV(dim0).bind(origOffsets[dim]);
|
|
auto avSize = AV(dim0).bind(origSizes[dim]);
|
|
auto avTileSize = AV(sym0).bind(dimAndTileMapping[dim]);
|
|
auto avResultSize = AV(dim0).bind(outputMixedSizes[dim]);
|
|
resultOffsets.push_back(ab.mul(avOffset, avTileSize));
|
|
auto avResultOffset = AV(dim1).bind(resultOffsets.back());
|
|
resultSizes.push_back(ab.min({ab.mul(avSize, avTileSize),
|
|
ab.sub(avResultSize, avResultOffset)}));
|
|
} else {
|
|
resultOffsets.push_back(origOffsets[dim]);
|
|
resultSizes.push_back(origSizes[dim]);
|
|
}
|
|
}
|
|
return success();
|
|
}
|
|
|
|
/// Method to return the tiled implementation of tensor.unpack as a consumer.
|
|
FailureOr<TilingResult> getTiledImplementationFromOperandTiles(
|
|
Operation *op, OpBuilder &b, ArrayRef<unsigned> operandNumbers,
|
|
ArrayRef<SmallVector<OpFoldResult>> allOffsets,
|
|
ArrayRef<SmallVector<OpFoldResult>> allSizes) const {
|
|
if (operandNumbers.size() != 1 || operandNumbers[0] != 0) {
|
|
LLVM_DEBUG({ llvm::dbgs() << "unhandled operands for consumer fusion"; });
|
|
return failure();
|
|
}
|
|
auto unPackOp = cast<UnPackOp>(op);
|
|
ArrayRef<OpFoldResult> offsets(allOffsets[0]);
|
|
ArrayRef<OpFoldResult> sizes(allSizes[0]);
|
|
|
|
// tensor.unpack op is fusible (as a consumer) only if inner dims are not
|
|
// tiled.
|
|
int64_t numTiles = unPackOp.getInnerDimsPos().size();
|
|
for (auto iter :
|
|
llvm::zip_equal(unPackOp.getMixedTiles(), sizes.take_back(numTiles))) {
|
|
if (!isEqualConstantIntOrValue(std::get<0>(iter), std::get<1>(iter)))
|
|
return failure();
|
|
}
|
|
|
|
Location loc = unPackOp.getLoc();
|
|
|
|
// Fetch offset/size for creating the slice of the dest operand of
|
|
// unpack op.
|
|
SmallVector<OpFoldResult> outputOffsets, outputSizes;
|
|
if (failed(getIterationDomainTileFromOperandTiles(
|
|
op, b, operandNumbers, allOffsets, allSizes, outputOffsets,
|
|
outputSizes)))
|
|
return failure();
|
|
|
|
auto oneAttr = b.getI64IntegerAttr(1);
|
|
int64_t outputRank = unPackOp.getDestRank();
|
|
SmallVector<OpFoldResult> strides(outputRank, oneAttr);
|
|
|
|
SmallVector<Value> tiledOperands;
|
|
// Create slice of the dest operand.
|
|
auto extractDestSlice = tensor::ExtractSliceOp::create(
|
|
b, loc, unPackOp.getDest(), outputOffsets, outputSizes, strides);
|
|
tiledOperands.push_back(extractDestSlice);
|
|
|
|
strides.append(unPackOp.getSourceRank() - outputRank, oneAttr);
|
|
// Create slice of the source operand.
|
|
auto extractSourceSlice = tensor::ExtractSliceOp::create(
|
|
b, loc, unPackOp.getSource(), offsets, sizes, strides);
|
|
tiledOperands.insert(tiledOperands.begin(), extractSourceSlice);
|
|
for (auto tile : unPackOp.getInnerTiles())
|
|
tiledOperands.push_back(tile);
|
|
|
|
// Create tiled unpack op.
|
|
Operation *tiledUnPackOp =
|
|
UnPackOp::create(b, loc, TypeRange{extractDestSlice.getType()},
|
|
tiledOperands, op->getAttrs());
|
|
|
|
return TilingResult{{tiledUnPackOp},
|
|
SmallVector<Value>(tiledUnPackOp->getResults()),
|
|
llvm::to_vector(ArrayRef<Operation *>{
|
|
extractSourceSlice, extractDestSlice})};
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
template <typename OpType>
|
|
static void registerOne(MLIRContext *ctx) {
|
|
OpType::template attachInterface<LinalgOpTilingInterface<OpType>>(*ctx);
|
|
OpType::template attachInterface<LinalgOpPartialReductionInterface<OpType>>(
|
|
*ctx);
|
|
}
|
|
|
|
/// Variadic helper function.
|
|
template <typename... OpTypes>
|
|
static void registerAll(MLIRContext *ctx) {
|
|
(registerOne<OpTypes>(ctx), ...);
|
|
}
|
|
|
|
#define GET_OP_LIST
|
|
|
|
void mlir::linalg::registerTilingInterfaceExternalModels(
|
|
DialectRegistry ®istry) {
|
|
registry.addExtension(+[](MLIRContext *ctx, linalg::LinalgDialect *dialect) {
|
|
registerOne<linalg::GenericOp>(ctx);
|
|
linalg::PackOp::attachInterface<PackOpTiling>(*ctx);
|
|
linalg::UnPackOp::attachInterface<UnPackOpTiling>(*ctx);
|
|
registerAll<
|
|
#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
|
|
>(ctx);
|
|
});
|
|
}
|
|
|
|
void mlir::linalg::registerTilingInterfaceExternalModelsForPackUnPackOps(
|
|
DialectRegistry ®istry) {
|
|
registry.addExtension(+[](MLIRContext *ctx, LinalgDialect *dialect) {
|
|
linalg::PackOp::attachInterface<PackOpTiling>(*ctx);
|
|
linalg::UnPackOp::attachInterface<UnPackOpTiling>(*ctx);
|
|
});
|
|
}
|