This change adds a new op `alloc_tensor` to the bufferization dialect. During bufferization, this op is always lowered to a buffer allocation (unless it is "eliminated" by a pre-processing pass). It is useful to have such an op in tensor land, because it allows users to model tensor SSA use-def chains (which drive bufferization decisions) and because tensor SSA use-def chains can be analyzed by One-Shot Bufferize, while memref values cannot. This change also replaces all uses of linalg.init_tensor in bufferization-related code with bufferization.alloc_tensor. linalg.init_tensor and bufferization.alloc_tensor are similar, but the purpose of the former one is just to carry a shape. It does not indicate a memory allocation. linalg.init_tensor is not suitable for modelling SSA use-def chains for bufferization purposes, because linalg.init_tensor is marked as not having side effects (in contrast to alloc_tensor). As such, it is legal to move linalg.init_tensor ops around/CSE them/etc. This is not desirable for alloc_tensor; it represents an explicit buffer allocation while still in tensor land and such allocations should not suddenly disappear or get moved around when running the canonicalizer/CSE/etc. BEGIN_PUBLIC No public commit message needed for presubmit. END_PUBLIC Differential Revision: https://reviews.llvm.org/D126003
244 lines
9.8 KiB
C++
244 lines
9.8 KiB
C++
//===- BufferizableOpInterfaceImpl.cpp - Impl. of BufferizableOpInterface -===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "mlir/Dialect/Linalg/Transforms/BufferizableOpInterfaceImpl.h"
|
|
#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
|
|
#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
|
|
#include "mlir/Dialect/Linalg/IR/Linalg.h"
|
|
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
|
#include "mlir/IR/Dialect.h"
|
|
#include "mlir/IR/Operation.h"
|
|
|
|
using namespace mlir;
|
|
using namespace linalg;
|
|
using namespace mlir::bufferization;
|
|
|
|
namespace {
|
|
|
|
// TODO: Ops in the linalg dialect can directly implement this interface.
|
|
|
|
/// Generic conversion for any LinalgOp on tensors.
|
|
static LogicalResult bufferizeLinalgOp(RewriterBase &rewriter, LinalgOp op,
|
|
BufferizationState &state) {
|
|
// Take a guard before anything else.
|
|
OpBuilder::InsertionGuard g(rewriter);
|
|
rewriter.setInsertionPoint(op);
|
|
|
|
// Nothing to do. This op is already bufferized.
|
|
if (op.hasBufferSemantics())
|
|
return success();
|
|
|
|
// Ensure op has only tensors. Allow mixed tensor-buffer mode on a per-need
|
|
// basis.
|
|
if (!op.hasTensorSemantics())
|
|
return op->emitError() << "op does not have tensor semantics";
|
|
|
|
// New input operands for the cloned op.
|
|
SmallVector<Value> newInputBuffers;
|
|
newInputBuffers.reserve(op.getNumInputs());
|
|
for (OpOperand *opOperand : op.getInputOperands()) {
|
|
if (op.isScalar(opOperand)) {
|
|
newInputBuffers.push_back(opOperand->get());
|
|
continue;
|
|
}
|
|
// Input operands are never written to.
|
|
newInputBuffers.push_back(*state.getBuffer(
|
|
rewriter, *opOperand,
|
|
BufferizationState::ForceInPlacability::FORCE_INPLACE));
|
|
}
|
|
|
|
// New output operands for the cloned op.
|
|
SmallVector<Value> newOutputBuffers;
|
|
for (OpResult opResult : op->getOpResults()) {
|
|
SmallVector<OpOperand *> aliasingOpOperands =
|
|
state.getAnalysisState().getAliasingOpOperand(opResult);
|
|
assert(aliasingOpOperands.size() == 1 && "expected 1 OpOperand");
|
|
FailureOr<Value> resultBuffer =
|
|
state.getBuffer(rewriter, *aliasingOpOperands.front());
|
|
if (failed(resultBuffer))
|
|
return failure();
|
|
newOutputBuffers.push_back(*resultBuffer);
|
|
}
|
|
|
|
// Merge input/output operands.
|
|
SmallVector<Value> newOperands = newInputBuffers;
|
|
newOperands.append(newOutputBuffers.begin(), newOutputBuffers.end());
|
|
|
|
// Set insertion point now that potential alloc/dealloc are introduced.
|
|
rewriter.setInsertionPoint(op);
|
|
// Clone the op, but use the new operands. Move the existing block into the
|
|
// new op. Since the new op does not have any tensor results, it does not
|
|
// return anything.
|
|
assert(op->getNumRegions() == 1 && "expected that op has 1 region");
|
|
auto newOp = cast<LinalgOp>(op.cloneWithoutRegions(
|
|
rewriter, op.getLoc(), /*resultTypes=*/TypeRange{}, newOperands));
|
|
rewriter.inlineRegionBefore(op->getRegion(0), newOp->getRegion(0),
|
|
newOp->getRegion(0).begin());
|
|
|
|
// Replace the results of the old op with the new output buffers.
|
|
replaceOpWithBufferizedValues(rewriter, op, newOutputBuffers);
|
|
|
|
return success();
|
|
}
|
|
|
|
/// Linalg OpResults usually bufferize inplace with their tied (output
|
|
/// OpOperands. However, if an output OpOperand is not used in the computation,
|
|
/// it is better to bufferize inplace with an actually used input OpOperand;
|
|
/// less memory will be touched that way.
|
|
///
|
|
/// Example:
|
|
/// O(i, j) = A(i, j) + B(j) --> bufferizes inplace to: A(i, j) += B(j)
|
|
///
|
|
/// O(i, j) = A(j, i) + B(j) --> cannot bufferize inplace with A because
|
|
/// indexing maps are not identical
|
|
///
|
|
/// O(i, j) += A(i, j) + B(j) --> Output is used in computation.
|
|
/// This could bufferize inplace with A:
|
|
/// A(i, j) += O(i, j) + B(j)
|
|
/// However, we choose to bufferize inplace with O here, as there is no clear
|
|
/// benefit of choosing A. TODO: We may want to consider both options and make
|
|
/// an informed decision during analysis in the future.
|
|
static DenseMap<OpOperand *, OpResult> computeAliasingPairs(LinalgOp op) {
|
|
DenseMap<OpOperand *, OpResult> mapping;
|
|
for (OpResult opResult : op->getOpResults()) {
|
|
OpOperand *tiedOperand =
|
|
op.getOutputTensorOperands()[opResult.getResultNumber()];
|
|
AffineMap outputIndexingMap = op.getTiedIndexingMap(tiedOperand);
|
|
bool onlyParallelIterators = op.getNumParallelLoops() == op.getNumLoops();
|
|
bool tiedOperandUsed = op.payloadUsesValueFromOperand(tiedOperand);
|
|
|
|
// If the output arg is used in the computation or at least one iterator is
|
|
// not parallel, try to bufferize inplace with the corresponding output
|
|
// tensor.
|
|
if (tiedOperandUsed || !onlyParallelIterators) {
|
|
mapping[tiedOperand] = opResult;
|
|
continue;
|
|
}
|
|
|
|
// Otherwise, try to bufferize inplace with one of the inputs.
|
|
OpOperand *chosenOperand = nullptr;
|
|
for (OpOperand *opOperand : op.getInputTensorOperands()) {
|
|
if (opOperand->get().getType() != opResult.getType())
|
|
continue;
|
|
if (!op.payloadUsesValueFromOperand(opOperand))
|
|
continue;
|
|
if (op.getTiedIndexingMap(opOperand) != outputIndexingMap)
|
|
continue;
|
|
// No other OpResult bufferizes aliases with this OpOperand.
|
|
if (mapping.count(opOperand))
|
|
continue;
|
|
assert(op.getTiedIndexingMap(opOperand).isProjectedPermutation() &&
|
|
"expected projected permutation");
|
|
chosenOperand = opOperand;
|
|
break;
|
|
}
|
|
|
|
// No suitable input tensor found. Use output tensor.
|
|
// TODO: This operand could bufferize inplace with OpOperands that have the
|
|
// correct type, even if they are not used inside the computation.
|
|
if (!chosenOperand)
|
|
chosenOperand = tiedOperand;
|
|
|
|
mapping[chosenOperand] = opResult;
|
|
}
|
|
return mapping;
|
|
}
|
|
|
|
/// Bufferization of linalg.generic. Replace with a new linalg.generic that
|
|
/// operates entirely on memrefs.
|
|
template <typename OpTy>
|
|
struct LinalgOpInterface
|
|
: public BufferizableOpInterface::ExternalModel<LinalgOpInterface<OpTy>,
|
|
OpTy> {
|
|
bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
|
|
const AnalysisState &state) const {
|
|
// Operand is read if it is used in the computation.
|
|
auto genericOp = cast<linalg::LinalgOp>(op);
|
|
return genericOp.payloadUsesValueFromOperand(&opOperand);
|
|
}
|
|
|
|
bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
|
|
const AnalysisState &state) const {
|
|
// Operand is written to if it has an aliasing OpResult.
|
|
auto bufferizableOp = cast<BufferizableOpInterface>(op);
|
|
return !bufferizableOp.getAliasingOpResult(opOperand, state).empty();
|
|
}
|
|
|
|
SmallVector<OpOperand *>
|
|
getAliasingOpOperand(Operation *op, OpResult opResult,
|
|
const AnalysisState &state) const {
|
|
auto genericOp = cast<linalg::LinalgOp>(op);
|
|
|
|
// By default, the i-th OpResult may alias with the i-th "out" tensor.
|
|
if (state.getOptions().alwaysAliasingWithDest)
|
|
return {genericOp.getOutputOperand(opResult.getResultNumber())};
|
|
|
|
// We can try to be smart and alias in-place with an "in" tensor if the
|
|
// corresponding "out" tensor is not used in the computation.
|
|
// Aliasing OpOperand/OpResult pairs are computed by `computeAliasingPairs`.
|
|
DenseMap<OpOperand *, OpResult> pairs = computeAliasingPairs(genericOp);
|
|
for (OpOperand *opOperand : genericOp.getInputAndOutputOperands())
|
|
if (pairs[opOperand] == opResult)
|
|
return {opOperand};
|
|
return {};
|
|
}
|
|
|
|
SmallVector<OpResult> getAliasingOpResult(Operation *op, OpOperand &opOperand,
|
|
const AnalysisState &state) const {
|
|
auto genericOp = cast<linalg::LinalgOp>(op);
|
|
|
|
// By default, the i-th "out" tensor may alias with the i-th OpResult.
|
|
if (state.getOptions().alwaysAliasingWithDest) {
|
|
if (genericOp.isOutputTensor(&opOperand))
|
|
return {genericOp.getTiedOpResult(&opOperand)};
|
|
return {};
|
|
}
|
|
|
|
// We can try to be smart. See comment in `getAliasingOpOperand`.
|
|
// Aliasing OpOperand/OpResult pairs are computed by `computeAliasingPairs`.
|
|
DenseMap<OpOperand *, OpResult> pairs = computeAliasingPairs(genericOp);
|
|
if (!pairs.count(&opOperand))
|
|
return {};
|
|
return {pairs[&opOperand]};
|
|
}
|
|
|
|
BufferRelation bufferRelation(Operation *op, OpResult opResult,
|
|
const AnalysisState &state) const {
|
|
return BufferRelation::Equivalent;
|
|
}
|
|
|
|
LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
|
|
BufferizationState &state) const {
|
|
return bufferizeLinalgOp(rewriter, cast<LinalgOp>(op), state);
|
|
}
|
|
};
|
|
|
|
/// Helper structure that iterates over all LinalgOps in `OpTys` and registers
|
|
/// the `BufferizableOpInterface` with each of them.
|
|
template <typename... Ops>
|
|
struct LinalgOpInterfaceHelper {
|
|
static void registerOpInterface(MLIRContext *ctx) {
|
|
(void)std::initializer_list<int>{
|
|
0, (Ops::template attachInterface<LinalgOpInterface<Ops>>(*ctx), 0)...};
|
|
}
|
|
};
|
|
} // namespace
|
|
|
|
/// Add an extension for the Linalg dialect to `registry` that attaches the
/// `BufferizableOpInterface` external models to all Linalg structured ops.
void mlir::linalg::registerBufferizableOpInterfaceExternalModels(
    DialectRegistry &registry) {
  registry.addExtension(+[](MLIRContext *ctx, linalg::LinalgDialect *dialect) {
    // Register all Linalg structured ops. `LinalgOp` is an interface and it is
    // not possible to attach an external interface to an existing interface.
    // Therefore, attach the `BufferizableOpInterface` to all ops one-by-one.
    LinalgOpInterfaceHelper<
#define GET_OP_LIST
#include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc"
        >::registerOpInterface(ctx);
  });
}
|