Adds initial support for GPU by-ref reductions. The main problem for
reduction by reference is that, prior to this PR, we were shuffling
(from remote lanes within the same warp or across different warps within
the block) pointers/references to the private reduction values rather
than the private reduction values themselves.
In particular, this diff adds support for reductions on scalar
allocatables where reductions happen on loops nested in `target`
regions. For example:
```fortran
integer :: i
real, allocatable :: scalar_alloc
allocate(scalar_alloc)
scalar_alloc = 0
!$omp target map(tofrom: scalar_alloc)
!$omp parallel do reduction(+: scalar_alloc)
do i = 1, 1000000
scalar_alloc = scalar_alloc + 1
end do
!$omp end target
```
This PR supports by-ref reductions on the intra- and inter-warp levels.
So far, there are still steps to be taken for full support of by-ref
reductions, for example:
* Inter-block value combination is still not supported.
Therefore, `target teams distribute parallel do` is still not supported.
* Support for dynamically-sized arrays still needs to be added.
* Support for more than one allocatable/array on the same `reduction`
clause.
952 lines
37 KiB
C++
952 lines
37 KiB
C++
//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "flang/Optimizer/Builder/DirectivesCommon.h"
|
|
#include "flang/Optimizer/Builder/FIRBuilder.h"
|
|
#include "flang/Optimizer/Builder/HLFIRTools.h"
|
|
#include "flang/Optimizer/Builder/Todo.h"
|
|
#include "flang/Optimizer/Dialect/FIROps.h"
|
|
#include "flang/Optimizer/HLFIR/HLFIROps.h"
|
|
#include "flang/Optimizer/OpenMP/Passes.h"
|
|
#include "flang/Optimizer/OpenMP/Utils.h"
|
|
#include "flang/Support/OpenMP-utils.h"
|
|
#include "flang/Utils/OpenMP.h"
|
|
#include "mlir/Analysis/SliceAnalysis.h"
|
|
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
|
|
#include "mlir/IR/IRMapping.h"
|
|
#include "mlir/Transforms/DialectConversion.h"
|
|
#include "mlir/Transforms/RegionUtils.h"
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
|
|
namespace flangomp {
|
|
#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
|
|
#include "flang/Optimizer/OpenMP/Passes.h.inc"
|
|
} // namespace flangomp
|
|
|
|
#define DEBUG_TYPE "do-concurrent-conversion"
|
|
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")
|
|
|
|
namespace {
|
|
namespace looputils {
|
|
/// Stores info needed about the induction/iteration variable for each `do
|
|
/// concurrent` in a loop nest.
|
|
struct InductionVariableInfo {
|
|
InductionVariableInfo(fir::DoConcurrentLoopOp loop,
|
|
mlir::Value inductionVar) {
|
|
populateInfo(loop, inductionVar);
|
|
}
|
|
/// The operation allocating memory for iteration variable.
|
|
mlir::Operation *iterVarMemDef;
|
|
/// the operation(s) updating the iteration variable with the current
|
|
/// iteration number.
|
|
llvm::SmallVector<mlir::Operation *, 2> indVarUpdateOps;
|
|
|
|
private:
|
|
/// For the \p doLoop parameter, find the following:
|
|
///
|
|
/// 1. The operation that declares its iteration variable or allocates memory
|
|
/// for it. For example, give the following loop:
|
|
/// ```
|
|
/// ...
|
|
/// %i:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : ...
|
|
/// ...
|
|
/// fir.do_concurrent.loop (%ind_var) = (%lb) to (%ub) step (%s) {
|
|
/// %ind_var_conv = fir.convert %ind_var : (index) -> i32
|
|
/// fir.store %ind_var_conv to %i#1 : !fir.ref<i32>
|
|
/// ...
|
|
/// }
|
|
/// ```
|
|
///
|
|
/// This function sets the `iterVarMemDef` member to the `hlfir.declare` op
|
|
/// for `%i`.
|
|
///
|
|
/// 2. The operation(s) that update the loop's iteration variable from its
|
|
/// induction variable. For the above example, the `indVarUpdateOps` is
|
|
/// populated with the first 2 ops in the loop's body.
|
|
///
|
|
/// Note: The current implementation is dependent on how flang emits loop
|
|
/// bodies; which is sufficient for the current simple test/use cases. If this
|
|
/// proves to be insufficient, this should be made more generic.
|
|
void populateInfo(fir::DoConcurrentLoopOp loop, mlir::Value inductionVar) {
|
|
mlir::Value result = nullptr;
|
|
|
|
// Checks if a StoreOp is updating the memref of the loop's iteration
|
|
// variable.
|
|
auto isStoringIV = [&](fir::StoreOp storeOp) {
|
|
// Direct store into the IV memref.
|
|
if (storeOp.getValue() == inductionVar) {
|
|
indVarUpdateOps.push_back(storeOp);
|
|
return true;
|
|
}
|
|
|
|
// Indirect store into the IV memref.
|
|
if (auto convertOp = mlir::dyn_cast<fir::ConvertOp>(
|
|
storeOp.getValue().getDefiningOp())) {
|
|
if (convertOp.getOperand() == inductionVar) {
|
|
indVarUpdateOps.push_back(convertOp);
|
|
indVarUpdateOps.push_back(storeOp);
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
};
|
|
|
|
for (mlir::Operation &op : loop) {
|
|
if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(op))
|
|
if (isStoringIV(storeOp)) {
|
|
result = storeOp.getMemref();
|
|
break;
|
|
}
|
|
}
|
|
|
|
assert(result != nullptr && result.getDefiningOp() != nullptr);
|
|
iterVarMemDef = result.getDefiningOp();
|
|
}
|
|
};
|
|
|
|
using InductionVariableInfos = llvm::SmallVector<InductionVariableInfo>;
|
|
|
|
/// Collect the list of values used inside the loop but defined outside of it.
|
|
void collectLoopLiveIns(fir::DoConcurrentLoopOp loop,
|
|
llvm::SmallVectorImpl<mlir::Value> &liveIns) {
|
|
llvm::SmallDenseSet<mlir::Value> seenValues;
|
|
llvm::SmallPtrSet<mlir::Operation *, 8> seenOps;
|
|
|
|
for (auto [lb, ub, st] : llvm::zip_equal(
|
|
loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
|
|
liveIns.push_back(lb);
|
|
liveIns.push_back(ub);
|
|
liveIns.push_back(st);
|
|
}
|
|
|
|
mlir::visitUsedValuesDefinedAbove(
|
|
loop.getRegion(), [&](mlir::OpOperand *operand) {
|
|
if (!seenValues.insert(operand->get()).second)
|
|
return;
|
|
|
|
mlir::Operation *definingOp = operand->get().getDefiningOp();
|
|
// We want to collect ops corresponding to live-ins only once.
|
|
if (definingOp && !seenOps.insert(definingOp).second)
|
|
return;
|
|
|
|
liveIns.push_back(operand->get());
|
|
});
|
|
|
|
for (mlir::Value local : loop.getLocalVars())
|
|
liveIns.push_back(local);
|
|
|
|
for (mlir::Value reduce : loop.getReduceVars())
|
|
liveIns.push_back(reduce);
|
|
}
|
|
|
|
/// Collects values that are local to a loop: "loop-local values". A loop-local
|
|
/// value is one that is used exclusively inside the loop but allocated outside
|
|
/// of it. This usually corresponds to temporary values that are used inside the
|
|
/// loop body for initialzing other variables for example.
|
|
///
|
|
/// See `flang/test/Transforms/DoConcurrent/locally_destroyed_temp.f90` for an
|
|
/// example of why we need this.
|
|
///
|
|
/// \param [in] doLoop - the loop within which the function searches for values
|
|
/// used exclusively inside.
|
|
///
|
|
/// \param [out] locals - the list of loop-local values detected for \p doLoop.
|
|
void collectLoopLocalValues(fir::DoConcurrentLoopOp loop,
|
|
llvm::SetVector<mlir::Value> &locals) {
|
|
loop.walk([&](mlir::Operation *op) {
|
|
for (mlir::Value operand : op->getOperands()) {
|
|
if (locals.contains(operand))
|
|
continue;
|
|
|
|
bool isLocal = true;
|
|
|
|
if (!mlir::isa_and_present<fir::AllocaOp>(operand.getDefiningOp()))
|
|
continue;
|
|
|
|
// Values defined inside the loop are not interesting since they do not
|
|
// need to be localized.
|
|
if (loop->isAncestor(operand.getDefiningOp()))
|
|
continue;
|
|
|
|
for (auto *user : operand.getUsers()) {
|
|
if (!loop->isAncestor(user)) {
|
|
isLocal = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (isLocal)
|
|
locals.insert(operand);
|
|
}
|
|
});
|
|
}
|
|
|
|
/// For a "loop-local" value \p local within a loop's scope, localizes that
|
|
/// value within the scope of the parallel region the loop maps to. Towards that
|
|
/// end, this function moves the allocation of \p local within \p allocRegion.
|
|
///
|
|
/// \param local - the value used exclusively within a loop's scope (see
|
|
/// collectLoopLocalValues).
|
|
///
|
|
/// \param allocRegion - the parallel region where \p local's allocation will be
|
|
/// privatized.
|
|
///
|
|
/// \param rewriter - builder used for updating \p allocRegion.
|
|
static void localizeLoopLocalValue(mlir::Value local, mlir::Region &allocRegion,
|
|
mlir::ConversionPatternRewriter &rewriter) {
|
|
rewriter.moveOpBefore(local.getDefiningOp(), &allocRegion.front().front());
|
|
}
|
|
} // namespace looputils
|
|
|
|
class DoConcurrentConversion
|
|
: public mlir::OpConversionPattern<fir::DoConcurrentOp> {
|
|
private:
|
|
struct TargetDeclareShapeCreationInfo {
|
|
// Note: We use `std::vector` (rather than `llvm::SmallVector` as usual) to
|
|
// interface more easily `ShapeShiftOp::getOrigins()` which returns
|
|
// `std::vector`.
|
|
std::vector<mlir::Value> startIndices;
|
|
std::vector<mlir::Value> extents;
|
|
|
|
TargetDeclareShapeCreationInfo(mlir::Value liveIn) {
|
|
mlir::Value shape = nullptr;
|
|
mlir::Operation *liveInDefiningOp = liveIn.getDefiningOp();
|
|
auto declareOp =
|
|
mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveInDefiningOp);
|
|
|
|
if (declareOp != nullptr)
|
|
shape = declareOp.getShape();
|
|
|
|
if (!shape)
|
|
return;
|
|
|
|
auto shapeOp =
|
|
mlir::dyn_cast_if_present<fir::ShapeOp>(shape.getDefiningOp());
|
|
auto shapeShiftOp =
|
|
mlir::dyn_cast_if_present<fir::ShapeShiftOp>(shape.getDefiningOp());
|
|
|
|
if (!shapeOp && !shapeShiftOp)
|
|
TODO(liveIn.getLoc(),
|
|
"Shapes not defined by `fir.shape` or `fir.shape_shift` op's are"
|
|
"not supported yet.");
|
|
|
|
if (shapeShiftOp != nullptr)
|
|
startIndices = shapeShiftOp.getOrigins();
|
|
|
|
extents = shapeOp != nullptr
|
|
? std::vector<mlir::Value>(shapeOp.getExtents().begin(),
|
|
shapeOp.getExtents().end())
|
|
: shapeShiftOp.getExtents();
|
|
}
|
|
|
|
bool isShapedValue() const { return !extents.empty(); }
|
|
bool isShapeShiftedValue() const { return !startIndices.empty(); }
|
|
};
|
|
|
|
using LiveInShapeInfoMap =
|
|
llvm::DenseMap<mlir::Value, TargetDeclareShapeCreationInfo>;
|
|
|
|
public:
|
|
using mlir::OpConversionPattern<fir::DoConcurrentOp>::OpConversionPattern;
|
|
|
|
/// Creates the conversion pattern.
///
/// \param context - MLIR context the pattern is registered in.
/// \param mapToDevice - stored in the `mapToDevice` member.
/// \param concurrentLoopsToSkip - set of loops; the pattern keeps a reference
///        to it and inserts nested `fir.do_concurrent` ops into it.
/// \param moduleSymbolTable - symbol table; the pattern keeps a reference to
///        it for symbol look-ups and insertions.
DoConcurrentConversion(
    mlir::MLIRContext *context, bool mapToDevice,
    llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip,
    mlir::SymbolTable &moduleSymbolTable)
    : OpConversionPattern(context),
      mapToDevice(mapToDevice),
      concurrentLoopsToSkip(concurrentLoopsToSkip),
      moduleSymbolTable(moduleSymbolTable) {
}
|
|
|
|
/// Rewrites a `fir.do_concurrent` op into a nest of OpenMP ops:
/// `omp.parallel` wrapping `omp.wsloop` wrapping `omp.loop_nest`. When
/// `mapToDevice` is set, the nest is additionally wrapped in `omp.target` and
/// `omp.teams`, and an `omp.distribute` is emitted before the `omp.wsloop`.
mlir::LogicalResult
matchAndRewrite(fir::DoConcurrentOp doLoop, OpAdaptor adaptor,
                mlir::ConversionPatternRewriter &rewriter) const override {
  auto loop = mlir::cast<fir::DoConcurrentLoopOp>(
      doLoop.getRegion().back().getTerminator());

  auto indVars = loop.getLoopInductionVars();
  assert(indVars.has_value());

  looputils::InductionVariableInfos ivInfos;
  for (mlir::Value indVar : *indVars)
    ivInfos.emplace_back(loop, indVar);

  llvm::SmallVector<mlir::Value> loopNestLiveIns;
  looputils::collectLoopLiveIns(loop, loopNestLiveIns);
  assert(!loopNestLiveIns.empty());

  llvm::SetVector<mlir::Value> locals;
  looputils::collectLoopLocalValues(loop, locals);

  // "Loop-local" values must not be mapped to the device through
  // `omp.map.info` ops, so they are dropped from the list of live-ins.
  llvm::erase_if(loopNestLiveIns, [&](mlir::Value liveIn) {
    return locals.contains(liveIn);
  });

  mlir::omp::TargetOp targetOp;
  mlir::omp::LoopNestOperands loopNestClauseOps;
  mlir::IRMapping mapper;

  if (mapToDevice) {
    mlir::ModuleOp module = doLoop->getParentOfType<mlir::ModuleOp>();
    const bool isTargetDevice =
        llvm::cast<mlir::omp::OffloadModuleInterface>(*module)
            .getIsTargetDevice();

    // Host-evaluated bounds are only recorded when not compiling for the
    // target device.
    mlir::omp::TargetOperands targetClauseOps;
    mlir::omp::TargetOperands *hostEvalDest =
        isTargetDevice ? nullptr : &targetClauseOps;
    genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps,
                         hostEvalDest);

    LiveInShapeInfoMap liveInShapeInfoMap;
    fir::FirOpBuilder builder(rewriter, fir::getKindMapping(module));

    for (mlir::Value liveIn : loopNestLiveIns) {
      mlir::omp::MapInfoOp mapInfo = genMapInfoOpForLiveIn(builder, liveIn);
      targetClauseOps.mapVars.push_back(mapInfo);
      liveInShapeInfoMap.insert(
          {liveIn, TargetDeclareShapeCreationInfo(liveIn)});
    }

    targetOp =
        genTargetOp(doLoop.getLoc(), rewriter, mapper, loopNestLiveIns,
                    targetClauseOps, loopNestClauseOps, liveInShapeInfoMap);
    genTeamsOp(rewriter, loop, mapper);
  }

  mlir::omp::ParallelOp parallelOp =
      genParallelOp(rewriter, loop, ivInfos, mapper);

  // Only set as composite when part of `distribute parallel do`.
  parallelOp.setComposite(mapToDevice);

  if (!mapToDevice)
    genLoopNestClauseOps(doLoop.getLoc(), rewriter, loop, loopNestClauseOps);

  for (mlir::Value local : locals)
    looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
                                      rewriter);

  if (mapToDevice) {
    mlir::omp::DistributeOp distributeOp =
        genDistributeOp(doLoop.getLoc(), rewriter);
    distributeOp.setComposite(/*val=*/true);
  }

  auto [loopNestOp, wsLoopOp] =
      genWsLoopOp(rewriter, loop, mapper, loopNestClauseOps,
                  /*isComposite=*/mapToDevice);

  // Cloning the `do concurrent` region into the loop nest op carried the
  // `local` and `reduce` region arguments along with it. These arguments
  // belong on the enclosing OpenMP ops instead, so redirect their uses there.
  if (mapToDevice) {
    auto clonedLocalArgs = loopNestOp.getRegion().getArguments().slice(
        loop.getLocalOperandsStart(), loop.getNumLocalOperands());
    for (auto [parallelArg, loopNestArg] : llvm::zip_equal(
             parallelOp.getRegion().getArguments(), clonedLocalArgs))
      rewriter.replaceAllUsesWith(loopNestArg, parallelArg);

    auto clonedReduceArgs = loopNestOp.getRegion().getArguments().slice(
        loop.getReduceOperandsStart(), loop.getNumReduceOperands());
    for (auto [wsloopArg, loopNestArg] : llvm::zip_equal(
             wsLoopOp.getRegion().getArguments(), clonedReduceArgs))
      rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
  } else {
    auto clonedClauseArgs = loopNestOp.getRegion().getArguments().drop_front(
        loopNestClauseOps.loopLowerBounds.size());
    for (auto [wsloopArg, loopNestArg] : llvm::zip_equal(
             wsLoopOp.getRegion().getArguments(), clonedClauseArgs))
      rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
  }

  // Drop the now unused `local`/`reduce` arguments from the loop nest. They
  // directly follow the induction variables, so the same index is erased
  // repeatedly.
  const unsigned numClauseArgs =
      loop.getLocalVars().size() + loop.getReduceVars().size();
  const unsigned firstClauseArgIdx = loopNestClauseOps.loopLowerBounds.size();
  for (unsigned erased = 0; erased < numClauseArgs; ++erased)
    loopNestOp.getRegion().eraseArgument(firstClauseArgIdx);

  rewriter.setInsertionPoint(doLoop);
  fir::FirOpBuilder builder(
      rewriter,
      fir::getKindMapping(doLoop->getParentOfType<mlir::ModuleOp>()));

  // Collect the ops preceding the `fir.do_concurrent` wrapper's terminator
  // (e.g. the iteration variable(s) allocations) so that they can be moved
  // out of the wrapper before it is erased.
  llvm::SmallVector<mlir::Operation *> opsToMove;
  for (mlir::Operation &wrapperOp : llvm::drop_end(doLoop))
    opsToMove.push_back(&wrapperOp);

  // Moving in reverse order to the block's front preserves the ops' relative
  // order.
  mlir::Block *allocBlock = builder.getAllocaBlock();
  for (mlir::Operation *toMove : llvm::reverse(opsToMove))
    rewriter.moveOpBefore(toMove, allocBlock, allocBlock->begin());

  // Mark loops nested in the generated loop nest to be skipped from the
  // legality check of the `ConversionTarget` since we are not interested in
  // mapping them to OpenMP.
  loopNestOp->walk([&](fir::DoConcurrentOp nestedLoop) {
    concurrentLoopsToSkip.insert(nestedLoop);
  });

  rewriter.eraseOp(doLoop);
  return mlir::success();
}
|
|
|
|
private:
|
|
/// Creates the `omp.parallel` op for \p loop together with its entry block
/// and terminator, and clones the iteration variable allocations described by
/// \p ivInfos in front of the terminator. On return, \p rewriter is positioned
/// before the terminator. Privatizers are attached to the op only when
/// mapping to the device.
mlir::omp::ParallelOp
genParallelOp(mlir::ConversionPatternRewriter &rewriter,
              fir::DoConcurrentLoopOp loop,
              looputils::InductionVariableInfos &ivInfos,
              mlir::IRMapping &mapper) const {
  const mlir::Location loc = loop.getLoc();

  mlir::omp::ParallelOperands clauseOperands;
  if (mapToDevice)
    genPrivatizers(rewriter, mapper, loop, clauseOperands);

  auto parallelOp =
      mlir::omp::ParallelOp::create(rewriter, loc, clauseOperands);

  Fortran::common::openmp::EntryBlockArgs entryArgs;
  entryArgs.priv.vars = clauseOperands.privateVars;
  Fortran::common::openmp::genEntryBlock(rewriter, entryArgs,
                                         parallelOp.getRegion());

  auto terminator = mlir::omp::TerminatorOp::create(rewriter, loc);
  rewriter.setInsertionPoint(terminator);

  genLoopNestIndVarAllocs(rewriter, ivInfos, mapper);
  return parallelOp;
}
|
|
|
|
/// Clones, at the rewriter's current insertion point, the memory definition
/// of each iteration variable listed in \p ivInfos.
void genLoopNestIndVarAllocs(mlir::ConversionPatternRewriter &rewriter,
                             looputils::InductionVariableInfos &ivInfos,
                             mlir::IRMapping &mapper) const {
  for (looputils::InductionVariableInfo &ivInfo : ivInfos) {
    mlir::Operation *memDef = ivInfo.iterVarMemDef;
    genInductionVariableAlloc(rewriter, memDef, mapper);
  }
}
|
|
|
|
/// Clones \p indVarMemDef, preceded by the ops defining its operands, at the
/// rewriter's current insertion point.
///
/// \param rewriter - used to clone the ops.
/// \param indVarMemDef - op that declares/allocates the iteration variable;
///                       must not be null.
/// \param mapper - value mapping used (and updated) while cloning.
///
/// \return the clone of \p indVarMemDef.
mlir::Operation *
genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter,
                          mlir::Operation *indVarMemDef,
                          mlir::IRMapping &mapper) const {
  assert(
      indVarMemDef != nullptr &&
      "Induction variable memdef is expected to have a defining operation.");

  llvm::SmallSetVector<mlir::Operation *, 2> indVarDeclareAndAlloc;
  for (mlir::Value operand : indVarMemDef->getOperands()) {
    // Operands that are block arguments have no defining op; there is
    // nothing to clone for them (cloning a null op would be a null
    // dereference).
    if (mlir::Operation *operandDef = operand.getDefiningOp())
      indVarDeclareAndAlloc.insert(operandDef);
  }
  // Inserted last so that its operands' definitions are cloned first.
  indVarDeclareAndAlloc.insert(indVarMemDef);

  mlir::Operation *result = nullptr;
  for (mlir::Operation *opToClone : indVarDeclareAndAlloc)
    result = rewriter.clone(*opToClone, mapper);

  return result;
}
|
|
|
|
/// Fills \p loopNestClauseOps with the bounds and steps of \p loop and marks
/// the loop nest as inclusive.
///
/// \param targetClauseOps - when non-null, every bound and step is also
///        recorded in its `hostEvalVars` list so that it is evaluated on the
///        host and forwarded to the nested omp constructs.
void genLoopNestClauseOps(
    mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
    fir::DoConcurrentLoopOp loop,
    mlir::omp::LoopNestOperands &loopNestClauseOps,
    mlir::omp::TargetOperands *targetClauseOps = nullptr) const {
  assert(loopNestClauseOps.loopLowerBounds.empty() &&
         "Loop nest bounds were already emitted!");

  // Records one bound/step in `dest` and, when mapping to the device, in the
  // target's host-eval list as well.
  auto captureBound = [targetClauseOps](
                          mlir::Value bound,
                          llvm::SmallVectorImpl<mlir::Value> &dest) {
    dest.push_back(bound.getDefiningOp()->getResult(0));

    if (targetClauseOps)
      targetClauseOps->hostEvalVars.push_back(bound);
  };

  for (auto [lower, upper, step] : llvm::zip_equal(
           loop.getLowerBound(), loop.getUpperBound(), loop.getStep())) {
    captureBound(lower, loopNestClauseOps.loopLowerBounds);
    captureBound(upper, loopNestClauseOps.loopUpperBounds);
    captureBound(step, loopNestClauseOps.loopSteps);
  }

  loopNestClauseOps.loopInclusive = rewriter.getUnitAttr();
}
|
|
|
|
/// Creates the `omp.wsloop` op and the `omp.loop_nest` op nested in it, and
/// clones the body of \p loop into the loop nest (terminated by an
/// `omp.yield`). Reductions are always attached to the `omp.wsloop`;
/// privatizers only when not mapping to the device.
///
/// \return the created (loop nest, workshare loop) pair.
std::pair<mlir::omp::LoopNestOp, mlir::omp::WsloopOp>
genWsLoopOp(mlir::ConversionPatternRewriter &rewriter,
            fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
            const mlir::omp::LoopNestOperands &clauseOps,
            bool isComposite) const {
  const mlir::Location loc = loop.getLoc();

  mlir::omp::WsloopOperands wsloopOperands;
  if (!mapToDevice)
    genPrivatizers(rewriter, mapper, loop, wsloopOperands);
  genReductions(rewriter, mapper, loop, wsloopOperands);

  auto wsloopOp = mlir::omp::WsloopOp::create(rewriter, loc, wsloopOperands);
  wsloopOp.setComposite(isComposite);

  Fortran::common::openmp::EntryBlockArgs entryArgs;
  entryArgs.priv.vars = wsloopOperands.privateVars;
  entryArgs.reduction.vars = wsloopOperands.reductionVars;
  Fortran::common::openmp::genEntryBlock(rewriter, entryArgs,
                                         wsloopOp.getRegion());

  auto loopNestOp = mlir::omp::LoopNestOp::create(rewriter, loc, clauseOps);
  mlir::Region &nestRegion = loopNestOp.getRegion();

  // The loop's body is cloned into the loop nest construct using the mapped
  // values.
  rewriter.cloneRegionBefore(loop.getRegion(), nestRegion, nestRegion.begin(),
                             mapper);

  rewriter.setInsertionPointToEnd(&nestRegion.back());
  mlir::omp::YieldOp::create(rewriter, loc);

  return {loopNestOp, wsloopOp};
}
|
|
|
|
/// Generates the implicit `omp.map.bounds` ops for \p liveIn and stores them
/// in \p boundsOps. All generated ops use the location of \p rawAddr.
void genBoundsOps(fir::FirOpBuilder &builder, mlir::Value liveIn,
                  mlir::Value rawAddr,
                  llvm::SmallVectorImpl<mlir::Value> &boundsOps) const {
  const mlir::Location loc = rawAddr.getLoc();

  fir::ExtendedValue liveInExv =
      hlfir::translateToExtendedValue(loc, builder, hlfir::Entity{liveIn},
                                      /*contiguousHint=*/true)
          .first;

  fir::factory::AddrAndBoundsInfo addrInfo =
      fir::factory::getDataOperandBaseAddr(builder, rawAddr,
                                           /*isOptional=*/false, loc);

  boundsOps = fir::factory::genImplicitBoundsOps<mlir::omp::MapBoundsOp,
                                                 mlir::omp::MapBoundsType>(
      builder, addrInfo, liveInExv,
      /*dataExvIsAssumedSize=*/false, loc);
}
|
|
|
|
/// Creates the `omp.map.info` op that maps \p liveIn to the target region.
///
/// Live-ins whose address type is not pointer-like are first copied into a
/// temporary, and the temporary is mapped instead. Trivial and character
/// element types are captured by copy; other types are captured by reference
/// and, unless they are builtin C pointer types, mapped `to` and `from`.
///
/// \param builder - used to create the temporary (if needed) and the map op.
/// \param liveIn - the live-in value to map.
mlir::omp::MapInfoOp genMapInfoOpForLiveIn(fir::FirOpBuilder &builder,
                                           mlir::Value liveIn) const {
  mlir::Value rawAddr = liveIn;
  llvm::StringRef name;

  mlir::Operation *liveInDefiningOp = liveIn.getDefiningOp();
  auto declareOp =
      mlir::dyn_cast_if_present<hlfir::DeclareOp>(liveInDefiningOp);

  if (declareOp != nullptr) {
    // Use the raw address to avoid unboxing `fir.box` values whenever
    // possible. Put differently, if we have access to the direct value memory
    // reference/address, we use it.
    rawAddr = declareOp.getOriginalBase();
    name = declareOp.getUniqName();
  }

  if (!llvm::isa<mlir::omp::PointerLikeType>(rawAddr.getType())) {
    mlir::OpBuilder::InsertionGuard guard(builder);
    // `liveIn` may be a block argument, in which case it has no defining op.
    // `setInsertionPointAfterValue` handles both op results (insert after
    // the defining op) and block arguments (insert at the block's start).
    builder.setInsertionPointAfterValue(liveIn);
    auto copyVal = builder.createTemporary(liveIn.getLoc(), liveIn.getType());
    builder.createStoreWithConvert(copyVal.getLoc(), liveIn, copyVal);
    rawAddr = copyVal;
  }

  mlir::Type liveInType = liveIn.getType();
  mlir::Type eleType = liveInType;
  if (auto refType = mlir::dyn_cast<fir::ReferenceType>(liveInType))
    eleType = refType.getElementType();

  mlir::omp::ClauseMapFlags mapFlag = mlir::omp::ClauseMapFlags::implicit;
  mlir::omp::VariableCaptureKind captureKind =
      mlir::omp::VariableCaptureKind::ByRef;

  if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) {
    captureKind = mlir::omp::VariableCaptureKind::ByCopy;
  } else if (!fir::isa_builtin_cptr_type(eleType)) {
    mapFlag |= mlir::omp::ClauseMapFlags::to;
    mapFlag |= mlir::omp::ClauseMapFlags::from;
  }

  llvm::SmallVector<mlir::Value> boundsOps;
  genBoundsOps(builder, liveIn, rawAddr, boundsOps);

  return Fortran::utils::openmp::createMapInfoOp(
      builder, liveIn.getLoc(), rawAddr,
      /*varPtrPtr=*/{}, name.str(), boundsOps,
      /*members=*/{},
      /*membersIndex=*/mlir::ArrayAttr{}, mapFlag, captureKind,
      rawAddr.getType());
}
|
|
|
|
/// Creates the `omp.target` op with its entry block and terminator, declares
/// each mapped live-in inside the target region and registers the
/// host-to-device value mappings in \p mapper. On return, \p rewriter is
/// positioned before the target region's terminator.
///
/// \param loc - location used for the target op.
/// \param rewriter - used to create the op and its block.
/// \param mapper - receives the mappings from host values to the values
///                 representing them inside the target region.
/// \param mappedVars - the live-ins; element `i` corresponds to
///                     `clauseOps.mapVars[i]`.
/// \param clauseOps - host-eval and map operands of the target op.
/// \param loopNestClauseOps - loop bounds/steps; rewritten in place to their
///                            mapped values.
/// \param liveInShapeInfoMap - shape info for every element of \p mappedVars.
mlir::omp::TargetOp
genTargetOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
            mlir::IRMapping &mapper, llvm::ArrayRef<mlir::Value> mappedVars,
            mlir::omp::TargetOperands &clauseOps,
            mlir::omp::LoopNestOperands &loopNestClauseOps,
            const LiveInShapeInfoMap &liveInShapeInfoMap) const {
  auto targetOp = mlir::omp::TargetOp::create(rewriter, loc, clauseOps);
  auto argIface = llvm::cast<mlir::omp::BlockArgOpenMPOpInterface>(*targetOp);

  mlir::Region &region = targetOp.getRegion();

  // The entry block takes one argument per host-eval value followed by one
  // argument per mapped value.
  llvm::SmallVector<mlir::Type> regionArgTypes;
  llvm::SmallVector<mlir::Location> regionArgLocs;

  for (auto var : llvm::concat<const mlir::Value>(clauseOps.hostEvalVars,
                                                  clauseOps.mapVars)) {
    regionArgTypes.push_back(var.getType());
    regionArgLocs.push_back(var.getLoc());
  }

  rewriter.createBlock(&region, {}, regionArgTypes, regionArgLocs);
  fir::FirOpBuilder builder(
      rewriter,
      fir::getKindMapping(targetOp->getParentOfType<mlir::ModuleOp>()));

  // Maps a host value to its device counterpart. Host values that are not
  // pointer-like are mapped to the value loaded from the device reference.
  auto mapHostValueToDevice = [&](mlir::Value hostValue,
                                  mlir::Value deviceValue) {
    if (!llvm::isa<mlir::omp::PointerLikeType>(hostValue.getType()))
      mapper.map(hostValue,
                 builder.loadIfRef(hostValue.getLoc(), deviceValue));
    else
      mapper.map(hostValue, deviceValue);
  };

  // Within the loop, it is possible that we discover other values that need
  // to be mapped to the target region (the shape info values for arrays, for
  // example). Therefore, the map block args might be extended and resized.
  // Hence, we invoke `argIface.getMapBlockArgs()` every iteration to make
  // sure we access the proper vector of data.
  int idx = 0;
  for (auto [mapInfoOp, mappedVar] :
       llvm::zip_equal(clauseOps.mapVars, mappedVars)) {
    auto miOp = mlir::cast<mlir::omp::MapInfoOp>(mapInfoOp.getDefiningOp());
    hlfir::DeclareOp liveInDeclare =
        genLiveInDeclare(builder, targetOp, argIface.getMapBlockArgs()[idx],
                         miOp, liveInShapeInfoMap.at(mappedVar));
    ++idx;

    // If `mappedVar.getDefiningOp()` is a `fir::BoxAddrOp`, we probably
    // need to "unpack" the box by getting the defining op of its value.
    // However, we did not hit this case in reality yet so leaving it as a
    // todo for now. `mappedVar` may be a block argument, hence
    // `isa_and_present` (a plain `isa` asserts on a null operation).
    if (mlir::isa_and_present<fir::BoxAddrOp>(mappedVar.getDefiningOp()))
      TODO(mappedVar.getLoc(),
           "Mapped variabled defined by `BoxAddrOp` are not supported yet");

    mapHostValueToDevice(mappedVar, liveInDeclare.getOriginalBase());

    if (auto origDeclareOp = mlir::dyn_cast_if_present<hlfir::DeclareOp>(
            mappedVar.getDefiningOp()))
      mapHostValueToDevice(origDeclareOp.getBase(), liveInDeclare.getBase());
  }

  for (auto [arg, hostEval] : llvm::zip_equal(argIface.getHostEvalBlockArgs(),
                                              clauseOps.hostEvalVars))
    mapper.map(hostEval, arg);

  // Make the loop nest use the values visible inside the target region.
  for (unsigned i = 0; i < loopNestClauseOps.loopLowerBounds.size(); ++i) {
    loopNestClauseOps.loopLowerBounds[i] =
        mapper.lookup(loopNestClauseOps.loopLowerBounds[i]);
    loopNestClauseOps.loopUpperBounds[i] =
        mapper.lookup(loopNestClauseOps.loopUpperBounds[i]);
    loopNestClauseOps.loopSteps[i] =
        mapper.lookup(loopNestClauseOps.loopSteps[i]);
  }

  // Check if cloning the bounds introduced any dependency on the outer
  // region. If so, then either clone them as well if they are
  // MemoryEffectFree, or else copy them to a new temporary and add them to
  // the map and block_argument lists and replace their uses with the new
  // temporary.
  Fortran::utils::openmp::cloneOrMapRegionOutsiders(builder, targetOp);
  rewriter.setInsertionPoint(
      mlir::omp::TerminatorOp::create(rewriter, targetOp.getLoc()));

  return targetOp;
}
|
|
|
|
/// Creates, inside the target region, the `hlfir.declare` op for a mapped
/// live-in value.
///
/// \param builder - used to create the shape and declare ops.
/// \param targetOp - the target op; shape components are mapped into it
///                   through `mapTemporaryValue`.
/// \param liveInArg - the target region argument representing the live-in.
/// \param liveInMapInfoOp - map info op of the live-in; provides its name
///                          (an empty name is used if it has none).
/// \param targetShapeCreationInfo - shape components of the live-in; if
///        there are none, the declare op is created without a shape.
hlfir::DeclareOp genLiveInDeclare(
    fir::FirOpBuilder &builder, mlir::omp::TargetOp targetOp,
    mlir::Value liveInArg, mlir::omp::MapInfoOp liveInMapInfoOp,
    const TargetDeclareShapeCreationInfo &targetShapeCreationInfo) const {
  std::string liveInName;
  if (auto mapName = liveInMapInfoOp.getName())
    liveInName = mapName->str();

  // Maps one shape component (a start index or an extent) into the target
  // region under the name `<live-in name><kind><dimension>`.
  auto mapShapeComponent = [&](mlir::Value component, const char *kind,
                               size_t dim) -> mlir::Value {
    return Fortran::utils::openmp::mapTemporaryValue(
        builder, targetOp, component, liveInName + kind + std::to_string(dim));
  };

  mlir::Value shape = [&]() -> mlir::Value {
    if (!targetShapeCreationInfo.isShapedValue())
      return {};

    if (targetShapeCreationInfo.isShapeShiftedValue()) {
      // `fir.shape_shift` operands are (start index, extent) pairs, one pair
      // per dimension.
      llvm::SmallVector<mlir::Value> shapeShiftOperands;

      size_t shapeIdx = 0;
      for (auto [startIndex, extent] :
           llvm::zip_equal(targetShapeCreationInfo.startIndices,
                           targetShapeCreationInfo.extents)) {
        shapeShiftOperands.push_back(
            mapShapeComponent(startIndex, ".start_idx.dim", shapeIdx));
        shapeShiftOperands.push_back(
            mapShapeComponent(extent, ".extent.dim", shapeIdx));
        ++shapeIdx;
      }

      auto shapeShiftType = fir::ShapeShiftType::get(
          builder.getContext(), shapeShiftOperands.size() / 2);
      return fir::ShapeShiftOp::create(builder, liveInArg.getLoc(),
                                       shapeShiftType, shapeShiftOperands);
    }

    llvm::SmallVector<mlir::Value> shapeOperands;
    size_t shapeIdx = 0;
    for (auto extent : targetShapeCreationInfo.extents) {
      shapeOperands.push_back(
          mapShapeComponent(extent, ".extent.dim", shapeIdx));
      ++shapeIdx;
    }

    return fir::ShapeOp::create(builder, liveInArg.getLoc(), shapeOperands);
  }();

  return hlfir::DeclareOp::create(builder, liveInArg.getLoc(), liveInArg,
                                  liveInName, shape);
}
|
|
|
|
/// Creates the `omp.teams` op (carrying the loop's reductions) with its entry
/// block and terminator, and maps each reduction variable of \p loop to the
/// corresponding teams region argument in \p mapper. On return, \p rewriter
/// is positioned before the terminator.
mlir::omp::TeamsOp genTeamsOp(mlir::ConversionPatternRewriter &rewriter,
                              fir::DoConcurrentLoopOp loop,
                              mlir::IRMapping &mapper) const {
  const mlir::Location loc = loop.getLoc();

  mlir::omp::TeamsOperands clauseOperands;
  genReductions(rewriter, mapper, loop, clauseOperands);

  auto teamsOp = mlir::omp::TeamsOp::create(rewriter, loc, clauseOperands);

  Fortran::common::openmp::EntryBlockArgs entryArgs;
  entryArgs.reduction.vars = clauseOperands.reductionVars;
  Fortran::common::openmp::genEntryBlock(rewriter, entryArgs,
                                         teamsOp.getRegion());

  auto terminator = mlir::omp::TerminatorOp::create(rewriter, loc);
  rewriter.setInsertionPoint(terminator);

  for (auto [reduceVar, regionArg] : llvm::zip_equal(
           loop.getReduceVars(), teamsOp.getRegion().getArguments()))
    mapper.map(reduceVar, regionArg);

  return teamsOp;
}
|
|
|
|
/// Creates a clause-less `omp.distribute` op with an empty block in its
/// region.
mlir::omp::DistributeOp
genDistributeOp(mlir::Location loc,
                mlir::ConversionPatternRewriter &rewriter) const {
  mlir::omp::DistributeOperands noClauses{};
  auto distributeOp =
      mlir::omp::DistributeOp::create(rewriter, loc, /*clauses=*/noClauses);

  rewriter.createBlock(&distributeOp.getRegion());
  return distributeOp;
}
|
|
|
|
/// Clones \p firRegion into \p ompRegion and swaps the `fir.yield` that
/// terminates the last cloned block for an `omp.yield` with the same
/// operands. Does nothing when \p firRegion is empty.
void cloneFIRRegionToOMP(mlir::ConversionPatternRewriter &rewriter,
                         mlir::Region &firRegion,
                         mlir::Region &ompRegion) const {
  if (firRegion.empty())
    return;

  rewriter.cloneRegionBefore(firRegion, ompRegion, ompRegion.begin());

  mlir::Operation *clonedTerminator = ompRegion.back().getTerminator();
  auto oldYield = mlir::cast<fir::YieldOp>(clonedTerminator);

  rewriter.setInsertionPoint(oldYield);
  mlir::omp::YieldOp::create(rewriter, oldYield.getLoc(),
                             oldYield.getOperands());
  rewriter.eraseOp(oldYield);
}
|
|
|
|
/// Generate bodies of OpenMP privatizers by cloning the bodies of FIR
|
|
/// privatizers.
|
|
///
|
|
/// \param [in] rewriter - used to driver IR generation for privatizers.
|
|
/// \param [in] mapper - value mapping from FIR to OpenMP constructs.
|
|
/// \param [in] loop - FIR loop to convert its localizers.
|
|
///
|
|
/// \param [out] privateClauseOps - OpenMP privatizers to gen their bodies.
|
|
void genPrivatizers(mlir::ConversionPatternRewriter &rewriter,
|
|
mlir::IRMapping &mapper, fir::DoConcurrentLoopOp loop,
|
|
mlir::omp::PrivateClauseOps &privateClauseOps) const {
|
|
// For `local` (and `local_init`) operands, emit corresponding `private`
|
|
// clauses and attach these clauses to the workshare loop.
|
|
if (!loop.getLocalVars().empty())
|
|
for (auto [var, sym, arg] : llvm::zip_equal(
|
|
loop.getLocalVars(),
|
|
loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
|
|
loop.getRegionLocalArgs())) {
|
|
auto localizer = moduleSymbolTable.lookup<fir::LocalitySpecifierOp>(
|
|
sym.getLeafReference());
|
|
if (localizer.getLocalitySpecifierType() ==
|
|
fir::LocalitySpecifierType::LocalInit)
|
|
TODO(localizer.getLoc(),
|
|
"local_init conversion is not supported yet");
|
|
|
|
mlir::OpBuilder::InsertionGuard guard(rewriter);
|
|
rewriter.setInsertionPointAfter(localizer);
|
|
|
|
auto privatizer = mlir::omp::PrivateClauseOp::create(
|
|
rewriter, localizer.getLoc(), sym.getLeafReference().str() + ".omp",
|
|
localizer.getTypeAttr().getValue(),
|
|
mlir::omp::DataSharingClauseType::Private);
|
|
|
|
cloneFIRRegionToOMP(rewriter, localizer.getInitRegion(),
|
|
privatizer.getInitRegion());
|
|
cloneFIRRegionToOMP(rewriter, localizer.getDeallocRegion(),
|
|
privatizer.getDeallocRegion());
|
|
|
|
moduleSymbolTable.insert(privatizer);
|
|
|
|
privateClauseOps.privateVars.push_back(mapToDevice ? mapper.lookup(var)
|
|
: var);
|
|
privateClauseOps.privateSyms.push_back(
|
|
mlir::SymbolRefAttr::get(privatizer));
|
|
}
|
|
}
|
|
|
|
/// Generate OpenMP reduction declarations for the reductions attached to a FIR
/// loop and record them as reduction clause operands.
///
/// For every reduce operand, an `mlir::omp::DeclareReductionOp` named
/// `<fir-reducer-symbol>.omp` is looked up in the module symbol table. If none
/// exists yet, one is created right after the FIR reducer, its regions are
/// cloned from the FIR reducer, and it is inserted into the symbol table.
///
/// \param [in] rewriter - used to drive IR generation for the reducers.
/// \param [in] mapper - value mapping consulted for the reduction variables
///                      when mapping to the device.
/// \param [in] loop - FIR loop to convert its reductions.
///
/// \param [out] reductionClauseOps - receives the reduction variables, their
///                                   by-ref flags and the reducer symbols.
void genReductions(mlir::ConversionPatternRewriter &rewriter,
                   mlir::IRMapping &mapper, fir::DoConcurrentLoopOp loop,
                   mlir::omp::ReductionClauseOps &reductionClauseOps) const {
  if (loop.getReduceVars().empty())
    return;

  for (auto [reduceVar, isByRef, reduceSym, regionArg] : llvm::zip_equal(
           loop.getReduceVars(), loop.getReduceByrefAttr().asArrayRef(),
           loop.getReduceSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
           loop.getRegionReduceArgs())) {
    auto firReducerName = reduceSym.getLeafReference();
    auto firReducer =
        moduleSymbolTable.lookup<fir::DeclareReductionOp>(firReducerName);

    // The guard restores the insertion point at the end of every iteration.
    mlir::OpBuilder::InsertionGuard guard(rewriter);
    rewriter.setInsertionPointAfter(firReducer);

    std::string ompReducerName = firReducerName.str() + ".omp";
    auto ompReducer = moduleSymbolTable.lookup<mlir::omp::DeclareReductionOp>(
        rewriter.getStringAttr(ompReducerName));

    // Only materialize the OpenMP reducer the first time this name is seen;
    // later uses pick up the op found by the lookup above.
    if (!ompReducer) {
      ompReducer = mlir::omp::DeclareReductionOp::create(
          rewriter, firReducer.getLoc(), ompReducerName,
          firReducer.getTypeAttr().getValue(),
          firReducer.getByrefElementTypeAttr());

      cloneFIRRegionToOMP(rewriter, firReducer.getAllocRegion(),
                          ompReducer.getAllocRegion());
      cloneFIRRegionToOMP(rewriter, firReducer.getInitializerRegion(),
                          ompReducer.getInitializerRegion());
      cloneFIRRegionToOMP(rewriter, firReducer.getReductionRegion(),
                          ompReducer.getReductionRegion());
      cloneFIRRegionToOMP(rewriter, firReducer.getAtomicReductionRegion(),
                          ompReducer.getAtomicReductionRegion());
      cloneFIRRegionToOMP(rewriter, firReducer.getCleanupRegion(),
                          ompReducer.getCleanupRegion());

      moduleSymbolTable.insert(ompReducer);
    }

    // When mapping to the device, the reduction variable is the value
    // `mapper` associates with the original one.
    if (mapToDevice)
      reductionClauseOps.reductionVars.push_back(mapper.lookup(reduceVar));
    else
      reductionClauseOps.reductionVars.push_back(reduceVar);

    reductionClauseOps.reductionByref.push_back(isByRef);
    reductionClauseOps.reductionSyms.push_back(
        mlir::SymbolRefAttr::get(ompReducer));
  }
}
|
|
|
|
bool mapToDevice;
|
|
llvm::DenseSet<fir::DoConcurrentOp> &concurrentLoopsToSkip;
|
|
mlir::SymbolTable &moduleSymbolTable;
|
|
};
|
|
|
|
/// A listener that forwards notifyOperationErased to the given callback.
|
|
struct CallbackListener : public mlir::RewriterBase::Listener {
|
|
CallbackListener(std::function<void(mlir::Operation *op)> onOperationErased)
|
|
: onOperationErased(onOperationErased) {}
|
|
|
|
void notifyOperationErased(mlir::Operation *op) override {
|
|
onOperationErased(op);
|
|
}
|
|
|
|
std::function<void(mlir::Operation *op)> onOperationErased;
|
|
};
|
|
|
|
class DoConcurrentConversionPass
|
|
: public flangomp::impl::DoConcurrentConversionPassBase<
|
|
DoConcurrentConversionPass> {
|
|
public:
|
|
DoConcurrentConversionPass() = default;
|
|
|
|
DoConcurrentConversionPass(
|
|
const flangomp::DoConcurrentConversionPassOptions &options)
|
|
: DoConcurrentConversionPassBase(options) {}
|
|
|
|
void runOnOperation() override {
|
|
mlir::ModuleOp module = getOperation();
|
|
mlir::MLIRContext *context = &getContext();
|
|
mlir::SymbolTable moduleSymbolTable(module);
|
|
|
|
if (mapTo != flangomp::DoConcurrentMappingKind::DCMK_Host &&
|
|
mapTo != flangomp::DoConcurrentMappingKind::DCMK_Device) {
|
|
mlir::emitWarning(mlir::UnknownLoc::get(context),
|
|
"DoConcurrentConversionPass: invalid `map-to` value. "
|
|
"Valid values are: `host` or `device`");
|
|
return;
|
|
}
|
|
|
|
llvm::DenseSet<fir::DoConcurrentOp> concurrentLoopsToSkip;
|
|
CallbackListener callbackListener([&](mlir::Operation *op) {
|
|
if (auto loop = mlir::dyn_cast<fir::DoConcurrentOp>(op))
|
|
concurrentLoopsToSkip.erase(loop);
|
|
});
|
|
mlir::RewritePatternSet patterns(context);
|
|
patterns.insert<DoConcurrentConversion>(
|
|
context, mapTo == flangomp::DoConcurrentMappingKind::DCMK_Device,
|
|
concurrentLoopsToSkip, moduleSymbolTable);
|
|
mlir::ConversionTarget target(*context);
|
|
target.addDynamicallyLegalOp<fir::DoConcurrentOp>(
|
|
[&](fir::DoConcurrentOp op) {
|
|
return concurrentLoopsToSkip.contains(op);
|
|
});
|
|
target.markUnknownOpDynamicallyLegal(
|
|
[](mlir::Operation *) { return true; });
|
|
|
|
mlir::ConversionConfig config;
|
|
config.allowPatternRollback = false;
|
|
config.listener = &callbackListener;
|
|
if (mlir::failed(mlir::applyFullConversion(module, target,
|
|
std::move(patterns), config))) {
|
|
signalPassFailure();
|
|
}
|
|
}
|
|
};
|
|
} // namespace
|
|
|
|
std::unique_ptr<mlir::Pass>
|
|
flangomp::createDoConcurrentConversionPass(bool mapToDevice) {
|
|
DoConcurrentConversionPassOptions options;
|
|
options.mapTo = mapToDevice ? flangomp::DoConcurrentMappingKind::DCMK_Device
|
|
: flangomp::DoConcurrentMappingKind::DCMK_Host;
|
|
|
|
return std::make_unique<DoConcurrentConversionPass>(options);
|
|
}
|