llvm-project/flang/lib/Optimizer/OpenMP/LowerWorkshare.cpp
Carlos Seo db5cd626b9
[flang][OpenMP] Restrict isSafeToParallelize to write-only thread-local effects (#188595)
This is a follow-up fix for commit 0f5e9bee.

Only write effects to thread-local memory should be considered safe to
parallelize in workshare lowering, not reads. When both reads and writes
were safe, the cascading effect in moveToSingle could cause entire
SingleRegions to become fully parallelized, eliminating the omp.single
and its implicit barrier. This removed synchronization points needed to
keep threads coordinated inside sequential loops containing workshared
operations, causing race conditions in forall-workshare patterns.

This was exposed by the Fujitsu Test Suite and made the following tests
regress:

FAIL: test-suite :: Fujitsu/Fortran/0398/Fujitsu-Fortran-0398_0031.test
FAIL: test-suite :: Fujitsu/Fortran/0398/Fujitsu-Fortran-0398_0013.test
FAIL: test-suite :: Fujitsu/Fortran/0398/Fujitsu-Fortran-0398_0030.test
FAIL: test-suite :: Fujitsu/Fortran/0398/Fujitsu-Fortran-0398_0014.test

Updates #143330
2026-03-27 12:11:27 -03:00

658 lines
25 KiB
C++

//===- LowerWorkshare.cpp - special cases for bufferization -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the lowering of omp.workshare to other omp constructs.
//
// This pass is tasked with parallelizing the loops nested in
// workshare.loop_wrapper while both the Fortran to mlir lowering and the hlfir
// to fir lowering pipelines are responsible for emitting the
// workshare.loop_wrapper ops where appropriate according to the
// `shouldUseWorkshareLowering` function.
//
//===----------------------------------------------------------------------===//
#include <flang/Optimizer/Analysis/AliasAnalysis.h>
#include <flang/Optimizer/Builder/FIRBuilder.h>
#include <flang/Optimizer/Dialect/FIROps.h>
#include <flang/Optimizer/Dialect/FIRType.h>
#include <flang/Optimizer/HLFIR/HLFIROps.h>
#include <flang/Optimizer/OpenMP/Passes.h>
#include <llvm/ADT/BreadthFirstIterator.h>
#include <llvm/ADT/STLExtras.h>
#include <llvm/ADT/SmallVectorExtras.h>
#include <llvm/ADT/iterator_range.h>
#include <llvm/Support/ErrorHandling.h>
#include <mlir/Dialect/Arith/IR/Arith.h>
#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
#include <mlir/Dialect/OpenMP/OpenMPClauseOperands.h>
#include <mlir/Dialect/OpenMP/OpenMPDialect.h>
#include <mlir/Dialect/SCF/IR/SCF.h>
#include <mlir/IR/BuiltinOps.h>
#include <mlir/IR/IRMapping.h>
#include <mlir/IR/OpDefinition.h>
#include <mlir/IR/PatternMatch.h>
#include <mlir/IR/Value.h>
#include <mlir/IR/Visitors.h>
#include <mlir/Interfaces/LoopLikeInterface.h>
#include <mlir/Interfaces/SideEffectInterfaces.h>
#include <mlir/Support/LLVM.h>
#include <variant>
namespace flangomp {
#define GEN_PASS_DEF_LOWERWORKSHARE
#include "flang/Optimizer/OpenMP/Passes.h.inc"
} // namespace flangomp
#define DEBUG_TYPE "lower-workshare"
using namespace mlir;
namespace flangomp {
// Checks for nesting pattern below as we need to avoid sharing the work of
// statements which are nested in some constructs such as omp.critical or
// another omp.parallel.
//
// omp.workshare { // `wsOp`
// ...
// omp.T { // `parent`
// ...
// `op`
//
template <typename T>
static bool isNestedIn(omp::WorkshareOp wsOp, Operation *op) {
T parent = op->getParentOfType<T>();
if (!parent)
return false;
return wsOp->isProperAncestor(parent);
}
bool shouldUseWorkshareLowering(Operation *op) {
auto parentWorkshare = op->getParentOfType<omp::WorkshareOp>();
if (!parentWorkshare)
return false;
if (isNestedIn<omp::CriticalOp>(parentWorkshare, op))
return false;
// 2.8.3 workshare Construct
// For a parallel construct, the construct is a unit of work with respect to
// the workshare construct. The statements contained in the parallel construct
// are executed by a new thread team.
if (isNestedIn<omp::ParallelOp>(parentWorkshare, op))
return false;
// 2.8.2 single Construct
// Binding The binding thread set for a single region is the current team. A
// single region binds to the innermost enclosing parallel region.
// Description Only one of the encountering threads will execute the
// structured block associated with the single construct.
if (isNestedIn<omp::SingleOp>(parentWorkshare, op))
return false;
// Do not use workshare lowering until we support CFG in omp.workshare
if (parentWorkshare.getRegion().getBlocks().size() != 1)
return false;
return true;
}
} // namespace flangomp
namespace {
struct SingleRegion {
Block::iterator begin, end;
};
static bool mustParallelizeOp(Operation *op) {
return op
->walk([&](Operation *nested) {
// We need to be careful not to pick up workshare.loop_wrapper in nested
// omp.parallel{omp.workshare} regions, i.e. make sure that `nested`
// binds to the workshare region we are currently handling.
//
// For example:
//
// omp.parallel {
// omp.workshare { // currently handling this
// omp.parallel {
// omp.workshare { // nested workshare
// omp.workshare.loop_wrapper {}
//
// Therefore, we skip if we encounter a nested omp.workshare.
if (isa<omp::WorkshareOp>(nested))
return WalkResult::skip();
if (isa<omp::WorkshareLoopWrapperOp>(nested))
return WalkResult::interrupt();
return WalkResult::advance();
})
.wasInterrupted();
}
// Determines if a memory reference is thread-local in an OpenMP context.
//
// This is a best-effort analysis. We cannot definitively determine if code
// is inside a parallel region when it's in a function called from that
// region. However, we can identify common patterns of thread-local memory:
//
// 1. Memory allocated via fir.alloca inside the enclosing omp.parallel region
// 2. Memory that comes from OpenMP clause block arguments that create
// thread-local storage (private, firstprivate, lastprivate, reduction,
// linear clauses)
//
// Returns true if the memory reference appears to be thread-local and thus
// safe to parallelize (each thread should access its own copy).
static bool isOpenMPThreadLocalMemory(Operation *op, Value mem) {
// Use AliasAnalysis to trace through declares, converts, reboxes, etc.
// to find the underlying source of the memory reference.
fir::AliasAnalysis aliasAnalysis;
fir::AliasAnalysis::Source source = aliasAnalysis.getSource(mem);
// Check if the source is a Value (not a global symbol).
mlir::Value sourceValue =
llvm::dyn_cast_if_present<mlir::Value>(source.origin.u);
if (!sourceValue)
return false;
// Case 1: Memory allocated by fir.alloca inside the enclosing parallel
// region is thread-private (each thread gets its own stack allocation).
// Note: fir.allocmem is NOT thread-local even inside omp.parallel.
if (source.kind == fir::AliasAnalysis::SourceKind::Allocate) {
if (auto alloca = sourceValue.getDefiningOp<fir::AllocaOp>()) {
if (auto parallelOp = alloca->getParentOfType<omp::ParallelOp>()) {
if (op->getParentOfType<omp::ParallelOp>() == parallelOp)
return true;
}
}
// When the alias analysis encounters an hlfir.declare/fir.declare on a
// private clause block argument, it marks the SourceKind as Allocate and
// sets the source value to the declare op result (not the block arg).
// Trace through the declare to check if the underlying Memref is a
// private block argument.
Value declMemref;
if (auto hlfirDecl = sourceValue.getDefiningOp<hlfir::DeclareOp>())
declMemref = hlfirDecl.getMemref();
else if (auto firDecl = sourceValue.getDefiningOp<fir::DeclareOp>())
declMemref = firDecl.getMemref();
if (declMemref) {
if (auto blockArg = llvm::dyn_cast<BlockArgument>(declMemref)) {
Operation *parentOp = blockArg.getOwner()->getParentOp();
if (auto argIface =
llvm::dyn_cast<omp::BlockArgOpenMPOpInterface>(parentOp)) {
if (llvm::is_contained(argIface.getPrivateBlockArgs(), blockArg))
return true;
}
}
}
}
// Case 2: Memory from OpenMP clause block arguments that create thread-local
// storage. These clauses create private copies for each thread:
// - private: uninitialized thread-local copy
// - firstprivate: thread-local copy initialized from original
// - lastprivate: thread-local copy, value copied back after construct
// - reduction: thread-local copy for reduction operations
// - linear: thread-local copy with linear modification
//
// Check if the source value is a block argument of an OpenMP operation
// that implements BlockArgOpenMPOpInterface.
if (auto blockArg = llvm::dyn_cast<BlockArgument>(sourceValue)) {
Operation *parentOp = blockArg.getOwner()->getParentOp();
if (auto argIface =
llvm::dyn_cast<omp::BlockArgOpenMPOpInterface>(parentOp)) {
// Check if this block argument corresponds to a privatizing clause.
// Private, reduction, and in_reduction clauses create thread-local
// memory.
auto isInBlockArgs = [&](auto blockArgs) {
return llvm::is_contained(blockArgs, blockArg);
};
if (isInBlockArgs(argIface.getPrivateBlockArgs()))
return true;
if (isInBlockArgs(argIface.getReductionBlockArgs()))
return true;
if (isInBlockArgs(argIface.getInReductionBlockArgs()))
return true;
if (isInBlockArgs(argIface.getTaskReductionBlockArgs()))
return true;
}
}
return false;
}
static bool isSafeToParallelize(Operation *op) {
if (isa<hlfir::DeclareOp>(op) || isa<fir::DeclareOp>(op) ||
isMemoryEffectFree(op))
return true;
// Thread-local variables allocated in the OpenMP parallel region or coming
// from privatizing clauses are private to each thread and thus safe (and
// sometimes required) to parallelize. If the compiler wraps stores to
// thread-local variables in an omp.single block, only one thread updates
// its local copy, while all other threads read uninitialized data (see
// issue #143330).
//
// Only WRITE effects to thread-local memory are considered safe here, not
// reads. If reads were also safe, the cascading effect in moveToSingle
// could cause entire SingleRegions to become fully parallelized (all ops
// safe), eliminating the omp.single and its implicit barrier. This removes
// synchronization points needed to keep threads coordinated inside
// sequential loops that contain workshared operations.
if (auto memEffects = dyn_cast<MemoryEffectOpInterface>(op)) {
SmallVector<MemoryEffects::EffectInstance> effects;
memEffects.getEffects(effects);
if (!effects.empty() &&
llvm::all_of(effects, [&](const MemoryEffects::EffectInstance &effect) {
Value val = effect.getValue();
return val && isa<MemoryEffects::Write>(effect.getEffect()) &&
isOpenMPThreadLocalMemory(op, val);
}))
return true;
}
return false;
}
/// Simple shallow copies suffice for our purposes in this pass, so we implement
/// this simpler alternative to the full fledged `createCopyFunc` in the
/// frontend
static mlir::func::FuncOp createCopyFunc(mlir::Location loc, mlir::Type varType,
fir::FirOpBuilder builder) {
mlir::ModuleOp module = builder.getModule();
auto rt = cast<fir::ReferenceType>(varType);
mlir::Type eleTy = rt.getEleTy();
std::string copyFuncName =
fir::getTypeAsString(eleTy, builder.getKindMap(), "_workshare_copy");
if (auto decl = module.lookupSymbol<mlir::func::FuncOp>(copyFuncName))
return decl;
// create function
mlir::OpBuilder::InsertionGuard guard(builder);
mlir::OpBuilder modBuilder(module.getBodyRegion());
llvm::SmallVector<mlir::Type> argsTy = {varType, varType};
auto funcType = mlir::FunctionType::get(builder.getContext(), argsTy, {});
mlir::func::FuncOp funcOp =
mlir::func::FuncOp::create(modBuilder, loc, copyFuncName, funcType);
funcOp.setVisibility(mlir::SymbolTable::Visibility::Private);
fir::factory::setInternalLinkage(funcOp);
builder.createBlock(&funcOp.getRegion(), funcOp.getRegion().end(), argsTy,
{loc, loc});
builder.setInsertionPointToStart(&funcOp.getRegion().back());
Value loaded = fir::LoadOp::create(builder, loc, funcOp.getArgument(1));
fir::StoreOp::create(builder, loc, loaded, funcOp.getArgument(0));
mlir::func::ReturnOp::create(builder, loc);
return funcOp;
}
static bool isUserOutsideSR(Operation *user, Operation *parentOp,
SingleRegion sr) {
while (user->getParentOp() != parentOp)
user = user->getParentOp();
return sr.begin->getBlock() != user->getBlock() ||
!(user->isBeforeInBlock(&*sr.end) && sr.begin->isBeforeInBlock(user));
}
static bool isTransitivelyUsedOutside(Value v, SingleRegion sr) {
Block *srBlock = sr.begin->getBlock();
Operation *parentOp = srBlock->getParentOp();
for (auto &use : v.getUses()) {
Operation *user = use.getOwner();
if (isUserOutsideSR(user, parentOp, sr))
return true;
// Now we know user is inside `sr`.
// Results of nested users cannot be used outside of `sr`.
if (user->getBlock() != srBlock)
continue;
// A non-safe to parallelize operation will be checked for uses outside
// separately.
if (!isSafeToParallelize(user))
continue;
// For safe to parallelize operations, we need to check if there is a
// transitive use of `v` through them.
for (auto res : user->getResults())
if (isTransitivelyUsedOutside(res, sr))
return true;
}
return false;
}
/// We clone pure operations in both the parallel and single blocks. this
/// functions cleans them up if they end up with no uses
static void cleanupBlock(Block *block) {
for (Operation &op : llvm::make_early_inc_range(
llvm::make_range(block->rbegin(), block->rend())))
if (isOpTriviallyDead(&op))
op.erase();
}
static void parallelizeRegion(Region &sourceRegion, Region &targetRegion,
IRMapping &rootMapping, Location loc,
mlir::DominanceInfo &di) {
OpBuilder rootBuilder(sourceRegion.getContext());
ModuleOp m = sourceRegion.getParentOfType<ModuleOp>();
OpBuilder copyFuncBuilder(m.getBodyRegion());
fir::FirOpBuilder firCopyFuncBuilder(copyFuncBuilder, m);
auto mapReloadedValue =
[&](Value v, OpBuilder allocaBuilder, OpBuilder singleBuilder,
OpBuilder parallelBuilder, IRMapping singleMapping) -> Value {
if (auto reloaded = rootMapping.lookupOrNull(v))
return nullptr;
Type ty = v.getType();
Value alloc = fir::AllocaOp::create(allocaBuilder, loc, ty);
fir::StoreOp::create(singleBuilder, loc, singleMapping.lookup(v), alloc);
Value reloaded = fir::LoadOp::create(parallelBuilder, loc, ty, alloc);
rootMapping.map(v, reloaded);
return alloc;
};
auto moveToSingle =
[&](SingleRegion sr, OpBuilder allocaBuilder, OpBuilder singleBuilder,
OpBuilder parallelBuilder) -> std::pair<bool, SmallVector<Value>> {
IRMapping singleMapping = rootMapping;
SmallVector<Value> copyPrivate;
bool allParallelized = true;
for (Operation &op : llvm::make_range(sr.begin, sr.end)) {
if (isSafeToParallelize(&op)) {
singleBuilder.clone(op, singleMapping);
if (llvm::all_of(op.getOperands(), [&](Value opr) {
// Either we have already remapped it
bool remapped = rootMapping.contains(opr);
// Or it is available because it dominates `sr`
bool dominates = di.properlyDominates(opr, &*sr.begin);
return remapped || dominates;
})) {
// Safe to parallelize operations which have all operands available in
// the root parallel block can be executed there.
parallelBuilder.clone(op, rootMapping);
} else {
// If any operand was not available, it means that there was no
// transitive use of a non-safe-to-parallelize operation outside `sr`.
// This means that there should be no transitive uses outside `sr` of
// `op`.
assert(llvm::all_of(op.getResults(), [&](Value v) {
return !isTransitivelyUsedOutside(v, sr);
}));
allParallelized = false;
}
} else if (auto alloca = dyn_cast<fir::AllocaOp>(&op)) {
auto hoisted =
cast<fir::AllocaOp>(allocaBuilder.clone(*alloca, singleMapping));
rootMapping.map(&*alloca, &*hoisted);
rootMapping.map(alloca.getResult(), hoisted.getResult());
copyPrivate.push_back(hoisted);
allParallelized = false;
} else {
singleBuilder.clone(op, singleMapping);
// Prepare reloaded values for results of operations that cannot be
// safely parallelized and which are used after the region `sr`.
for (auto res : op.getResults()) {
if (isTransitivelyUsedOutside(res, sr)) {
auto alloc = mapReloadedValue(res, allocaBuilder, singleBuilder,
parallelBuilder, singleMapping);
if (alloc)
copyPrivate.push_back(alloc);
}
}
allParallelized = false;
}
}
omp::TerminatorOp::create(singleBuilder, loc);
return {allParallelized, copyPrivate};
};
for (Block &block : sourceRegion) {
Block *targetBlock = rootBuilder.createBlock(
&targetRegion, {}, block.getArgumentTypes(),
llvm::map_to_vector(block.getArguments(),
[](BlockArgument arg) { return arg.getLoc(); }));
rootMapping.map(&block, targetBlock);
rootMapping.map(block.getArguments(), targetBlock->getArguments());
}
auto handleOneBlock = [&](Block &block) {
Block &targetBlock = *rootMapping.lookup(&block);
rootBuilder.setInsertionPointToStart(&targetBlock);
Operation *terminator = block.getTerminator();
SmallVector<std::variant<SingleRegion, Operation *>> regions;
auto it = block.begin();
auto getOneRegion = [&]() {
if (&*it == terminator)
return false;
if (mustParallelizeOp(&*it)) {
regions.push_back(&*it);
it++;
return true;
}
SingleRegion sr;
sr.begin = it;
while (&*it != terminator && !mustParallelizeOp(&*it))
it++;
sr.end = it;
assert(sr.begin != sr.end);
regions.push_back(sr);
return true;
};
while (getOneRegion())
;
for (auto [i, opOrSingle] : llvm::enumerate(regions)) {
bool isLast = i + 1 == regions.size();
// Make sure shared runtime calls are synchronized: disable `nowait`
// insertion, and rely on the implicit barrier at the end of the
// omp.workshare block. This applies to any loop-like operation
// (fir.do_loop, fir.iterate_while, fir.do_concurrent.loop, etc.)
// because iterations could overlap if nowait is used.
if (isa<LoopLikeOpInterface>(block.getParentOp()))
isLast = false;
if (std::holds_alternative<SingleRegion>(opOrSingle)) {
OpBuilder singleBuilder(sourceRegion.getContext());
Block *singleBlock = new Block();
singleBuilder.setInsertionPointToStart(singleBlock);
OpBuilder allocaBuilder(sourceRegion.getContext());
Block *allocaBlock = new Block();
allocaBuilder.setInsertionPointToStart(allocaBlock);
OpBuilder parallelBuilder(sourceRegion.getContext());
Block *parallelBlock = new Block();
parallelBuilder.setInsertionPointToStart(parallelBlock);
auto [allParallelized, copyprivateVars] =
moveToSingle(std::get<SingleRegion>(opOrSingle), allocaBuilder,
singleBuilder, parallelBuilder);
if (allParallelized) {
// The single region was not required as all operations were safe to
// parallelize
assert(copyprivateVars.empty());
assert(allocaBlock->empty());
delete singleBlock;
} else {
omp::SingleOperands singleOperands;
if (isLast)
singleOperands.nowait = rootBuilder.getUnitAttr();
singleOperands.copyprivateVars = copyprivateVars;
cleanupBlock(singleBlock);
for (auto var : singleOperands.copyprivateVars) {
mlir::func::FuncOp funcOp =
createCopyFunc(loc, var.getType(), firCopyFuncBuilder);
singleOperands.copyprivateSyms.push_back(
SymbolRefAttr::get(funcOp));
}
omp::SingleOp singleOp =
omp::SingleOp::create(rootBuilder, loc, singleOperands);
singleOp.getRegion().push_back(singleBlock);
targetRegion.front().getOperations().splice(
singleOp->getIterator(), allocaBlock->getOperations());
}
rootBuilder.getInsertionBlock()->getOperations().splice(
rootBuilder.getInsertionPoint(), parallelBlock->getOperations());
delete allocaBlock;
delete parallelBlock;
} else {
auto op = std::get<Operation *>(opOrSingle);
if (auto wslw = dyn_cast<omp::WorkshareLoopWrapperOp>(op)) {
omp::WsloopOperands wsloopOperands;
if (isLast)
wsloopOperands.nowait = rootBuilder.getUnitAttr();
auto wsloop =
mlir::omp::WsloopOp::create(rootBuilder, loc, wsloopOperands);
auto clonedWslw = cast<omp::WorkshareLoopWrapperOp>(
rootBuilder.clone(*wslw, rootMapping));
wsloop.getRegion().takeBody(clonedWslw.getRegion());
clonedWslw->erase();
} else {
assert(mustParallelizeOp(op));
Operation *cloned = rootBuilder.cloneWithoutRegions(*op, rootMapping);
for (auto [region, clonedRegion] :
llvm::zip(op->getRegions(), cloned->getRegions()))
parallelizeRegion(region, clonedRegion, rootMapping, loc, di);
}
}
}
rootBuilder.clone(*block.getTerminator(), rootMapping);
};
if (sourceRegion.hasOneBlock()) {
handleOneBlock(sourceRegion.front());
} else if (!sourceRegion.empty()) {
auto &domTree = di.getDomTree(&sourceRegion);
for (auto node : llvm::breadth_first(domTree.getRootNode())) {
handleOneBlock(*node->getBlock());
}
}
for (Block &targetBlock : targetRegion)
cleanupBlock(&targetBlock);
}
/// Lowers workshare to a sequence of single-thread regions and parallel loops
///
/// For example:
///
/// omp.workshare {
/// %a = fir.allocmem
/// omp.workshare.loop_wrapper {}
/// fir.call Assign %b %a
/// fir.freemem %a
/// }
///
/// becomes
///
/// %tmp = fir.alloca
/// omp.single copyprivate(%tmp) {
/// %a = fir.allocmem
/// fir.store %a %tmp
/// }
/// %a_reloaded = fir.load %tmp
/// omp.workshare.loop_wrapper {}
/// omp.single {
/// fir.call Assign %b %a_reloaded
/// fir.freemem %a_reloaded
/// }
///
/// Note that we allocate temporary memory for values in omp.single's which need
/// to be accessed by all threads and broadcast them using single's copyprivate
LogicalResult lowerWorkshare(mlir::omp::WorkshareOp wsOp, DominanceInfo &di) {
Location loc = wsOp->getLoc();
IRMapping rootMapping;
OpBuilder rootBuilder(wsOp);
// FIXME Currently, we only support workshare constructs with structured
// control flow. The transformation itself supports CFG, however, once we
// transform the MLIR region in the omp.workshare, we need to inline that
// region in the parent block. We have no guarantees at this point of the
// pipeline that the parent op supports CFG (e.g. fir.if), thus this is not
// generally possible. The alternative is to put the lowered region in an
// operation akin to scf.execute_region, which will get lowered at the same
// time when fir ops get lowered to CFG. However, SCF is not registered in
// flang so we cannot use it. Remove this requirement once we have
// scf.execute_region or an alternative operation available.
if (wsOp.getRegion().getBlocks().size() == 1) {
// This operation is just a placeholder which will be erased later. We need
// it because our `parallelizeRegion` function works on regions and not
// blocks.
omp::WorkshareOp newOp =
omp::WorkshareOp::create(rootBuilder, loc, omp::WorkshareOperands());
if (!wsOp.getNowait())
omp::BarrierOp::create(rootBuilder, loc);
parallelizeRegion(wsOp.getRegion(), newOp.getRegion(), rootMapping, loc,
di);
// Inline the contents of the placeholder workshare op into its parent
// block.
Block *theBlock = &newOp.getRegion().front();
Operation *term = theBlock->getTerminator();
Block *parentBlock = wsOp->getBlock();
parentBlock->getOperations().splice(newOp->getIterator(),
theBlock->getOperations());
assert(term->getNumOperands() == 0);
term->erase();
newOp->erase();
wsOp->erase();
} else {
// Otherwise just change the operation to an omp.single.
wsOp->emitWarning(
"omp workshare with unstructured control flow is currently "
"unsupported and will be serialized.");
// `shouldUseWorkshareLowering` should have guaranteed that there are no
// omp.workshare_loop_wrapper's that bind to this omp.workshare.
assert(!wsOp->walk([&](Operation *op) {
// Nested omp.workshare can have their own
// omp.workshare_loop_wrapper's.
if (isa<omp::WorkshareOp>(op))
return WalkResult::skip();
if (isa<omp::WorkshareLoopWrapperOp>(op))
return WalkResult::interrupt();
return WalkResult::advance();
})
.wasInterrupted());
omp::SingleOperands operands;
operands.nowait = wsOp.getNowaitAttr();
omp::SingleOp newOp = omp::SingleOp::create(rootBuilder, loc, operands);
newOp.getRegion().getBlocks().splice(newOp.getRegion().getBlocks().begin(),
wsOp.getRegion().getBlocks());
wsOp->erase();
}
return success();
}
class LowerWorksharePass
: public flangomp::impl::LowerWorkshareBase<LowerWorksharePass> {
public:
void runOnOperation() override {
mlir::DominanceInfo &di = getAnalysis<mlir::DominanceInfo>();
getOperation()->walk([&](mlir::omp::WorkshareOp wsOp) {
if (failed(lowerWorkshare(wsOp, di)))
signalPassFailure();
});
}
};
} // namespace